-rw-r--r-- .gitignore | 2
-rw-r--r-- Android.mk | 2
-rw-r--r-- CMakeLists.txt | 16
-rw-r--r-- CREDITS.TXT | 16
-rw-r--r-- Makefile.rules | 3
-rw-r--r-- autoconf/configure.ac | 80
-rw-r--r-- autoconf/m4/link_options.m4 | 2
-rw-r--r-- autoconf/m4/path_tclsh.m4 | 39
-rw-r--r-- bindings/ocaml/executionengine/llvm_executionengine.mli | 2
-rw-r--r-- bindings/ocaml/llvm/llvm.ml | 9
-rw-r--r-- bindings/ocaml/llvm/llvm.mli | 71
-rw-r--r-- bindings/ocaml/target/target_ocaml.c | 4
-rwxr-xr-x cmake/config-ix.cmake | 31
-rw-r--r-- cmake/modules/AddLLVM.cmake | 23
-rw-r--r-- cmake/modules/HandleLLVMOptions.cmake | 81
-rwxr-xr-x configure | 4
-rw-r--r-- docs/Atomics.rst | 10
-rw-r--r-- docs/CMake.rst | 8
-rw-r--r-- docs/CodeGenerator.rst | 6
-rw-r--r-- docs/CodingStandards.rst | 5
-rw-r--r-- docs/CommandGuide/bugpoint.rst | 4
-rw-r--r-- docs/CommandGuide/opt.rst | 4
-rw-r--r-- docs/Extensions.rst | 49
-rw-r--r-- docs/GarbageCollection.rst | 2
-rw-r--r-- docs/GettingStarted.rst | 23
-rw-r--r-- docs/GettingStartedVS.rst | 3
-rw-r--r-- docs/LangRef.rst | 535
-rw-r--r-- docs/Lexicon.rst | 2
-rw-r--r-- docs/Passes.rst | 41
-rw-r--r-- docs/Phabricator.rst | 29
-rw-r--r-- docs/ProgrammersManual.rst | 97
-rw-r--r-- docs/ReleaseNotes.rst | 10
-rw-r--r-- docs/SourceLevelDebugging.rst | 35
-rw-r--r-- docs/TestingGuide.rst | 3
-rw-r--r-- docs/Vectorizers.rst | 83
-rw-r--r-- docs/WritingAnLLVMPass.rst | 1
-rw-r--r-- include/llvm-c/Core.h | 16
-rw-r--r-- include/llvm-c/module.modulemap | 1
-rw-r--r-- include/llvm/ADT/APSInt.h | 2
-rw-r--r-- include/llvm/ADT/ArrayRef.h | 6
-rw-r--r-- include/llvm/ADT/BitVector.h | 5
-rw-r--r-- include/llvm/ADT/DenseMap.h | 9
-rw-r--r-- include/llvm/ADT/DenseSet.h | 6
-rw-r--r-- include/llvm/ADT/FoldingSet.h | 8
-rw-r--r-- include/llvm/ADT/Hashing.h | 4
-rw-r--r-- include/llvm/ADT/IntrusiveRefCntPtr.h | 47
-rw-r--r-- include/llvm/ADT/MapVector.h | 15
-rw-r--r-- include/llvm/ADT/OwningPtr.h | 165
-rw-r--r-- include/llvm/ADT/ScopedHashTable.h | 4
-rw-r--r-- include/llvm/ADT/SmallBitVector.h | 3
-rw-r--r-- include/llvm/ADT/SmallPtrSet.h | 5
-rw-r--r-- include/llvm/ADT/SmallSet.h | 5
-rw-r--r-- include/llvm/ADT/SmallVector.h | 15
-rw-r--r-- include/llvm/ADT/SparseBitVector.h | 3
-rw-r--r-- include/llvm/ADT/SparseMultiSet.h | 5
-rw-r--r-- include/llvm/ADT/SparseSet.h | 5
-rw-r--r-- include/llvm/ADT/StringMap.h | 64
-rw-r--r-- include/llvm/ADT/Triple.h | 11
-rw-r--r-- include/llvm/ADT/Twine.h | 4
-rw-r--r-- include/llvm/ADT/UniqueVector.h | 19
-rw-r--r-- include/llvm/Analysis/AliasAnalysis.h | 8
-rw-r--r-- include/llvm/Analysis/BlockFrequencyInfoImpl.h | 691
-rw-r--r-- include/llvm/Analysis/JumpInstrTableInfo.h | 60
-rw-r--r-- include/llvm/Analysis/Passes.h | 4
-rw-r--r-- include/llvm/Analysis/RegionInfo.h | 10
-rw-r--r-- include/llvm/Analysis/TargetTransformInfo.h | 1
-rw-r--r-- include/llvm/Analysis/ValueTracking.h | 5
-rw-r--r-- include/llvm/Bitcode/LLVMBitCodes.h | 14
-rw-r--r-- include/llvm/Bitcode/ReaderWriter.h | 11
-rw-r--r-- include/llvm/CodeGen/Analysis.h | 4
-rw-r--r-- include/llvm/CodeGen/AsmPrinter.h | 1
-rw-r--r-- include/llvm/CodeGen/CommandFlags.h | 16
-rw-r--r-- include/llvm/CodeGen/FastISel.h | 13
-rw-r--r-- include/llvm/CodeGen/ISDOpcodes.h | 37
-rw-r--r-- include/llvm/CodeGen/JumpInstrTables.h | 104
-rw-r--r-- include/llvm/CodeGen/LexicalScopes.h | 6
-rw-r--r-- include/llvm/CodeGen/LiveIntervalAnalysis.h | 11
-rw-r--r-- include/llvm/CodeGen/MachineBasicBlock.h | 4
-rw-r--r-- include/llvm/CodeGen/MachineFrameInfo.h | 3
-rw-r--r-- include/llvm/CodeGen/MachineInstr.h | 5
-rw-r--r-- include/llvm/CodeGen/MachineScheduler.h | 217
-rw-r--r-- include/llvm/CodeGen/Passes.h | 19
-rw-r--r-- include/llvm/CodeGen/RegisterPressure.h | 2
-rw-r--r-- include/llvm/CodeGen/ScheduleDFS.h | 2
-rw-r--r-- include/llvm/CodeGen/SelectionDAG.h | 87
-rw-r--r-- include/llvm/CodeGen/SelectionDAGNodes.h | 86
-rw-r--r-- include/llvm/CodeGen/StackMapLivenessAnalysis.h | 15
-rw-r--r-- include/llvm/CodeGen/TargetLoweringObjectFileImpl.h | 12
-rw-r--r-- include/llvm/Config/config.h.cmake | 155
-rw-r--r-- include/llvm/Config/config.h.in | 156
-rw-r--r-- include/llvm/Config/llvm-config.h.cmake | 41
-rw-r--r-- include/llvm/Config/llvm-config.h.in | 41
-rw-r--r-- include/llvm/ExecutionEngine/ExecutionEngine.h | 29
-rw-r--r-- include/llvm/ExecutionEngine/ObjectBuffer.h | 3
-rw-r--r-- include/llvm/ExecutionEngine/RuntimeDyld.h | 2
-rw-r--r-- include/llvm/ExecutionEngine/RuntimeDyldChecker.h | 98
-rw-r--r-- include/llvm/ExecutionEngine/SectionMemoryManager.h | 5
-rw-r--r-- include/llvm/IR/Attributes.h | 1
-rw-r--r-- include/llvm/IR/AutoUpgrade.h | 5
-rw-r--r-- include/llvm/IR/Comdat.h | 66
-rw-r--r-- include/llvm/IR/Constant.h | 14
-rw-r--r-- include/llvm/IR/DIBuilder.h | 13
-rw-r--r-- include/llvm/IR/DataLayout.h | 4
-rw-r--r-- include/llvm/IR/DebugInfo.h | 16
-rw-r--r-- include/llvm/IR/DebugLoc.h | 2
-rw-r--r-- include/llvm/IR/DiagnosticInfo.h | 8
-rw-r--r-- include/llvm/IR/Dominators.h | 4
-rw-r--r-- include/llvm/IR/GVMaterializer.h | 9
-rw-r--r-- include/llvm/IR/GlobalAlias.h | 34
-rw-r--r-- include/llvm/IR/GlobalObject.h | 14
-rw-r--r-- include/llvm/IR/GlobalValue.h | 53
-rw-r--r-- include/llvm/IR/GlobalVariable.h | 21
-rw-r--r-- include/llvm/IR/IRBuilder.h | 29
-rw-r--r-- include/llvm/IR/Instructions.h | 36
-rw-r--r-- include/llvm/IR/Intrinsics.h | 3
-rw-r--r-- include/llvm/IR/Intrinsics.td | 4
-rw-r--r-- include/llvm/IR/IntrinsicsAArch64.td | 7
-rw-r--r-- include/llvm/IR/IntrinsicsARM.td | 17
-rw-r--r-- include/llvm/IR/IntrinsicsNVVM.td | 183
-rw-r--r-- include/llvm/IR/IntrinsicsR600.td | 36
-rw-r--r-- include/llvm/IR/IntrinsicsX86.td | 36
-rw-r--r-- include/llvm/IR/LegacyPassNameParser.h | 2
-rw-r--r-- include/llvm/IR/Module.h | 31
-rw-r--r-- include/llvm/IR/User.h | 10
-rw-r--r-- include/llvm/IR/Value.h | 26
-rw-r--r-- include/llvm/IR/ValueMap.h | 19
-rw-r--r-- include/llvm/IRReader/IRReader.h | 10
-rw-r--r-- include/llvm/InitializePasses.h | 3
-rw-r--r-- include/llvm/LTO/LTOModule.h | 95
-rw-r--r-- include/llvm/LinkAllPasses.h | 2
-rw-r--r-- include/llvm/Linker/Linker.h | 2
-rw-r--r-- include/llvm/MC/ConstantPools.h | 80
-rw-r--r-- include/llvm/MC/MCAnalysis/MCAtom.h (renamed from include/llvm/MC/MCAtom.h) | 6
-rw-r--r-- include/llvm/MC/MCAnalysis/MCFunction.h (renamed from include/llvm/MC/MCFunction.h) | 6
-rw-r--r-- include/llvm/MC/MCAnalysis/MCModule.h (renamed from include/llvm/MC/MCModule.h) | 6
-rw-r--r-- include/llvm/MC/MCAnalysis/MCModuleYAML.h (renamed from include/llvm/MC/MCModuleYAML.h) | 6
-rw-r--r-- include/llvm/MC/MCAsmInfo.h | 1006
-rw-r--r-- include/llvm/MC/MCAssembler.h | 2
-rw-r--r-- include/llvm/MC/MCContext.h | 37
-rw-r--r-- include/llvm/MC/MCDwarf.h | 7
-rw-r--r-- include/llvm/MC/MCELFStreamer.h | 1
-rw-r--r-- include/llvm/MC/MCELFSymbolFlags.h | 1
-rw-r--r-- include/llvm/MC/MCExpr.h | 3
-rw-r--r-- include/llvm/MC/MCLinkerOptimizationHint.h | 15
-rw-r--r-- include/llvm/MC/MCMachObjectWriter.h | 2
-rw-r--r-- include/llvm/MC/MCObjectFileInfo.h | 15
-rw-r--r-- include/llvm/MC/MCObjectStreamer.h | 9
-rw-r--r-- include/llvm/MC/MCParser/AsmLexer.h | 4
-rw-r--r-- include/llvm/MC/MCParser/MCAsmParser.h | 32
-rw-r--r-- include/llvm/MC/MCSectionCOFF.h | 18
-rw-r--r-- include/llvm/MC/MCStreamer.h | 114
-rw-r--r-- include/llvm/MC/MCTargetAsmParser.h | 20
-rw-r--r-- include/llvm/MC/MCTargetOptions.h | 4
-rw-r--r-- include/llvm/MC/MCTargetOptionsCommandFlags.h | 9
-rw-r--r-- include/llvm/MC/MCWinCOFFStreamer.h | 3
-rw-r--r-- include/llvm/MC/StringTableBuilder.h (renamed from include/llvm/Object/StringTableBuilder.h) | 4
-rw-r--r-- include/llvm/MC/YAML.h (renamed from include/llvm/Object/YAML.h) | 33
-rw-r--r-- include/llvm/Object/Archive.h | 18
-rw-r--r-- include/llvm/Object/Binary.h | 8
-rw-r--r-- include/llvm/Object/COFF.h | 134
-rw-r--r-- include/llvm/Object/COFFYAML.h | 4
-rw-r--r-- include/llvm/Object/ELF.h | 60
-rw-r--r-- include/llvm/Object/ELFObjectFile.h | 240
-rw-r--r-- include/llvm/Object/ELFYAML.h | 13
-rw-r--r-- include/llvm/Object/Error.h | 34
-rw-r--r-- include/llvm/Object/IRObjectFile.h | 21
-rw-r--r-- include/llvm/Object/MachO.h | 123
-rw-r--r-- include/llvm/Object/MachOUniversal.h | 19
-rw-r--r-- include/llvm/Object/ObjectFile.h | 231
-rw-r--r-- include/llvm/Object/RelocVisitor.h | 6
-rw-r--r-- include/llvm/Object/SymbolicFile.h | 36
-rw-r--r-- include/llvm/Option/ArgList.h | 8
-rw-r--r-- include/llvm/PassInfo.h | 147
-rw-r--r-- include/llvm/PassRegistry.h | 27
-rw-r--r-- include/llvm/PassSupport.h | 128
-rw-r--r-- include/llvm/ProfileData/InstrProf.h | 26
-rw-r--r-- include/llvm/ProfileData/InstrProfReader.h | 49
-rw-r--r-- include/llvm/ProfileData/InstrProfWriter.h | 5
-rw-r--r-- include/llvm/Support/ARMBuildAttributes.h | 15
-rw-r--r-- include/llvm/Support/ARMWinEH.h | 384
-rw-r--r-- include/llvm/Support/COFF.h | 10
-rw-r--r-- include/llvm/Support/Compiler.h | 6
-rw-r--r-- include/llvm/Support/ConvertUTF.h | 14
-rw-r--r-- include/llvm/Support/CrashRecoveryContext.h | 3
-rw-r--r-- include/llvm/Support/DataTypes.h.cmake | 121
-rw-r--r-- include/llvm/Support/DataTypes.h.in | 119
-rw-r--r-- include/llvm/Support/Dwarf.h | 1
-rw-r--r-- include/llvm/Support/ELF.h | 27
-rw-r--r-- include/llvm/Support/Endian.h | 2
-rw-r--r-- include/llvm/Support/Errc.h | 86
-rw-r--r-- include/llvm/Support/ErrorHandling.h | 5
-rw-r--r-- include/llvm/Support/ErrorOr.h | 42
-rw-r--r-- include/llvm/Support/FEnv.h | 56
-rw-r--r-- include/llvm/Support/FileOutputBuffer.h | 10
-rw-r--r-- include/llvm/Support/FileSystem.h | 237
-rw-r--r-- include/llvm/Support/Format.h | 102
-rw-r--r-- include/llvm/Support/GenericDomTree.h | 4
-rw-r--r-- include/llvm/Support/GraphWriter.h | 2
-rw-r--r-- include/llvm/Support/LockFileManager.h | 5
-rw-r--r-- include/llvm/Support/MachO.h | 23
-rw-r--r-- include/llvm/Support/ManagedStatic.h | 3
-rw-r--r-- include/llvm/Support/MathExtras.h | 9
-rw-r--r-- include/llvm/Support/Memory.h | 10
-rw-r--r-- include/llvm/Support/MemoryBuffer.h | 52
-rw-r--r-- include/llvm/Support/Process.h | 11
-rw-r--r-- include/llvm/Support/Program.h | 7
-rw-r--r-- include/llvm/Support/RandomNumberGenerator.h | 57
-rw-r--r-- include/llvm/Support/ScaledNumber.h | 897
-rw-r--r-- include/llvm/Support/SourceMgr.h | 122
-rw-r--r-- include/llvm/Support/SpecialCaseList.h | 96
-rw-r--r-- include/llvm/Support/StreamableMemoryObject.h | 7
-rw-r--r-- include/llvm/Support/StringPool.h | 7
-rw-r--r-- include/llvm/Support/SwapByteOrder.h | 31
-rw-r--r-- include/llvm/Support/TargetRegistry.h | 27
-rw-r--r-- include/llvm/Support/Threading.h | 29
-rw-r--r-- include/llvm/Support/WindowsError.h | 19
-rw-r--r-- include/llvm/Support/YAMLTraits.h | 8
-rw-r--r-- include/llvm/Support/system_error.h | 901
-rw-r--r-- include/llvm/TableGen/SetTheory.h (renamed from utils/TableGen/SetTheory.h) | 0
-rw-r--r-- include/llvm/Target/Target.td | 5
-rw-r--r-- include/llvm/Target/TargetFrameLowering.h | 13
-rw-r--r-- include/llvm/Target/TargetInstrInfo.h | 18
-rw-r--r-- include/llvm/Target/TargetLowering.h | 70
-rw-r--r-- include/llvm/Target/TargetLoweringObjectFile.h | 6
-rw-r--r-- include/llvm/Target/TargetOptions.h | 17
-rw-r--r-- include/llvm/Target/TargetRegisterInfo.h | 6
-rw-r--r-- include/llvm/Target/TargetSelectionDAGInfo.h | 2
-rw-r--r-- include/llvm/Target/TargetSubtargetInfo.h | 16
-rw-r--r-- include/llvm/Transforms/IPO/PassManagerBuilder.h | 1
-rw-r--r-- include/llvm/Transforms/Instrumentation.h | 12
-rw-r--r-- include/llvm/Transforms/Scalar.h | 7
-rw-r--r-- include/llvm/Transforms/Utils/Local.h | 2
-rw-r--r-- include/llvm/Transforms/Utils/LoopUtils.h | 4
-rw-r--r-- include/llvm/Transforms/Utils/SpecialCaseList.h | 114
-rw-r--r-- include/llvm/Transforms/Utils/VectorUtils.h | 15
-rw-r--r-- lib/Analysis/AliasAnalysis.cpp | 40
-rw-r--r-- lib/Analysis/Analysis.cpp | 1
-rw-r--r-- lib/Analysis/Android.mk | 1
-rw-r--r-- lib/Analysis/BasicAliasAnalysis.cpp | 220
-rw-r--r-- lib/Analysis/BlockFrequencyInfoImpl.cpp | 337
-rw-r--r-- lib/Analysis/CMakeLists.txt | 1
-rw-r--r-- lib/Analysis/ConstantFolding.cpp | 43
-rw-r--r-- lib/Analysis/CostModel.cpp | 37
-rw-r--r-- lib/Analysis/IPA/CallGraphSCCPass.cpp | 8
-rw-r--r-- lib/Analysis/IPA/InlineCost.cpp | 19
-rw-r--r-- lib/Analysis/IVUsers.cpp | 5
-rw-r--r-- lib/Analysis/InstructionSimplify.cpp | 170
-rw-r--r-- lib/Analysis/JumpInstrTableInfo.cpp | 40
-rw-r--r-- lib/Analysis/LoopPass.cpp | 5
-rw-r--r-- lib/Analysis/NoAliasAnalysis.cpp | 8
-rw-r--r-- lib/Analysis/RegionPass.cpp | 8
-rw-r--r-- lib/Analysis/ScalarEvolution.cpp | 13
-rw-r--r-- lib/Analysis/ScalarEvolutionExpander.cpp | 3
-rw-r--r-- lib/Analysis/ScalarEvolutionNormalization.cpp | 2
-rw-r--r-- lib/Analysis/ValueTracking.cpp | 23
-rw-r--r-- lib/AsmParser/LLLexer.cpp | 57
-rw-r--r-- lib/AsmParser/LLLexer.h | 1
-rw-r--r-- lib/AsmParser/LLParser.cpp | 271
-rw-r--r-- lib/AsmParser/LLParser.h | 19
-rw-r--r-- lib/AsmParser/LLToken.h | 14
-rw-r--r-- lib/AsmParser/Parser.cpp | 14
-rw-r--r-- lib/Bitcode/Reader/BitReader.cpp | 4
-rw-r--r-- lib/Bitcode/Reader/BitcodeReader.cpp | 301
-rw-r--r-- lib/Bitcode/Reader/BitcodeReader.h | 90
-rw-r--r-- lib/Bitcode/Reader/BitstreamReader.cpp | 4
-rw-r--r-- lib/Bitcode/Writer/BitcodeWriter.cpp | 43
-rw-r--r-- lib/Bitcode/Writer/ValueEnumerator.cpp | 43
-rw-r--r-- lib/Bitcode/Writer/ValueEnumerator.h | 9
-rw-r--r-- lib/CodeGen/Analysis.cpp | 11
-rw-r--r-- lib/CodeGen/Android.mk | 2
-rw-r--r-- lib/CodeGen/AsmPrinter/ARMException.cpp | 7
-rw-r--r-- lib/CodeGen/AsmPrinter/Android.mk | 66
-rw-r--r-- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 102
-rw-r--r-- lib/CodeGen/AsmPrinter/CMakeLists.txt | 2
-rw-r--r-- lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp | 79
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 33
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 161
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfDebug.h | 33
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfException.h | 134
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 62
-rw-r--r-- lib/CodeGen/AsmPrinter/DwarfUnit.h | 2
-rw-r--r-- lib/CodeGen/AsmPrinter/EHStreamer.cpp (renamed from lib/CodeGen/AsmPrinter/DwarfException.cpp) | 99
-rw-r--r-- lib/CodeGen/AsmPrinter/EHStreamer.h | 138
-rw-r--r-- lib/CodeGen/AsmPrinter/Win64Exception.cpp | 26
-rw-r--r-- lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp | 2
-rw-r--r-- lib/CodeGen/AtomicExpandLoadLinkedPass.cpp | 133
-rw-r--r-- lib/CodeGen/BasicTargetTransformInfo.cpp | 23
-rw-r--r-- lib/CodeGen/BranchFolding.cpp | 15
-rw-r--r-- lib/CodeGen/CMakeLists.txt | 2
-rw-r--r-- lib/CodeGen/CodeGenPrepare.cpp | 22
-rw-r--r-- lib/CodeGen/CriticalAntiDepBreaker.cpp | 51
-rw-r--r-- lib/CodeGen/CriticalAntiDepBreaker.h | 6
-rw-r--r-- lib/CodeGen/GlobalMerge.cpp (renamed from lib/Transforms/Scalar/GlobalMerge.cpp) | 76
-rw-r--r-- lib/CodeGen/JumpInstrTables.cpp | 301
-rw-r--r-- lib/CodeGen/LLVMTargetMachine.cpp | 12
-rw-r--r-- lib/CodeGen/LiveDebugVariables.cpp | 15
-rw-r--r-- lib/CodeGen/LiveDebugVariables.h | 3
-rw-r--r-- lib/CodeGen/LiveIntervalAnalysis.cpp | 27
-rw-r--r-- lib/CodeGen/MachineBasicBlock.cpp | 2
-rw-r--r-- lib/CodeGen/MachineFunction.cpp | 45
-rw-r--r-- lib/CodeGen/MachineScheduler.cpp | 336
-rw-r--r-- lib/CodeGen/Passes.cpp | 16
-rw-r--r-- lib/CodeGen/PeepholeOptimizer.cpp | 381
-rw-r--r-- lib/CodeGen/PrologEpilogInserter.cpp | 87
-rw-r--r-- lib/CodeGen/RegAllocGreedy.cpp | 22
-rw-r--r-- lib/CodeGen/RegisterPressure.cpp | 5
-rw-r--r-- lib/CodeGen/ScheduleDAGInstrs.cpp | 5
-rw-r--r-- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 529
-rw-r--r-- lib/CodeGen/SelectionDAG/FastISel.cpp | 158
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 154
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 91
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 11
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 6
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 22
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 278
-rw-r--r-- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 135
-rw-r--r-- lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 2
-rw-r--r-- lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 3
-rw-r--r-- lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 3
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 389
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 318
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 4
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 4
-rw-r--r-- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 51
-rw-r--r-- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 41
-rw-r--r-- lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp | 4
-rw-r--r-- lib/CodeGen/StackMapLivenessAnalysis.cpp | 20
-rw-r--r-- lib/CodeGen/TargetInstrInfo.cpp | 2
-rw-r--r-- lib/CodeGen/TargetLoweringBase.cpp | 110
-rw-r--r-- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 170
-rw-r--r-- lib/DebugInfo/DWARFContext.cpp | 2
-rw-r--r-- lib/DebugInfo/DWARFDebugAranges.cpp | 84
-rw-r--r-- lib/DebugInfo/DWARFDebugAranges.h | 32
-rw-r--r-- lib/DebugInfo/DWARFDebugInfoEntry.cpp | 10
-rw-r--r-- lib/DebugInfo/DWARFDebugInfoEntry.h | 2
-rw-r--r-- lib/DebugInfo/DWARFUnit.cpp | 7
-rw-r--r-- lib/ExecutionEngine/ExecutionEngine.cpp | 85
-rw-r--r-- lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp | 4
-rw-r--r-- lib/ExecutionEngine/Interpreter/Interpreter.cpp | 2
-rw-r--r-- lib/ExecutionEngine/JIT/JIT.cpp | 44
-rw-r--r-- lib/ExecutionEngine/JIT/JIT.h | 12
-rw-r--r-- lib/ExecutionEngine/JIT/JITEmitter.cpp | 42
-rw-r--r-- lib/ExecutionEngine/MCJIT/MCJIT.cpp | 8
-rw-r--r-- lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp | 23
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/Android.mk | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt | 1
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt | 2
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h | 3
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 8
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp | 641
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 217
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 6
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 23
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp | 127
-rw-r--r-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h | 9
-rw-r--r-- lib/IR/Android.mk | 1
-rw-r--r-- lib/IR/AsmWriter.cpp | 87
-rw-r--r-- lib/IR/AsmWriter.h | 4
-rw-r--r-- lib/IR/Attributes.cpp | 5
-rw-r--r-- lib/IR/AutoUpgrade.cpp | 23
-rw-r--r-- lib/IR/CMakeLists.txt | 1
-rw-r--r-- lib/IR/Comdat.cpp | 25
-rw-r--r-- lib/IR/ConstantFold.cpp | 34
-rw-r--r-- lib/IR/Constants.cpp | 88
-rw-r--r-- lib/IR/Core.cpp | 49
-rw-r--r-- lib/IR/DIBuilder.cpp | 38
-rw-r--r-- lib/IR/DebugInfo.cpp | 38
-rw-r--r-- lib/IR/DebugLoc.cpp | 2
-rw-r--r-- lib/IR/DiagnosticInfo.cpp | 2
-rw-r--r-- lib/IR/Function.cpp | 5
-rw-r--r-- lib/IR/GCOV.cpp | 10
-rw-r--r-- lib/IR/Globals.cpp | 56
-rw-r--r-- lib/IR/Instruction.cpp | 17
-rw-r--r-- lib/IR/Instructions.cpp | 41
-rw-r--r-- lib/IR/Metadata.cpp | 2
-rw-r--r-- lib/IR/Module.cpp | 50
-rw-r--r-- lib/IR/Pass.cpp | 30
-rw-r--r-- lib/IR/PassRegistry.cpp | 114
-rw-r--r-- lib/IR/Value.cpp | 66
-rw-r--r-- lib/IR/Verifier.cpp | 120
-rw-r--r-- lib/IRReader/IRReader.cpp | 39
-rw-r--r-- lib/LTO/LLVMBuild.txt | 2
-rw-r--r-- lib/LTO/LTOCodeGenerator.cpp | 34
-rw-r--r-- lib/LTO/LTOModule.cpp | 486
-rw-r--r-- lib/Linker/LinkModules.cpp | 337
-rw-r--r-- lib/MC/Android.mk | 9
-rw-r--r-- lib/MC/CMakeLists.txt | 10
-rw-r--r-- lib/MC/ConstantPools.cpp | 95
-rw-r--r-- lib/MC/ELFObjectWriter.cpp | 8
-rw-r--r-- lib/MC/LLVMBuild.txt | 4
-rw-r--r-- lib/MC/MCAnalysis/Android.mk | 37
-rw-r--r-- lib/MC/MCAnalysis/CMakeLists.txt | 8
-rw-r--r-- lib/MC/MCAnalysis/LLVMBuild.txt | 5
-rw-r--r-- lib/MC/MCAnalysis/MCAtom.cpp (renamed from lib/MC/MCAtom.cpp) | 4
-rw-r--r-- lib/MC/MCAnalysis/MCFunction.cpp (renamed from lib/MC/MCFunction.cpp) | 6
-rw-r--r-- lib/MC/MCAnalysis/MCModule.cpp (renamed from lib/MC/MCModule.cpp) | 6
-rw-r--r-- lib/MC/MCAnalysis/MCModuleYAML.cpp (renamed from lib/MC/MCModuleYAML.cpp) | 12
-rw-r--r-- lib/MC/MCAnalysis/MCObjectDisassembler.cpp (renamed from lib/MC/MCObjectDisassembler.cpp) | 6
-rw-r--r-- lib/MC/MCAnalysis/MCObjectSymbolizer.cpp (renamed from lib/MC/MCObjectSymbolizer.cpp) | 0
-rw-r--r-- lib/MC/MCAnalysis/Makefile (renamed from unittests/Object/Makefile) | 11
-rw-r--r-- lib/MC/MCAsmInfo.cpp | 5
-rw-r--r-- lib/MC/MCAsmStreamer.cpp | 111
-rw-r--r-- lib/MC/MCAssembler.cpp | 29
-rw-r--r-- lib/MC/MCContext.cpp | 53
-rw-r--r-- lib/MC/MCDwarf.cpp | 221
-rw-r--r-- lib/MC/MCELFStreamer.cpp | 4
-rw-r--r-- lib/MC/MCMachOStreamer.cpp | 4
-rw-r--r-- lib/MC/MCNullStreamer.cpp | 66
-rw-r--r-- lib/MC/MCObjectFileInfo.cpp | 136
-rw-r--r-- lib/MC/MCObjectStreamer.cpp | 51
-rw-r--r-- lib/MC/MCParser/AsmLexer.cpp | 23
-rw-r--r-- lib/MC/MCParser/AsmParser.cpp | 203
-rw-r--r-- lib/MC/MCParser/COFFAsmParser.cpp | 92
-rw-r--r-- lib/MC/MCParser/DarwinAsmParser.cpp | 2
-rw-r--r-- lib/MC/MCParser/ELFAsmParser.cpp | 104
-rw-r--r-- lib/MC/MCSectionCOFF.cpp | 9
-rw-r--r-- lib/MC/MCStreamer.cpp | 137
-rw-r--r-- lib/MC/MCTargetOptions.cpp | 3
-rw-r--r-- lib/MC/MCWin64EH.cpp | 50
-rw-r--r-- lib/MC/MachObjectWriter.cpp | 50
-rw-r--r-- lib/MC/Makefile | 2
-rw-r--r-- lib/MC/StringTableBuilder.cpp (renamed from lib/Object/StringTableBuilder.cpp) | 2
-rw-r--r-- lib/MC/WinCOFFObjectWriter.cpp | 42
-rw-r--r-- lib/MC/WinCOFFStreamer.cpp | 6
-rw-r--r-- lib/MC/YAML.cpp (renamed from lib/Object/YAML.cpp) | 17
-rw-r--r-- lib/Object/Android.mk | 8
-rw-r--r-- lib/Object/Archive.cpp | 182
-rw-r--r-- lib/Object/Binary.cpp | 28
-rw-r--r-- lib/Object/CMakeLists.txt | 3
-rw-r--r-- lib/Object/COFFObjectFile.cpp | 287
-rw-r--r-- lib/Object/ELFObjectFile.cpp | 51
-rw-r--r-- lib/Object/ELFYAML.cpp | 13
-rw-r--r-- lib/Object/Error.cpp | 17
-rw-r--r-- lib/Object/IRObjectFile.cpp | 208
-rw-r--r-- lib/Object/LLVMBuild.txt | 2
-rw-r--r-- lib/Object/MachOObjectFile.cpp | 696
-rw-r--r-- lib/Object/MachOUniversal.cpp | 75
-rw-r--r-- lib/Object/Object.cpp | 32
-rw-r--r-- lib/Object/ObjectFile.cpp | 38
-rw-r--r-- lib/Object/RecordStreamer.cpp | 100
-rw-r--r-- lib/Object/RecordStreamer.h | 42
-rw-r--r-- lib/Object/SymbolicFile.cpp | 14
-rw-r--r-- lib/Option/ArgList.cpp | 56
-rw-r--r-- lib/ProfileData/InstrProf.cpp | 13
-rw-r--r-- lib/ProfileData/InstrProfReader.cpp | 45
-rw-r--r-- lib/ProfileData/InstrProfWriter.cpp | 7
-rw-r--r-- lib/Support/APFloat.cpp | 4
-rw-r--r-- lib/Support/ARMWinEH.cpp | 38
-rw-r--r-- lib/Support/Android.mk | 7
-rw-r--r-- lib/Support/Atomic.cpp | 2
-rw-r--r-- lib/Support/CMakeLists.txt | 7
-rw-r--r-- lib/Support/CommandLine.cpp | 26
-rw-r--r-- lib/Support/ConvertUTF.c | 153
-rw-r--r-- lib/Support/CrashRecoveryContext.cpp | 29
-rw-r--r-- lib/Support/DataExtractor.cpp | 2
-rw-r--r-- lib/Support/DataStream.cpp | 8
-rw-r--r-- lib/Support/DynamicLibrary.cpp | 2
-rw-r--r-- lib/Support/ErrorHandling.cpp | 92
-rw-r--r-- lib/Support/FileOutputBuffer.cpp | 19
-rw-r--r-- lib/Support/FileUtilities.cpp | 17
-rw-r--r-- lib/Support/GraphWriter.cpp | 257
-rw-r--r-- lib/Support/Host.cpp | 6
-rw-r--r-- lib/Support/LockFileManager.cpp | 19
-rw-r--r-- lib/Support/Makefile | 4
-rw-r--r-- lib/Support/ManagedStatic.cpp | 18
-rw-r--r-- lib/Support/MemoryBuffer.cpp | 134
-rw-r--r-- lib/Support/Path.cpp | 170
-rw-r--r-- lib/Support/Process.cpp | 32
-rw-r--r-- lib/Support/Program.cpp | 7
-rw-r--r-- lib/Support/RandomNumberGenerator.cpp | 61
-rw-r--r-- lib/Support/ScaledNumber.cpp | 319
-rw-r--r-- lib/Support/SourceMgr.cpp | 86
-rw-r--r-- lib/Support/SpecialCaseList.cpp (renamed from lib/Transforms/Utils/SpecialCaseList.cpp) | 81
-rw-r--r-- lib/Support/StringMap.cpp | 10
-rw-r--r-- lib/Support/StringPool.cpp | 2
-rw-r--r-- lib/Support/TargetRegistry.cpp | 11
-rw-r--r-- lib/Support/Threading.cpp | 41
-rw-r--r-- lib/Support/TimeValue.cpp | 2
-rw-r--r-- lib/Support/Timer.cpp | 4
-rw-r--r-- lib/Support/Triple.cpp | 16
-rw-r--r-- lib/Support/Unix/Memory.inc | 26
-rw-r--r-- lib/Support/Unix/Path.inc | 200
-rw-r--r-- lib/Support/Unix/Process.inc | 10
-rw-r--r-- lib/Support/Unix/Program.inc | 15
-rw-r--r-- lib/Support/Unix/system_error.inc | 34
-rw-r--r-- lib/Support/Windows/DynamicLibrary.inc | 2
-rw-r--r-- lib/Support/Windows/Memory.inc | 31
-rw-r--r-- lib/Support/Windows/Path.inc | 299
-rw-r--r-- lib/Support/Windows/Process.inc | 17
-rw-r--r-- lib/Support/Windows/Program.inc | 18
-rw-r--r-- lib/Support/Windows/WindowsSupport.h | 9
-rw-r--r-- lib/Support/Windows/system_error.inc | 142
-rw-r--r-- lib/Support/YAMLTraits.cpp | 9
-rw-r--r-- lib/Support/raw_ostream.cpp | 4
-rw-r--r-- lib/Support/regcclass.h | 5
-rw-r--r-- lib/Support/regcname.h | 5
-rw-r--r-- lib/Support/regex2.h | 5
-rw-r--r-- lib/Support/regutils.h | 5
-rw-r--r-- lib/Support/system_error.cpp | 130
-rw-r--r-- lib/TableGen/Android.mk | 1
-rw-r--r-- lib/TableGen/CMakeLists.txt | 1
-rw-r--r-- lib/TableGen/Main.cpp | 14
-rw-r--r-- lib/TableGen/Record.cpp | 10
-rw-r--r-- lib/TableGen/SetTheory.cpp (renamed from utils/TableGen/SetTheory.cpp) | 2
-rw-r--r-- lib/TableGen/TGLexer.cpp | 22
-rw-r--r-- lib/TableGen/TGLexer.h | 6
-rw-r--r-- lib/TableGen/TGParser.cpp | 29
-rw-r--r-- lib/TableGen/TGParser.h | 2
-rw-r--r-- lib/Target/AArch64/AArch64.td | 3
-rw-r--r-- lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 21
-rw-r--r-- lib/Target/AArch64/AArch64AsmPrinter.cpp | 2
-rw-r--r-- lib/Target/AArch64/AArch64BranchRelaxation.cpp | 6
-rw-r--r-- lib/Target/AArch64/AArch64CallingConvention.td | 14
-rw-r--r-- lib/Target/AArch64/AArch64FastISel.cpp | 51
-rw-r--r-- lib/Target/AArch64/AArch64FrameLowering.cpp | 7
-rw-r--r-- lib/Target/AArch64/AArch64FrameLowering.h | 11
-rw-r--r-- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 10
-rw-r--r-- lib/Target/AArch64/AArch64ISelLowering.cpp | 279
-rw-r--r-- lib/Target/AArch64/AArch64ISelLowering.h | 5
-rw-r--r-- lib/Target/AArch64/AArch64InstrFormats.td | 65
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.cpp | 48
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.h | 4
-rw-r--r-- lib/Target/AArch64/AArch64InstrInfo.td | 89
-rw-r--r-- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 81
-rw-r--r-- lib/Target/AArch64/AArch64MCInstLower.cpp | 4
-rw-r--r-- lib/Target/AArch64/AArch64RegisterInfo.td | 2
-rw-r--r-- lib/Target/AArch64/AArch64SchedA53.td | 4
-rw-r--r-- lib/Target/AArch64/AArch64SchedA57.td | 304
-rw-r--r-- lib/Target/AArch64/AArch64SchedA57WriteRes.td | 512
-rw-r--r-- lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 11
-rw-r--r-- lib/Target/AArch64/AArch64SelectionDAGInfo.h | 6
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.cpp | 28
-rw-r--r-- lib/Target/AArch64/AArch64Subtarget.h | 34
-rw-r--r-- lib/Target/AArch64/AArch64TargetMachine.cpp | 36
-rw-r--r-- lib/Target/AArch64/AArch64TargetMachine.h | 27
-rw-r--r-- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 64
-rw-r--r-- lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 461
-rw-r--r-- lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp | 3
-rw-r--r-- lib/Target/AArch64/Disassembler/CMakeLists.txt | 8
-rw-r--r-- lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 4
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 2
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 2
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 8
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 33
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 2
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 2
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 40
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/Android.mk | 3
-rw-r--r-- lib/Target/AArch64/MCTargetDesc/CMakeLists.txt | 1
-rw-r--r-- lib/Target/AArch64/Utils/AArch64BaseInfo.h | 20
-rw-r--r-- lib/Target/ARM/A15SDOptimizer.cpp | 3
-rw-r--r-- lib/Target/ARM/ARMAsmPrinter.cpp | 89
-rw-r--r-- lib/Target/ARM/ARMBaseInstrInfo.cpp | 38
-rw-r--r-- lib/Target/ARM/ARMBaseInstrInfo.h | 9
-rw-r--r-- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 9
-rw-r--r-- lib/Target/ARM/ARMCodeEmitter.cpp | 1
-rw-r--r-- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 14
-rw-r--r-- lib/Target/ARM/ARMFastISel.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMFrameLowering.cpp | 18
-rw-r--r-- lib/Target/ARM/ARMFrameLowering.h | 6
-rw-r--r-- lib/Target/ARM/ARMISelDAGToDAG.cpp | 17
-rw-r--r-- lib/Target/ARM/ARMISelLowering.cpp | 205
-rw-r--r-- lib/Target/ARM/ARMISelLowering.h | 6
-rw-r--r-- lib/Target/ARM/ARMInstrInfo.td | 35
-rw-r--r-- lib/Target/ARM/ARMInstrNEON.td | 165
-rw-r--r-- lib/Target/ARM/ARMInstrThumb2.td | 9
-rw-r--r-- lib/Target/ARM/ARMJITInfo.cpp | 8
-rw-r--r-- lib/Target/ARM/ARMJITInfo.h | 8
-rw-r--r-- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 20
-rw-r--r-- lib/Target/ARM/ARMMCInstLower.cpp | 2
-rw-r--r-- lib/Target/ARM/ARMMachineFunctionInfo.cpp | 10
-rw-r--r-- lib/Target/ARM/ARMMachineFunctionInfo.h | 13
-rw-r--r-- lib/Target/ARM/ARMSelectionDAGInfo.cpp | 18
-rw-r--r-- lib/Target/ARM/ARMSelectionDAGInfo.h | 6
-rw-r--r-- lib/Target/ARM/ARMSubtarget.cpp | 126
-rw-r--r-- lib/Target/ARM/ARMSubtarget.h | 52
-rw-r--r-- lib/Target/ARM/ARMTargetMachine.cpp | 164
-rw-r--r-- lib/Target/ARM/ARMTargetMachine.h | 119
-rw-r--r-- lib/Target/ARM/ARMTargetTransformInfo.cpp | 67
-rw-r--r-- lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 773
-rw-r--r-- lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 4
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 7
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 11
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 3
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp | 31
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMCExpr.h | 2
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 6
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 2
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 10
-rw-r--r-- lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 235
-rw-r--r-- lib/Target/ARM/Thumb1FrameLowering.cpp | 3
-rw-r--r-- lib/Target/ARM/Thumb1FrameLowering.h | 9
-rw-r--r-- lib/Target/ARM/Thumb2SizeReduction.cpp | 3
-rw-r--r-- lib/Target/CppBackend/CPPBackend.cpp | 6
-rw-r--r-- lib/Target/Hexagon/HexagonFrameLowering.cpp | 4
-rw-r--r-- lib/Target/Hexagon/HexagonFrameLowering.h | 6
-rw-r--r-- lib/Target/Hexagon/HexagonISelLowering.cpp | 767
-rw-r--r-- lib/Target/Hexagon/HexagonISelLowering.h | 5
-rw-r--r-- lib/Target/Hexagon/HexagonInstrInfo.cpp | 7
-rw-r--r-- lib/Target/Hexagon/HexagonMachineScheduler.cpp | 4
-rw-r--r-- lib/Target/Hexagon/HexagonMachineScheduler.h | 2
-rw-r--r-- lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 6
-rw-r--r-- lib/Target/Hexagon/HexagonSelectionDAGInfo.h | 4
-rw-r--r-- lib/Target/Hexagon/HexagonSubtarget.cpp | 15
-rw-r--r-- lib/Target/Hexagon/HexagonSubtarget.h | 27
-rw-r--r-- lib/Target/Hexagon/HexagonTargetMachine.cpp | 11
-rw-r--r-- lib/Target/Hexagon/HexagonTargetMachine.h | 32
-rw-r--r-- lib/Target/MSP430/MSP430FrameLowering.h | 9
-rw-r--r-- lib/Target/MSP430/MSP430ISelLowering.cpp | 11
-rw-r--r-- lib/Target/MSP430/MSP430ISelLowering.h | 8
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.cpp | 4
-rw-r--r-- lib/Target/MSP430/MSP430InstrInfo.h | 4
-rw-r--r-- lib/Target/MSP430/MSP430RegisterInfo.cpp | 6
-rw-r--r-- lib/Target/MSP430/MSP430RegisterInfo.h | 11
-rw-r--r-- lib/Target/MSP430/MSP430SelectionDAGInfo.cpp | 5
-rw-r--r-- lib/Target/MSP430/MSP430SelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/MSP430/MSP430Subtarget.cpp | 19
-rw-r--r-- lib/Target/MSP430/MSP430Subtarget.h | 25
-rw-r--r-- lib/Target/MSP430/MSP430TargetMachine.cpp | 14
-rw-r--r-- lib/Target/MSP430/MSP430TargetMachine.h | 36
-rw-r--r-- lib/Target/Mips/Android.mk | 1
-rw-r--r-- lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 781
-rw-r--r-- lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 172
-rw-r--r-- lib/Target/Mips/MCTargetDesc/Android.mk | 1
-rw-r--r-- lib/Target/Mips/MCTargetDesc/CMakeLists.txt | 1
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp | 60
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h | 237
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 18
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 2
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 6
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 6
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 2
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 41
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h | 4
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 32
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 2
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 12
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 36
-rw-r--r-- lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 171
-rw-r--r-- lib/Target/Mips/MicroMipsInstrFPU.td | 12
-rw-r--r-- lib/Target/Mips/MicroMipsInstrInfo.td | 1
-rw-r--r-- lib/Target/Mips/Mips.td | 5
-rw-r--r-- lib/Target/Mips/Mips16FrameLowering.cpp | 4
-rw-r--r-- lib/Target/Mips/Mips16FrameLowering.h | 3
-rw-r--r-- lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 6
-rw-r--r-- lib/Target/Mips/Mips16ISelLowering.cpp | 7
-rw-r--r-- lib/Target/Mips/Mips16ISelLowering.h | 4
-rw-r--r-- lib/Target/Mips/Mips16InstrInfo.td | 8
-rw-r--r-- lib/Target/Mips/Mips32r6InstrFormats.td | 177
-rw-r--r-- lib/Target/Mips/Mips32r6InstrInfo.td | 441
-rw-r--r-- lib/Target/Mips/Mips64InstrInfo.td | 97
-rw-r--r-- lib/Target/Mips/Mips64r6InstrInfo.td | 163
-rw-r--r-- lib/Target/Mips/MipsAsmPrinter.cpp | 56
-rw-r--r-- lib/Target/Mips/MipsAsmPrinter.h | 6
-rw-r--r-- lib/Target/Mips/MipsCallingConv.td | 5
-rw-r--r-- lib/Target/Mips/MipsCodeEmitter.cpp | 7
-rw-r--r-- lib/Target/Mips/MipsCondMov.td | 146
-rw-r--r-- lib/Target/Mips/MipsDSPInstrFormats.td | 4
-rw-r--r-- lib/Target/Mips/MipsDelaySlotFiller.cpp | 8
-rw-r--r-- lib/Target/Mips/MipsFastISel.cpp | 175
-rw-r--r-- lib/Target/Mips/MipsFrameLowering.h | 1
-rw-r--r-- lib/Target/Mips/MipsISelDAGToDAG.cpp | 3
-rw-r--r-- lib/Target/Mips/MipsISelDAGToDAG.h | 4
-rw-r--r-- lib/Target/Mips/MipsISelLowering.cpp | 247
-rw-r--r-- lib/Target/Mips/MipsISelLowering.h | 15
-rw-r--r-- lib/Target/Mips/MipsInstrFPU.td | 116
-rw-r--r-- lib/Target/Mips/MipsInstrFormats.td | 52
-rw-r--r-- lib/Target/Mips/MipsInstrInfo.td | 313
-rw-r--r-- lib/Target/Mips/MipsLongBranch.cpp | 62
-rw-r--r-- lib/Target/Mips/MipsMSAInstrFormats.td | 2
-rw-r--r-- lib/Target/Mips/MipsMachineFunction.h | 1
-rw-r--r-- lib/Target/Mips/MipsRegisterInfo.cpp | 11
-rw-r--r-- lib/Target/Mips/MipsRegisterInfo.td | 16
-rw-r--r-- lib/Target/Mips/MipsSEFrameLowering.cpp | 4
-rw-r--r-- lib/Target/Mips/MipsSEFrameLowering.h | 3
-rw-r--r-- lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 24
-rw-r--r-- lib/Target/Mips/MipsSEISelLowering.cpp | 99
-rw-r--r-- lib/Target/Mips/MipsSEISelLowering.h | 12
-rw-r--r-- lib/Target/Mips/MipsSEInstrInfo.cpp | 57
-rw-r--r-- lib/Target/Mips/MipsSEInstrInfo.h | 3
-rw-r--r-- lib/Target/Mips/MipsSelectionDAGInfo.cpp | 5
-rw-r--r-- lib/Target/Mips/MipsSelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/Mips/MipsSubtarget.cpp | 145
-rw-r--r-- lib/Target/Mips/MipsSubtarget.h | 72
-rw-r--r-- lib/Target/Mips/MipsTargetMachine.cpp | 86
-rw-r--r-- lib/Target/Mips/MipsTargetMachine.h | 71
-rw-r--r-- lib/Target/Mips/MipsTargetStreamer.h | 108
-rw-r--r-- lib/Target/NVPTX/NVPTX.td | 7
-rw-r--r-- lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 72
-rw-r--r-- lib/Target/NVPTX/NVPTXFrameLowering.cpp | 24
-rw-r--r-- lib/Target/NVPTX/NVPTXFrameLowering.h | 8
-rw-r--r-- lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 2
-rw-r--r-- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 532
-rw-r--r-- lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 4
-rw-r--r-- lib/Target/NVPTX/NVPTXISelLowering.cpp | 797
-rw-r--r-- lib/Target/NVPTX/NVPTXISelLowering.h | 19
-rw-r--r-- lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 2
-rw-r--r-- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 4
-rw-r--r-- lib/Target/NVPTX/NVPTXInstrInfo.h | 3
-rw-r--r-- lib/Target/NVPTX/NVPTXInstrInfo.td | 348
-rw-r--r-- lib/Target/NVPTX/NVPTXIntrinsics.td | 418
-rw-r--r-- lib/Target/NVPTX/NVPTXMCExpr.h | 2
-rw-r--r-- lib/Target/NVPTX/NVPTXRegisterInfo.td | 7
-rw-r--r-- lib/Target/NVPTX/NVPTXSubtarget.cpp | 57
-rw-r--r-- lib/Target/NVPTX/NVPTXSubtarget.h | 35
-rw-r--r-- lib/Target/NVPTX/NVPTXTargetMachine.cpp | 66
-rw-r--r-- lib/Target/NVPTX/NVPTXTargetMachine.h | 42
-rw-r--r-- lib/Target/NVPTX/NVVMReflect.cpp | 69
-rw-r--r-- lib/Target/NVPTX/cl_common_defines.h | 6
-rw-r--r-- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 103
-rw-r--r-- lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 46
-rw-r--r-- lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 32
-rw-r--r-- lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h | 2
-rw-r--r-- lib/Target/PowerPC/PPC.td | 10
-rw-r--r-- lib/Target/PowerPC/PPCAsmPrinter.cpp | 24
-rw-r--r-- lib/Target/PowerPC/PPCFastISel.cpp | 38
-rw-r--r-- lib/Target/PowerPC/PPCFrameLowering.cpp | 168
-rw-r--r-- lib/Target/PowerPC/PPCFrameLowering.h | 198
-rw-r--r-- lib/Target/PowerPC/PPCHazardRecognizers.cpp | 11
-rw-r--r-- lib/Target/PowerPC/PPCHazardRecognizers.h | 4
-rw-r--r-- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 14
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.cpp | 1002
-rw-r--r-- lib/Target/PowerPC/PPCISelLowering.h | 34
-rw-r--r-- lib/Target/PowerPC/PPCInstr64Bit.td | 16
-rw-r--r-- lib/Target/PowerPC/PPCInstrAltivec.td | 62
-rw-r--r-- lib/Target/PowerPC/PPCInstrFormats.td | 14
-rw-r--r-- lib/Target/PowerPC/PPCInstrInfo.cpp | 58
-rw-r--r-- lib/Target/PowerPC/PPCInstrInfo.h | 6
-rw-r--r-- lib/Target/PowerPC/PPCInstrInfo.td | 3
-rw-r--r-- lib/Target/PowerPC/PPCJITInfo.cpp | 9
-rw-r--r-- lib/Target/PowerPC/PPCJITInfo.h | 43
-rw-r--r-- lib/Target/PowerPC/PPCRegisterInfo.cpp | 8
-rw-r--r-- lib/Target/PowerPC/PPCSelectionDAGInfo.cpp | 8
-rw-r--r-- lib/Target/PowerPC/PPCSelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/PowerPC/PPCSubtarget.cpp | 58
-rw-r--r-- lib/Target/PowerPC/PPCSubtarget.h | 29
-rw-r--r-- lib/Target/PowerPC/PPCTargetMachine.cpp | 51
-rw-r--r-- lib/Target/PowerPC/PPCTargetMachine.h | 35
-rw-r--r-- lib/Target/R600/AMDGPU.h | 8
-rw-r--r-- lib/Target/R600/AMDGPU.td | 46
-rw-r--r-- lib/Target/R600/AMDGPUAsmPrinter.cpp | 70
-rw-r--r-- lib/Target/R600/AMDGPUAsmPrinter.h | 23
-rw-r--r-- lib/Target/R600/AMDGPUConvertToISA.cpp | 62
-rw-r--r-- lib/Target/R600/AMDGPUFrameLowering.cpp | 2
-rw-r--r-- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 209
-rw-r--r-- lib/Target/R600/AMDGPUISelLowering.cpp | 987
-rw-r--r-- lib/Target/R600/AMDGPUISelLowering.h | 95
-rw-r--r-- lib/Target/R600/AMDGPUInstrInfo.cpp | 28
-rw-r--r-- lib/Target/R600/AMDGPUInstrInfo.h | 19
-rw-r--r-- lib/Target/R600/AMDGPUInstrInfo.td | 69
-rw-r--r-- lib/Target/R600/AMDGPUInstructions.td | 97
-rw-r--r-- lib/Target/R600/AMDGPUIntrinsicInfo.cpp (renamed from lib/Target/R600/AMDILIntrinsicInfo.cpp) | 40
-rw-r--r-- lib/Target/R600/AMDGPUIntrinsicInfo.h (renamed from lib/Target/R600/AMDILIntrinsicInfo.h) | 21
-rw-r--r-- lib/Target/R600/AMDGPUIntrinsics.td | 29
-rw-r--r-- lib/Target/R600/AMDGPUMCInstLower.cpp | 1
-rw-r--r-- lib/Target/R600/AMDGPUMCInstLower.h | 4
-rw-r--r-- lib/Target/R600/AMDGPUPromoteAlloca.cpp | 387
-rw-r--r-- lib/Target/R600/AMDGPURegisterInfo.cpp | 4
-rw-r--r-- lib/Target/R600/AMDGPURegisterInfo.h | 14
-rw-r--r-- lib/Target/R600/AMDGPUSubtarget.cpp | 100
-rw-r--r-- lib/Target/R600/AMDGPUSubtarget.h | 100
-rw-r--r-- lib/Target/R600/AMDGPUTargetMachine.cpp | 13
-rw-r--r-- lib/Target/R600/AMDGPUTargetMachine.h | 7
-rw-r--r-- lib/Target/R600/AMDILBase.td | 25
-rw-r--r-- lib/Target/R600/AMDILISelLowering.cpp | 560
-rw-r--r-- lib/Target/R600/AMDILInstrInfo.td | 150
-rw-r--r-- lib/Target/R600/AMDILIntrinsics.td | 224
-rw-r--r-- lib/Target/R600/AMDILRegisterInfo.td | 107
-rw-r--r-- lib/Target/R600/CMakeLists.txt | 6
-rw-r--r-- lib/Target/R600/EvergreenInstructions.td | 11
-rw-r--r-- lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 15
-rw-r--r-- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 14
-rw-r--r-- lib/Target/R600/R600ControlFlowFinalizer.cpp | 1
-rw-r--r-- lib/Target/R600/R600ISelLowering.cpp | 314
-rw-r--r-- lib/Target/R600/R600ISelLowering.h | 9
-rw-r--r-- lib/Target/R600/R600InstrInfo.cpp | 90
-rw-r--r-- lib/Target/R600/R600InstrInfo.h | 18
-rw-r--r-- lib/Target/R600/R600Instructions.td | 158
-rw-r--r-- lib/Target/R600/R600MachineScheduler.cpp | 1
-rw-r--r-- lib/Target/R600/R600Packetizer.cpp | 1
-rw-r--r-- lib/Target/R600/R600RegisterInfo.cpp | 17
-rw-r--r-- lib/Target/R600/R600RegisterInfo.h | 12
-rw-r--r-- lib/Target/R600/R600RegisterInfo.td | 48
-rw-r--r-- lib/Target/R600/SIAnnotateControlFlow.cpp | 46
-rw-r--r-- lib/Target/R600/SIDefines.h | 50
-rw-r--r-- lib/Target/R600/SIFixSGPRLiveRanges.cpp | 110
-rw-r--r-- lib/Target/R600/SIISelLowering.cpp | 303
-rw-r--r-- lib/Target/R600/SIISelLowering.h | 10
-rw-r--r-- lib/Target/R600/SIInsertWaits.cpp | 2
-rw-r--r-- lib/Target/R600/SIInstrFormats.td | 26
-rw-r--r-- lib/Target/R600/SIInstrInfo.cpp | 223
-rw-r--r-- lib/Target/R600/SIInstrInfo.h | 12
-rw-r--r-- lib/Target/R600/SIInstrInfo.td | 152
-rw-r--r-- lib/Target/R600/SIInstructions.td | 868
-rw-r--r-- lib/Target/R600/SIIntrinsics.td | 50
-rw-r--r-- lib/Target/R600/SILowerControlFlow.cpp | 87
-rw-r--r-- lib/Target/R600/SIMachineFunctionInfo.cpp | 6
-rw-r--r-- lib/Target/R600/SIRegisterInfo.cpp | 34
-rw-r--r-- lib/Target/R600/SIRegisterInfo.h | 19
-rw-r--r-- lib/Target/R600/SIRegisterInfo.td | 2
-rw-r--r-- lib/Target/R600/SITypeRewriter.cpp | 3
-rw-r--r-- lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 153
-rw-r--r-- lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp | 2
-rw-r--r-- lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 4
-rw-r--r-- lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 2
-rw-r--r-- lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 36
-rw-r--r-- lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h | 2
-rw-r--r-- lib/Target/Sparc/SparcFrameLowering.cpp | 11
-rw-r--r-- lib/Target/Sparc/SparcFrameLowering.h | 9
-rw-r--r-- lib/Target/Sparc/SparcISelLowering.cpp | 4
-rw-r--r-- lib/Target/Sparc/SparcJITInfo.cpp | 3
-rw-r--r-- lib/Target/Sparc/SparcSelectionDAGInfo.cpp | 6
-rw-r--r-- lib/Target/Sparc/SparcSelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/Sparc/SparcSubtarget.cpp | 52
-rw-r--r-- lib/Target/Sparc/SparcSubtarget.h | 26
-rw-r--r-- lib/Target/Sparc/SparcTargetMachine.cpp | 32
-rw-r--r-- lib/Target/Sparc/SparcTargetMachine.h | 36
-rw-r--r-- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 172
-rw-r--r-- lib/Target/SystemZ/SystemZCallingConv.td | 10
-rw-r--r-- lib/Target/SystemZ/SystemZFrameLowering.cpp | 22
-rw-r--r-- lib/Target/SystemZ/SystemZFrameLowering.h | 8
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.cpp | 63
-rw-r--r-- lib/Target/SystemZ/SystemZISelLowering.h | 3
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFP.td | 7
-rw-r--r-- lib/Target/SystemZ/SystemZInstrFormats.td | 119
-rw-r--r-- lib/Target/SystemZ/SystemZInstrInfo.cpp | 12
-rw-r--r-- lib/Target/SystemZ/SystemZInstrInfo.h | 5
-rw-r--r-- lib/Target/SystemZ/SystemZInstrInfo.td | 44
-rw-r--r-- lib/Target/SystemZ/SystemZOperands.td | 29
-rw-r--r-- lib/Target/SystemZ/SystemZOperators.td | 12
-rw-r--r-- lib/Target/SystemZ/SystemZPatterns.td | 8
-rw-r--r-- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 29
-rw-r--r-- lib/Target/SystemZ/SystemZRegisterInfo.h | 9
-rw-r--r-- lib/Target/SystemZ/SystemZRegisterInfo.td | 25
-rw-r--r-- lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 6
-rw-r--r-- lib/Target/SystemZ/SystemZSelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.cpp | 28
-rw-r--r-- lib/Target/SystemZ/SystemZSubtarget.h | 26
-rw-r--r-- lib/Target/SystemZ/SystemZTargetMachine.cpp | 16
-rw-r--r-- lib/Target/SystemZ/SystemZTargetMachine.h | 26
-rw-r--r-- lib/Target/TargetMachine.cpp | 15
-rw-r--r-- lib/Target/TargetSubtargetInfo.cpp | 13
-rw-r--r-- lib/Target/X86/Android.mk | 1
-rw-r--r-- lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 371
-rw-r--r-- lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 9
-rw-r--r-- lib/Target/X86/AsmParser/X86AsmParser.cpp | 287
-rw-r--r-- lib/Target/X86/AsmParser/X86Operand.h | 42
-rw-r--r-- lib/Target/X86/CMakeLists.txt | 1
-rw-r--r-- lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 3
-rw-r--r-- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 46
-rw-r--r-- lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 17
-rw-r--r-- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 13
-rw-r--r-- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 3
-rw-r--r-- lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 6
-rw-r--r-- lib/Target/X86/X86.h | 4
-rw-r--r-- lib/Target/X86/X86.td | 7
-rw-r--r-- lib/Target/X86/X86AtomicExpandPass.cpp | 287
-rw-r--r-- lib/Target/X86/X86CodeEmitter.cpp | 17
-rw-r--r-- lib/Target/X86/X86FastISel.cpp | 1265
-rw-r--r-- lib/Target/X86/X86FixupLEAs.cpp | 175
-rw-r--r-- lib/Target/X86/X86FrameLowering.cpp | 407
-rw-r--r-- lib/Target/X86/X86FrameLowering.h | 20
-rw-r--r-- lib/Target/X86/X86ISelDAGToDAG.cpp | 32
-rw-r--r-- lib/Target/X86/X86ISelLowering.cpp | 3517
-rw-r--r-- lib/Target/X86/X86ISelLowering.h | 31
-rw-r--r-- lib/Target/X86/X86InstrAVX512.td | 301
-rw-r--r-- lib/Target/X86/X86InstrArithmetic.td | 6
-rw-r--r-- lib/Target/X86/X86InstrCompiler.td | 139
-rw-r--r-- lib/Target/X86/X86InstrFragmentsSIMD.td | 4
-rw-r--r-- lib/Target/X86/X86InstrInfo.cpp | 136
-rw-r--r-- lib/Target/X86/X86InstrInfo.h | 24
-rw-r--r-- lib/Target/X86/X86InstrInfo.td | 24
-rw-r--r-- lib/Target/X86/X86InstrSSE.td | 252
-rw-r--r-- lib/Target/X86/X86InstrSystem.td | 5
-rw-r--r-- lib/Target/X86/X86JITInfo.cpp | 6
-rw-r--r-- lib/Target/X86/X86JITInfo.h | 8
-rw-r--r-- lib/Target/X86/X86MCInstLower.cpp | 35
-rw-r--r-- lib/Target/X86/X86RegisterInfo.cpp | 54
-rw-r--r-- lib/Target/X86/X86RegisterInfo.h | 10
-rw-r--r-- lib/Target/X86/X86SelectionDAGInfo.cpp | 57
-rw-r--r-- lib/Target/X86/X86SelectionDAGInfo.h | 8
-rw-r--r-- lib/Target/X86/X86Subtarget.cpp | 59
-rw-r--r-- lib/Target/X86/X86Subtarget.h | 32
-rw-r--r-- lib/Target/X86/X86TargetMachine.cpp | 62
-rw-r--r-- lib/Target/X86/X86TargetMachine.h | 30
-rw-r--r-- lib/Target/X86/X86TargetTransformInfo.cpp | 167
-rw-r--r-- lib/Target/XCore/XCoreFrameLowering.cpp | 6
-rw-r--r-- lib/Target/XCore/XCoreISelLowering.cpp | 49
-rw-r--r-- lib/Target/XCore/XCoreISelLowering.h | 5
-rw-r--r-- lib/Target/XCore/XCoreInstrInfo.cpp | 9
-rw-r--r-- lib/Target/XCore/XCoreSelectionDAGInfo.cpp | 7
-rw-r--r-- lib/Target/XCore/XCoreSelectionDAGInfo.h | 2
-rw-r--r-- lib/Target/XCore/XCoreSubtarget.cpp | 10
-rw-r--r-- lib/Target/XCore/XCoreSubtarget.h | 21
-rw-r--r-- lib/Target/XCore/XCoreTargetMachine.cpp | 9
-rw-r--r-- lib/Target/XCore/XCoreTargetMachine.h | 28
-rw-r--r-- lib/Transforms/IPO/ArgumentPromotion.cpp | 27
-rw-r--r-- lib/Transforms/IPO/DeadArgumentElimination.cpp | 39
-rw-r--r-- lib/Transforms/IPO/FunctionAttrs.cpp | 19
-rw-r--r-- lib/Transforms/IPO/GlobalDCE.cpp | 43
-rw-r--r-- lib/Transforms/IPO/GlobalOpt.cpp | 45
-rw-r--r-- lib/Transforms/IPO/MergeFunctions.cpp | 530
-rw-r--r-- lib/Transforms/IPO/PassManagerBuilder.cpp | 13
-rw-r--r-- lib/Transforms/InstCombine/InstCombine.h | 6
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 266
-rw-r--r-- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 23
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCalls.cpp | 40
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCasts.cpp | 26
-rw-r--r-- lib/Transforms/InstCombine/InstCombineCompares.cpp | 82
-rw-r--r-- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 153
-rw-r--r-- lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 11
-rw-r--r-- lib/Transforms/InstCombine/InstCombineSelect.cpp | 102
-rw-r--r-- lib/Transforms/InstCombine/InstCombineShifts.cpp | 5
-rw-r--r-- lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 8
-rw-r--r-- lib/Transforms/InstCombine/InstructionCombining.cpp | 310
-rw-r--r-- lib/Transforms/Instrumentation/AddressSanitizer.cpp | 397
-rw-r--r-- lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 65
-rw-r--r-- lib/Transforms/Instrumentation/DebugIR.cpp | 5
-rw-r--r-- lib/Transforms/Instrumentation/GCOVProfiling.cpp | 20
-rw-r--r-- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 332
-rw-r--r-- lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 78
-rw-r--r-- lib/Transforms/Scalar/Android.mk | 2
-rw-r--r-- lib/Transforms/Scalar/CMakeLists.txt | 2
-rw-r--r-- lib/Transforms/Scalar/GVN.cpp | 20
-rw-r--r-- lib/Transforms/Scalar/JumpThreading.cpp | 9
-rw-r--r-- lib/Transforms/Scalar/LICM.cpp | 73
-rw-r--r-- lib/Transforms/Scalar/LoadCombine.cpp | 268
-rw-r--r-- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 4
-rw-r--r-- lib/Transforms/Scalar/LoopRerollPass.cpp | 6
-rw-r--r-- lib/Transforms/Scalar/LoopUnrollPass.cpp | 360
-rw-r--r-- lib/Transforms/Scalar/LowerAtomic.cpp | 5
-rw-r--r-- lib/Transforms/Scalar/Reassociate.cpp | 31
-rw-r--r-- lib/Transforms/Scalar/SCCP.cpp | 4
-rw-r--r-- lib/Transforms/Scalar/SROA.cpp | 18
-rw-r--r-- lib/Transforms/Scalar/SampleProfile.cpp | 7
-rw-r--r-- lib/Transforms/Scalar/Scalar.cpp | 1
-rw-r--r-- lib/Transforms/Scalar/ScalarReplAggregates.cpp | 6
-rw-r--r-- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 633
-rw-r--r-- lib/Transforms/Scalar/Sink.cpp | 6
-rw-r--r-- lib/Transforms/Utils/Android.mk | 1
-rw-r--r-- lib/Transforms/Utils/CMakeLists.txt | 1
-rw-r--r-- lib/Transforms/Utils/CloneModule.cpp | 2
-rw-r--r-- lib/Transforms/Utils/InlineFunction.cpp | 9
-rw-r--r-- lib/Transforms/Utils/LoopSimplify.cpp | 17
-rw-r--r-- lib/Transforms/Utils/LoopUnroll.cpp | 28
-rw-r--r-- lib/Transforms/Utils/LoopUnrollRuntime.cpp | 23
-rw-r--r-- lib/Transforms/Utils/LowerSwitch.cpp | 129
-rw-r--r-- lib/Transforms/Utils/SimplifyCFG.cpp | 94
-rw-r--r-- lib/Transforms/Vectorize/LoopVectorize.cpp | 373
-rw-r--r-- lib/Transforms/Vectorize/SLPVectorizer.cpp | 297
-rw-r--r-- llvm-device-build.mk | 2
-rw-r--r-- test/Analysis/BasicAA/cs-cs.ll | 221
-rw-r--r-- test/Analysis/CostModel/AArch64/lit.local.cfg | 3
-rw-r--r-- test/Analysis/CostModel/ARM/lit.local.cfg | 3
-rw-r--r-- test/Analysis/CostModel/PowerPC/lit.local.cfg | 3
-rw-r--r-- test/Analysis/CostModel/X86/alternate-shuffle-cost.ll | 347
-rw-r--r-- test/Analysis/CostModel/X86/lit.local.cfg | 3
-rw-r--r-- test/Analysis/Delinearization/multidim_only_ivs_2d.ll | 12
-rw-r--r-- test/Assembler/addrspacecast-alias.ll | 4
-rw-r--r-- test/Assembler/alias-addrspace.ll | 6
-rw-r--r-- test/Assembler/alias-to-alias.ll | 5
-rw-r--r-- test/Assembler/alias-to-alias2.ll | 7
-rw-r--r-- test/Assembler/alias-type.ll | 6
-rw-r--r-- test/Assembler/atomic.ll | 2
-rw-r--r-- test/Assembler/invalid-comdat.ll | 4
-rw-r--r-- test/Assembler/invalid-comdat2.ll | 5
-rw-r--r-- test/Assembler/upgrade-loop-metadata.ll | 41
-rw-r--r-- test/Bindings/Ocaml/target.ml | 5
-rw-r--r-- test/Bindings/llvm-c/lit.local.cfg | 5
-rw-r--r-- test/Bitcode/atomic.ll | 17
-rw-r--r-- test/Bitcode/attributes.ll | 12
-rw-r--r-- test/Bitcode/memInstructions.3.2.ll | 60
-rw-r--r-- test/Bitcode/old-aliases.ll | 8
-rw-r--r-- test/Bitcode/upgrade-loop-metadata.ll | 37
-rw-r--r-- test/Bitcode/upgrade-loop-metadata.ll.bc | bin 0 -> 640 bytes
-rw-r--r-- test/Bitcode/weak-cmpxchg-upgrade.ll | 15
-rw-r--r-- test/Bitcode/weak-cmpxchg-upgrade.ll.bc | bin 0 -> 332 bytes
-rw-r--r-- test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll | 55
-rw-r--r-- test/CodeGen/AArch64/aarch64-address-type-promotion.ll | 28
-rw-r--r-- test/CodeGen/AArch64/addsub_ext.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll | 6
-rw-r--r-- test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll) | 0
-rw-r--r-- test/CodeGen/AArch64/arm64-EXT-undef-mask.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll) | 0
-rw-r--r-- test/CodeGen/AArch64/arm64-aapcs.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-abi.ll | 27
-rw-r--r-- test/CodeGen/AArch64/arm64-ands-bad-peephole.ll | 10
-rw-r--r-- test/CodeGen/AArch64/arm64-arith.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-atomic-128.ll | 9
-rw-r--r-- test/CodeGen/AArch64/arm64-atomic.ll | 14
-rw-r--r-- test/CodeGen/AArch64/arm64-build-vector.ll | 24
-rw-r--r-- test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll | 24
-rw-r--r-- test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll | 29
-rw-r--r-- test/CodeGen/AArch64/arm64-convert-v4f64.ll | 33
-rw-r--r-- test/CodeGen/AArch64/arm64-cse.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll | 3
-rw-r--r-- test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-early-ifcvt.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll | 13
-rw-r--r-- test/CodeGen/AArch64/arm64-fp128.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-frame-index.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-misched-basic-A53.ll | 79
-rw-r--r-- test/CodeGen/AArch64/arm64-misched-basic-A57.ll | 112
-rw-r--r-- test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll | 7
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-copy.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-select_cc.ll | 4
-rw-r--r-- test/CodeGen/AArch64/arm64-shrink-v1i64.ll | 14
-rw-r--r-- test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll (renamed from test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll) | 0
-rw-r--r-- test/CodeGen/AArch64/arm64-vcvt.ll | 12
-rw-r--r-- test/CodeGen/AArch64/arm64-vshift.ll | 9
-rw-r--r-- test/CodeGen/AArch64/arm64-xaluo.ll | 2
-rw-r--r-- test/CodeGen/AArch64/atomic-ops.ll | 25
-rw-r--r-- test/CodeGen/AArch64/blockaddress.ll | 4
-rw-r--r-- test/CodeGen/AArch64/branch-relax-asm.ll | 35
-rw-r--r-- test/CodeGen/AArch64/breg.ll | 2
-rw-r--r-- test/CodeGen/AArch64/cmpxchg-idioms.ll | 93
-rw-r--r-- test/CodeGen/AArch64/compiler-ident.ll | 12
-rw-r--r-- test/CodeGen/AArch64/complex-fp-to-int.ll | 141
-rw-r--r-- test/CodeGen/AArch64/complex-int-to-fp.ll | 164
-rw-r--r-- test/CodeGen/AArch64/directcond.ll | 4
-rw-r--r-- test/CodeGen/AArch64/f16-convert.ll | 254
-rw-r--r-- test/CodeGen/AArch64/fast-isel-mul.ll | 40
-rw-r--r-- test/CodeGen/AArch64/flags-multiuse.ll | 2
-rw-r--r-- test/CodeGen/AArch64/funcptr_cast.ll | 13
-rw-r--r-- test/CodeGen/AArch64/global-merge-1.ll | 26
-rw-r--r-- test/CodeGen/AArch64/global-merge-2.ll | 51
-rw-r--r-- test/CodeGen/AArch64/global-merge-3.ll | 51
-rw-r--r-- test/CodeGen/AArch64/global-merge-4.ll (renamed from test/Transforms/GlobalMerge/AArch64/arm64.ll) | 25
-rw-r--r-- test/CodeGen/AArch64/global-merge.ll | 30
-rw-r--r-- test/CodeGen/AArch64/i128-fast-isel-fallback.ll | 18
-rw-r--r-- test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll | 26
-rw-r--r-- test/CodeGen/AArch64/jump-table.ll | 6
-rw-r--r-- test/CodeGen/AArch64/ldst-opt.ll | 468
-rw-r--r-- test/CodeGen/AArch64/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/AArch64/memcpy-f128.ll | 19
-rw-r--r-- test/CodeGen/AArch64/mul_pow2.ll | 123
-rw-r--r-- test/CodeGen/AArch64/regress-tail-livereg.ll | 14
-rw-r--r-- test/CodeGen/AArch64/trunc-v1i64.ll | 63
-rw-r--r-- test/CodeGen/AArch64/tst-br.ll | 2
-rw-r--r-- test/CodeGen/ARM/2009-11-02-NegativeLane.ll | 2
-rw-r--r-- test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll | 2
-rw-r--r-- test/CodeGen/ARM/2010-05-18-PostIndexBug.ll | 18
-rw-r--r-- test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll | 4
-rw-r--r-- test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll | 20
-rw-r--r-- test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll | 6
-rw-r--r-- test/CodeGen/ARM/2012-11-14-subs_carry.ll | 2
-rw-r--r-- test/CodeGen/ARM/2013-05-05-IfConvertBug.ll | 12
-rw-r--r-- test/CodeGen/ARM/2013-07-29-vector-or-combine.ll | 2
-rw-r--r-- test/CodeGen/ARM/Windows/dllimport.ll | 61
-rw-r--r-- test/CodeGen/ARM/Windows/global-minsize.ll | 16
-rw-r--r-- test/CodeGen/ARM/Windows/long-calls.ll | 18
-rw-r--r-- test/CodeGen/ARM/Windows/structors.ll | 12
-rw-r--r-- test/CodeGen/ARM/Windows/vla.ll | 31
-rw-r--r-- test/CodeGen/ARM/aliases.ll | 2
-rw-r--r-- test/CodeGen/ARM/arm-and-tst-peephole.ll | 8
-rw-r--r-- test/CodeGen/ARM/atomic-64bit.ll | 14
-rw-r--r-- test/CodeGen/ARM/atomic-cmp.ll | 3
-rw-r--r-- test/CodeGen/ARM/atomic-load-store.ll | 37
-rw-r--r-- test/CodeGen/ARM/atomic-op.ll | 6
-rw-r--r-- test/CodeGen/ARM/atomic-ops-v8.ll | 12
-rw-r--r-- test/CodeGen/ARM/big-endian-neon-extend.ll | 81
-rw-r--r-- test/CodeGen/ARM/big-endian-neon-trunc-store.ll | 26
-rw-r--r-- test/CodeGen/ARM/big-endian-ret-f64.ll | 12
-rw-r--r-- test/CodeGen/ARM/call-tc.ll | 6
-rw-r--r-- test/CodeGen/ARM/cmpxchg-idioms.ll | 107
-rw-r--r-- test/CodeGen/ARM/cmpxchg-weak.ll | 43
-rw-r--r-- test/CodeGen/ARM/data-in-code-annotations.ll | 2
-rw-r--r-- test/CodeGen/ARM/debug-info-arg.ll | 2
-rw-r--r-- test/CodeGen/ARM/debug-info-blocks.ll | 9
-rw-r--r-- test/CodeGen/ARM/fold-stack-adjust.ll | 16
-rw-r--r-- test/CodeGen/ARM/fptoint.ll | 2
-rw-r--r-- test/CodeGen/ARM/global-merge-1.ll (renamed from test/Transforms/GlobalMerge/ARM/arm.ll) | 8
-rw-r--r-- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2
-rw-r--r-- test/CodeGen/ARM/ifcvt10.ll | 2
-rw-r--r-- test/CodeGen/ARM/indirectbr-3.ll | 2
-rw-r--r-- test/CodeGen/ARM/interrupt-attr.ll | 8
-rw-r--r-- test/CodeGen/ARM/intrinsics-memory-barrier.ll | 55
-rw-r--r-- test/CodeGen/ARM/jump_tables.ll | 32
-rw-r--r-- test/CodeGen/ARM/ldstrex-m.ll | 59
-rw-r--r-- test/CodeGen/ARM/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/ARM/lsr-unfolded-offset.ll | 2
-rw-r--r-- test/CodeGen/ARM/metadata-default.ll | 16
-rw-r--r-- test/CodeGen/ARM/metadata-short-enums.ll | 16
-rw-r--r-- test/CodeGen/ARM/metadata-short-wchar.ll | 16
-rw-r--r-- test/CodeGen/ARM/misched-copy-arm.ll | 2
-rw-r--r-- test/CodeGen/ARM/none-macho.ll | 10
-rw-r--r-- test/CodeGen/ARM/null-streamer.ll | 7
-rw-r--r-- test/CodeGen/ARM/reg_sequence.ll | 4
-rw-r--r-- test/CodeGen/ARM/spill-q.ll | 2
-rw-r--r-- test/CodeGen/ARM/struct-byval-frame-index.ll | 2
-rw-r--r-- test/CodeGen/ARM/twoaddrinstr.ll | 2
-rw-r--r-- test/CodeGen/ARM/va_arg.ll | 4
-rw-r--r-- test/CodeGen/ARM/vldm-sched-a9.ll | 2
-rw-r--r-- test/CodeGen/ARM/widen-vmovs.ll | 6
-rw-r--r-- test/CodeGen/CPP/atomic.ll | 89
-rw-r--r-- test/CodeGen/CPP/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/Generic/MachineBranchProb.ll | 3
-rw-r--r-- test/CodeGen/Generic/select.ll | 1
-rw-r--r-- test/CodeGen/Generic/stop-after.ll | 2
-rw-r--r-- test/CodeGen/Hexagon/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/MSP430/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/Mips/2008-08-01-AsmInline.ll | 2
-rw-r--r-- test/CodeGen/Mips/2013-11-18-fp64-const0.ll | 2
-rw-r--r-- test/CodeGen/Mips/Fast-ISel/loadstore2.ll | 83
-rw-r--r-- test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll | 38
-rw-r--r-- test/CodeGen/Mips/abiflags-xx.ll | 6
-rw-r--r-- test/CodeGen/Mips/abiflags32.ll | 12
-rw-r--r-- test/CodeGen/Mips/analyzebranch.ll | 35
-rw-r--r-- test/CodeGen/Mips/atomic.ll | 626
-rw-r--r-- test/CodeGen/Mips/atomicops.ll | 3
-rw-r--r-- test/CodeGen/Mips/buildpairextractelementf64.ll | 4
-rw-r--r-- test/CodeGen/Mips/cconv/callee-saved-fpxx.ll | 58
-rw-r--r-- test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll | 24
-rw-r--r-- test/CodeGen/Mips/cmov.ll | 614
-rw-r--r-- test/CodeGen/Mips/countleading.ll | 90
-rw-r--r-- test/CodeGen/Mips/divrem.ll | 363
-rw-r--r-- test/CodeGen/Mips/dsp-r1.ll | 2
-rw-r--r-- test/CodeGen/Mips/eh-return32.ll | 14
-rw-r--r-- test/CodeGen/Mips/eh-return64.ll | 17
-rw-r--r-- test/CodeGen/Mips/ehframe-indirect.ll | 34
-rw-r--r-- test/CodeGen/Mips/fcmp.ll | 783
-rw-r--r-- test/CodeGen/Mips/fcopysign.ll | 2
-rw-r--r-- test/CodeGen/Mips/fmadd1.ll | 324
-rw-r--r-- test/CodeGen/Mips/fp-indexed-ls.ll | 192
-rw-r--r-- test/CodeGen/Mips/fpbr.ll | 93
-rw-r--r-- test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll | 3
-rw-r--r-- test/CodeGen/Mips/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/Mips/llvm-ir/call.ll | 166
-rw-r--r-- test/CodeGen/Mips/llvm-ir/indirectbr.ll | 34
-rw-r--r-- test/CodeGen/Mips/llvm-ir/ret.ll | 205
-rw-r--r-- test/CodeGen/Mips/longbranch.ll | 34
-rw-r--r-- test/CodeGen/Mips/madd-msub.ll | 241
-rw-r--r-- test/CodeGen/Mips/mips16ex.ll | 4
-rw-r--r-- test/CodeGen/Mips/mips64-f128.ll | 358
-rw-r--r-- test/CodeGen/Mips/mips64-fp-indexed-ls.ll | 110
-rw-r--r-- test/CodeGen/Mips/mips64countleading.ll | 24
-rw-r--r-- test/CodeGen/Mips/mips64instrs.ll | 126
-rw-r--r-- test/CodeGen/Mips/mips64muldiv.ll | 57
-rw-r--r-- test/CodeGen/Mips/mno-ldc1-sdc1.ll | 274
-rw-r--r-- test/CodeGen/Mips/msa/special.ll | 4
-rw-r--r-- test/CodeGen/Mips/no-odd-spreg.ll | 54
-rw-r--r-- test/CodeGen/Mips/null-streamer.ll | 7
-rw-r--r-- test/CodeGen/Mips/prevent-hoisting.ll | 144
-rw-r--r-- test/CodeGen/Mips/select.ll | 800
-rw-r--r-- test/CodeGen/Mips/selectcc.ll | 14
-rw-r--r-- test/CodeGen/Mips/tls-alias.ll | 2
-rw-r--r-- test/CodeGen/Mips/zeroreg.ll | 98
-rw-r--r-- test/CodeGen/NVPTX/access-non-generic.ll | 8
-rw-r--r-- test/CodeGen/NVPTX/arg-lowering.ll | 13
-rw-r--r-- test/CodeGen/NVPTX/atomics.ll | 141
-rw-r--r-- test/CodeGen/NVPTX/bfe.ll | 32
-rw-r--r-- test/CodeGen/NVPTX/envreg.ll | 139
-rw-r--r-- test/CodeGen/NVPTX/gvar-init.ll | 5
-rw-r--r-- test/CodeGen/NVPTX/imad.ll | 9
-rw-r--r-- test/CodeGen/NVPTX/inline-asm.ll | 7
-rw-r--r-- test/CodeGen/NVPTX/isspacep.ll | 35
-rw-r--r-- test/CodeGen/NVPTX/ldu-i8.ll | 6
-rw-r--r-- test/CodeGen/NVPTX/ldu-ldg.ll | 40
-rw-r--r-- test/CodeGen/NVPTX/ldu-reg-plus-offset.ll | 6
-rw-r--r-- test/CodeGen/NVPTX/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/NVPTX/managed.ll | 11
-rw-r--r-- test/CodeGen/NVPTX/mulwide.ll | 37
-rw-r--r-- test/CodeGen/NVPTX/nvvm-reflect.ll | 16
-rw-r--r-- test/CodeGen/NVPTX/rotate.ll | 58
-rw-r--r-- test/CodeGen/NVPTX/shift-parts.ll | 38
-rw-r--r-- test/CodeGen/NVPTX/weak-global.ll | 9
-rw-r--r-- test/CodeGen/NVPTX/weak-linkage.ll | 12
-rw-r--r-- test/CodeGen/PowerPC/Atomics-32.ll | 48
-rw-r--r-- test/CodeGen/PowerPC/Frames-alloca.ll | 8
-rw-r--r-- test/CodeGen/PowerPC/Frames-large.ll | 16
-rw-r--r-- test/CodeGen/PowerPC/Frames-small.ll | 16
-rw-r--r-- test/CodeGen/PowerPC/atomic-1.ll | 3
-rw-r--r-- test/CodeGen/PowerPC/atomic-2.ll | 3
-rw-r--r-- test/CodeGen/PowerPC/early-ret2.ll | 2
-rw-r--r-- test/CodeGen/PowerPC/fast-isel-conversion-p5.ll | 23
-rw-r--r-- test/CodeGen/PowerPC/fast-isel-conversion.ll | 104
-rw-r--r-- test/CodeGen/PowerPC/func-addr.ll | 17
-rw-r--r-- test/CodeGen/PowerPC/hello-reloc.s | 18
-rw-r--r-- test/CodeGen/PowerPC/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/PowerPC/ppc64-altivec-abi.ll | 25
-rw-r--r-- test/CodeGen/PowerPC/ppc64-byval-align.ll | 56
-rw-r--r-- test/CodeGen/PowerPC/ppc64-calls.ll | 12
-rw-r--r-- test/CodeGen/PowerPC/ppc64-smallarg.ll | 59
-rw-r--r-- test/CodeGen/PowerPC/ppc64le-smallarg.ll | 59
-rw-r--r-- test/CodeGen/PowerPC/ppcf128-endian.ll | 154
-rw-r--r-- test/CodeGen/PowerPC/resolvefi-basereg.ll | 362
-rw-r--r-- test/CodeGen/PowerPC/svr4-redzone.ll | 2
-rw-r--r-- test/CodeGen/PowerPC/vec_cmp.ll | 8
-rw-r--r-- test/CodeGen/PowerPC/vec_misaligned.ll | 10
-rw-r--r-- test/CodeGen/PowerPC/vec_mul.ll | 17
-rw-r--r-- test/CodeGen/PowerPC/vec_shuffle_le.ll | 191
-rw-r--r-- test/CodeGen/PowerPC/vperm-instcombine.ll | 17
-rw-r--r-- test/CodeGen/PowerPC/vperm-lowering.ll | 66
-rw-r--r-- test/CodeGen/R600/add_i64.ll | 6
-rw-r--r-- test/CodeGen/R600/and.ll | 106
-rw-r--r-- test/CodeGen/R600/array-ptr-calc-i32.ll | 7
-rw-r--r-- test/CodeGen/R600/atomic_cmp_swap_local.ll | 37
-rw-r--r-- test/CodeGen/R600/atomic_load_add.ll | 45
-rw-r--r-- test/CodeGen/R600/atomic_load_sub.ll | 45
-rw-r--r-- test/CodeGen/R600/big_alu.ll | 12
-rw-r--r-- test/CodeGen/R600/bitcast.ll | 66
-rw-r--r-- test/CodeGen/R600/bswap.ll | 50
-rw-r--r-- test/CodeGen/R600/ctlz_zero_undef.ll | 57
-rw-r--r-- test/CodeGen/R600/ctpop.ll | 284
-rw-r--r-- test/CodeGen/R600/ctpop64.ll | 122
-rw-r--r-- test/CodeGen/R600/cttz_zero_undef.ll | 57
-rw-r--r-- test/CodeGen/R600/cvt_f32_ubyte.ll | 171
-rw-r--r-- test/CodeGen/R600/default-fp-mode.ll | 10
-rw-r--r-- test/CodeGen/R600/fceil.ll | 181
-rw-r--r-- test/CodeGen/R600/fceil64.ll | 103
-rw-r--r-- test/CodeGen/R600/fcopysign.f32.ll | 50
-rw-r--r-- test/CodeGen/R600/fcopysign.f64.ll | 37
-rw-r--r-- test/CodeGen/R600/ffloor.ll | 34
-rw-r--r-- test/CodeGen/R600/fma.ll | 72
-rw-r--r-- test/CodeGen/R600/fnearbyint.ll | 57
-rw-r--r-- test/CodeGen/R600/fp16_to_fp32.ll | 14
-rw-r--r-- test/CodeGen/R600/fp32_to_fp16.ll | 14
-rw-r--r-- test/CodeGen/R600/fp_to_sint_i64.ll | 12
-rw-r--r-- test/CodeGen/R600/fsub64.ll | 7
-rw-r--r-- test/CodeGen/R600/ftrunc.ll | 165
-rw-r--r-- test/CodeGen/R600/gv-const-addrspace.ll | 6
-rw-r--r-- test/CodeGen/R600/indirect-private-64.ll | 36
-rw-r--r-- test/CodeGen/R600/input-mods.ll | 26
-rw-r--r-- test/CodeGen/R600/large-alloca.ll | 14
-rw-r--r-- test/CodeGen/R600/large-constant-initializer.ll | 19
-rw-r--r-- test/CodeGen/R600/lds-output-queue.ll | 2
-rw-r--r-- test/CodeGen/R600/lds-size.ll | 2
-rw-r--r-- test/CodeGen/R600/lit.local.cfg | 3
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.abs.ll | 48
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.brev.ll | 27
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.clamp.ll | 28
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll | 42
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll | 27
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll | 27
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.div_scale.ll | 48
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.fract.ll | 27
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll | 13
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.rcp.ll | 58
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll | 11
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll | 14
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.rsq.ll | 13
-rw-r--r-- test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll | 29
-rw-r--r-- test/CodeGen/R600/llvm.SI.gather4.ll | 508
-rw-r--r--test/CodeGen/R600/llvm.SI.getlod.ll44
-rw-r--r--test/CodeGen/R600/llvm.exp2.ll93
-rw-r--r--test/CodeGen/R600/llvm.log2.ll79
-rw-r--r--test/CodeGen/R600/llvm.rint.f64.ll20
-rw-r--r--test/CodeGen/R600/llvm.rint.ll32
-rw-r--r--test/CodeGen/R600/load.ll3
-rw-r--r--test/CodeGen/R600/local-atomics.ll254
-rw-r--r--test/CodeGen/R600/local-atomics64.ll251
-rw-r--r--test/CodeGen/R600/local-memory-two-objects.ll4
-rw-r--r--test/CodeGen/R600/local-memory.ll2
-rw-r--r--test/CodeGen/R600/mul.ll42
-rw-r--r--test/CodeGen/R600/no-initializer-constant-addrspace.ll20
-rw-r--r--test/CodeGen/R600/or.ll6
-rw-r--r--test/CodeGen/R600/parallelandifcollapse.ll6
-rw-r--r--test/CodeGen/R600/parallelorifcollapse.ll5
-rw-r--r--test/CodeGen/R600/private-memory-atomics.ll31
-rw-r--r--test/CodeGen/R600/private-memory-broken.ll20
-rw-r--r--test/CodeGen/R600/private-memory.ll119
-rw-r--r--test/CodeGen/R600/pv.ll4
-rw-r--r--test/CodeGen/R600/reorder-stores.ll104
-rw-r--r--test/CodeGen/R600/rotl.i64.ll34
-rw-r--r--test/CodeGen/R600/rotl.ll54
-rw-r--r--test/CodeGen/R600/rotr.i64.ll58
-rw-r--r--test/CodeGen/R600/rotr.ll67
-rw-r--r--test/CodeGen/R600/rsq.ll26
-rw-r--r--test/CodeGen/R600/saddo.ll62
-rw-r--r--test/CodeGen/R600/scalar_to_vector.ll80
-rw-r--r--test/CodeGen/R600/sdiv.ll90
-rw-r--r--test/CodeGen/R600/setcc-equivalent.ll1
-rw-r--r--test/CodeGen/R600/sgpr-copy.ll4
-rw-r--r--test/CodeGen/R600/shl.ll117
-rw-r--r--test/CodeGen/R600/si-sgpr-spill.ll18
-rw-r--r--test/CodeGen/R600/sign_extend.ll63
-rw-r--r--test/CodeGen/R600/simplify-demanded-bits-build-pair.ll7
-rw-r--r--test/CodeGen/R600/sint_to_fp.ll22
-rw-r--r--test/CodeGen/R600/sint_to_fp64.ll32
-rw-r--r--test/CodeGen/R600/sra.ll130
-rw-r--r--test/CodeGen/R600/srem.ll50
-rw-r--r--test/CodeGen/R600/srl.ll126
-rw-r--r--test/CodeGen/R600/ssubo.ll64
-rw-r--r--test/CodeGen/R600/store.ll3
-rw-r--r--test/CodeGen/R600/sub.ll50
-rw-r--r--test/CodeGen/R600/uaddo.ll56
-rw-r--r--test/CodeGen/R600/udivrem.ll358
-rw-r--r--test/CodeGen/R600/uint_to_fp.f64.ll27
-rw-r--r--test/CodeGen/R600/uint_to_fp.ll76
-rw-r--r--test/CodeGen/R600/usubo.ll66
-rw-r--r--test/CodeGen/R600/vector-alloca.ll74
-rw-r--r--test/CodeGen/R600/xor.ll66
-rw-r--r--test/CodeGen/SPARC/atomics.ll6
-rw-r--r--test/CodeGen/SPARC/lit.local.cfg3
-rw-r--r--test/CodeGen/SystemZ/Large/lit.local.cfg3
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-01.ll6
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-02.ll6
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-03.ll36
-rw-r--r--test/CodeGen/SystemZ/cmpxchg-04.ll27
-rw-r--r--test/CodeGen/SystemZ/lit.local.cfg3
-rw-r--r--test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll18
-rw-r--r--test/CodeGen/Thumb/dyn-stackalloc.ll7
-rw-r--r--test/CodeGen/Thumb/fastcc.ll36
-rw-r--r--test/CodeGen/Thumb/lit.local.cfg3
-rw-r--r--test/CodeGen/Thumb/thumb-ldm.ll1
-rw-r--r--test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll1
-rw-r--r--test/CodeGen/Thumb2/2009-08-01-WrongLDRBOpc.ll2
-rw-r--r--test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll2
-rw-r--r--test/CodeGen/Thumb2/2009-09-28-ITBlockBug.ll2
-rw-r--r--test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll2
-rw-r--r--test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll2
-rw-r--r--test/CodeGen/Thumb2/2010-11-22-EpilogueBug.ll2
-rw-r--r--test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll2
-rw-r--r--test/CodeGen/Thumb2/buildvector-crash.ll2
-rw-r--r--test/CodeGen/Thumb2/cross-rc-coalescing-2.ll2
-rw-r--r--test/CodeGen/Thumb2/ldr-str-imm12.ll2
-rw-r--r--test/CodeGen/Thumb2/lit.local.cfg3
-rw-r--r--test/CodeGen/Thumb2/thumb2-branch.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-cbnz.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ifcvt2.ll6
-rw-r--r--test/CodeGen/Thumb2/thumb2-ifcvt3.ll6
-rw-r--r--test/CodeGen/Thumb2/thumb2-spill-q.ll2
-rw-r--r--test/CodeGen/Thumb2/tpsoft.ll54
-rw-r--r--test/CodeGen/Thumb2/v8_IT_3.ll8
-rw-r--r--test/CodeGen/Thumb2/v8_IT_5.ll4
-rw-r--r--test/CodeGen/X86/2007-05-05-Personality.ll6
-rw-r--r--test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll30
-rw-r--r--test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll2
-rw-r--r--test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll4
-rw-r--r--test/CodeGen/X86/2010-01-08-Atomic64Bug.ll4
-rw-r--r--test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll2
-rw-r--r--test/CodeGen/X86/2010-10-08-cmpxchg8b.ll3
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll2
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll18
-rw-r--r--test/CodeGen/X86/2014-05-29-factorial.ll24
-rw-r--r--test/CodeGen/X86/2014-05-30-CombineAddNSW.ll20
-rw-r--r--test/CodeGen/X86/Atomics-64.ll62
-rw-r--r--test/CodeGen/X86/GC/lit.local.cfg3
-rw-r--r--test/CodeGen/X86/aliases.ll33
-rw-r--r--test/CodeGen/X86/atom-fixup-lea4.ll23
-rw-r--r--test/CodeGen/X86/atomic-load-store-wide.ll2
-rw-r--r--test/CodeGen/X86/atomic-minmax-i6432.ll65
-rw-r--r--test/CodeGen/X86/atomic-ops-ancient-64.ll43
-rw-r--r--test/CodeGen/X86/atomic128.ll316
-rw-r--r--test/CodeGen/X86/atomic16.ll77
-rw-r--r--test/CodeGen/X86/atomic32.ll80
-rw-r--r--test/CodeGen/X86/atomic64.ll41
-rw-r--r--test/CodeGen/X86/atomic6432.ll92
-rw-r--r--test/CodeGen/X86/atomic8.ll79
-rw-r--r--test/CodeGen/X86/atomic_op.ll21
-rw-r--r--test/CodeGen/X86/avx-blend.ll44
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll62
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll24
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll8
-rw-r--r--test/CodeGen/X86/avx-splat.ll9
-rw-r--r--test/CodeGen/X86/avx-vperm2f128.ll2
-rw-r--r--test/CodeGen/X86/avx-vshufp.ll10
-rw-r--r--test/CodeGen/X86/avx2-shuffle.ll18
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll8
-rw-r--r--test/CodeGen/X86/avx512-inc-dec.ll13
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll68
-rw-r--r--test/CodeGen/X86/avx512-nontemporal.ll19
-rw-r--r--test/CodeGen/X86/avx512-shuffle.ll62
-rw-r--r--test/CodeGen/X86/bswap-vector.ll29
-rw-r--r--test/CodeGen/X86/cmp.ll13
-rw-r--r--test/CodeGen/X86/cmpxchg-i1.ll87
-rw-r--r--test/CodeGen/X86/cmpxchg-i128-i1.ll83
-rw-r--r--test/CodeGen/X86/coalescer-remat.ll3
-rw-r--r--test/CodeGen/X86/coff-comdat.ll92
-rw-r--r--test/CodeGen/X86/coff-comdat2.ll9
-rw-r--r--test/CodeGen/X86/coff-comdat3.ll8
-rw-r--r--test/CodeGen/X86/combine-64bit-vec-binop.ll273
-rw-r--r--test/CodeGen/X86/combine-or.ll8
-rw-r--r--test/CodeGen/X86/combine-vec-shuffle-2.ll164
-rw-r--r--test/CodeGen/X86/computeKnownBits_urem.ll14
-rw-r--r--test/CodeGen/X86/cvt16.ll64
-rw-r--r--test/CodeGen/X86/dagcombine-and-setcc.ll47
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll169
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll2
-rw-r--r--test/CodeGen/X86/elf-comdat.ll11
-rw-r--r--test/CodeGen/X86/elf-comdat2.ll12
-rw-r--r--test/CodeGen/X86/fast-isel-args-fail2.ll10
-rw-r--r--test/CodeGen/X86/fast-isel-args.ll24
-rw-r--r--test/CodeGen/X86/fast-isel-branch_weights.ll19
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch2.ll294
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch3.ll470
-rw-r--r--test/CodeGen/X86/fast-isel-cmp.ll689
-rw-r--r--test/CodeGen/X86/fast-isel-fold-mem.ll12
-rw-r--r--test/CodeGen/X86/fast-isel-select-cmov.ll62
-rw-r--r--test/CodeGen/X86/fast-isel-select-cmov2.ll255
-rw-r--r--test/CodeGen/X86/fast-isel-select-cmp.ll50
-rw-r--r--test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll138
-rw-r--r--test/CodeGen/X86/fast-isel-select-sse.ll391
-rw-r--r--test/CodeGen/X86/fast-isel-select.ll4
-rw-r--r--test/CodeGen/X86/fast-isel-sse12-fptoint.ll54
-rw-r--r--test/CodeGen/X86/float-asmprint.ll5
-rw-r--r--test/CodeGen/X86/frameaddr.ll44
-rw-r--r--test/CodeGen/X86/gcc_except_table.ll10
-rw-r--r--test/CodeGen/X86/haddsub-2.ll802
-rw-r--r--test/CodeGen/X86/haddsub-undef.ll325
-rw-r--r--test/CodeGen/X86/i8-umulo.ll24
-rw-r--r--test/CodeGen/X86/jump_table_alias.ll33
-rw-r--r--test/CodeGen/X86/jump_table_bitcast.ll46
-rw-r--r--test/CodeGen/X86/jump_tables.ll272
-rw-r--r--test/CodeGen/X86/libcall-sret.ll28
-rw-r--r--test/CodeGen/X86/lit.local.cfg3
-rw-r--r--test/CodeGen/X86/lower-bitcast.ll103
-rw-r--r--test/CodeGen/X86/macho-comdat.ll6
-rw-r--r--test/CodeGen/X86/null-streamer.ll18
-rw-r--r--test/CodeGen/X86/pr20020.ll73
-rw-r--r--test/CodeGen/X86/pr20088.ll9
-rw-r--r--test/CodeGen/X86/pr5145.ll16
-rw-r--r--test/CodeGen/X86/pshufd-combine-crash.ll14
-rw-r--r--test/CodeGen/X86/rdpmc.ll22
-rw-r--r--test/CodeGen/X86/shift-parts.ll4
-rw-r--r--test/CodeGen/X86/sqrt.ll26
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll27
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll318
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub.ll296
-rw-r--r--test/CodeGen/X86/sse41-blend.ll18
-rw-r--r--test/CodeGen/X86/sse41.ll22
-rw-r--r--test/CodeGen/X86/stackmap-fast-isel.ll165
-rw-r--r--test/CodeGen/X86/stackmap-liveness.ll153
-rw-r--r--test/CodeGen/X86/swizzle-2.ll515
-rw-r--r--test/CodeGen/X86/swizzle-avx2.ll91
-rw-r--r--test/CodeGen/X86/testb-je-fusion.ll20
-rw-r--r--test/CodeGen/X86/vec_cast2.ll27
-rw-r--r--test/CodeGen/X86/vec_splat.ll20
-rw-r--r--test/CodeGen/X86/vec_split.ll33
-rw-r--r--test/CodeGen/X86/vector-gep.ll2
-rw-r--r--test/CodeGen/X86/vector-idiv.ll42
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v16.ll196
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v2.ll219
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll170
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v8.ll493
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining.ll119
-rw-r--r--test/CodeGen/X86/vselect.ll14
-rw-r--r--test/CodeGen/X86/widen_cast-4.ll25
-rw-r--r--test/CodeGen/X86/widen_cast-6.ll6
-rw-r--r--test/CodeGen/X86/widen_conversions.ll18
-rw-r--r--test/CodeGen/X86/widen_shuffle-1.ll4
-rw-r--r--test/CodeGen/X86/win64_eh.ll170
-rw-r--r--test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll6
-rw-r--r--test/CodeGen/X86/x86-64-frameaddr.ll15
-rw-r--r--test/CodeGen/X86/x86-64-static-relo-movl.ll24
-rw-r--r--test/CodeGen/X86/x86-frameaddr.ll9
-rw-r--r--test/CodeGen/X86/x86-frameaddr2.ll9
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll41
-rw-r--r--test/CodeGen/X86/xaluo.ll743
-rw-r--r--test/CodeGen/XCore/dwarf_debug.ll39
-rw-r--r--test/CodeGen/XCore/lit.local.cfg3
-rw-r--r--test/DebugInfo/2009-11-05-DeadGlobalVariable.ll2
-rw-r--r--test/DebugInfo/2010-01-19-DbgScope.ll35
-rw-r--r--test/DebugInfo/AArch64/eh_frame.s4
-rw-r--r--test/DebugInfo/AArch64/eh_frame_personality.ll4
-rw-r--r--test/DebugInfo/AArch64/lit.local.cfg3
-rw-r--r--test/DebugInfo/AArch64/struct_by_value.ll4
-rw-r--r--test/DebugInfo/ARM/lit.local.cfg3
-rw-r--r--test/DebugInfo/COFF/lit.local.cfg3
-rw-r--r--test/DebugInfo/Inputs/arange-overlap.cc26
-rwxr-xr-xtest/DebugInfo/Inputs/arange-overlap.elf-x86_64bin0 -> 9824 bytes
-rw-r--r--test/DebugInfo/Inputs/fission-ranges.cc17
-rwxr-xr-xtest/DebugInfo/Inputs/fission-ranges.elf-x86_64bin0 -> 8693 bytes
-rw-r--r--test/DebugInfo/Mips/lit.local.cfg3
-rw-r--r--test/DebugInfo/PR20038.ll168
-rw-r--r--test/DebugInfo/PowerPC/lit.local.cfg3
-rw-r--r--test/DebugInfo/Sparc/lit.local.cfg3
-rw-r--r--test/DebugInfo/SystemZ/eh_frame.s53
-rw-r--r--test/DebugInfo/SystemZ/eh_frame_personality.s4
-rw-r--r--test/DebugInfo/SystemZ/lit.local.cfg3
-rw-r--r--test/DebugInfo/SystemZ/variable-loc.ll4
-rw-r--r--test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll10
-rw-r--r--test/DebugInfo/X86/DW_AT_location-reference.ll2
-rw-r--r--test/DebugInfo/X86/DW_AT_object_pointer.ll3
-rw-r--r--test/DebugInfo/X86/DW_AT_specification.ll5
-rw-r--r--test/DebugInfo/X86/arguments.ll6
-rw-r--r--test/DebugInfo/X86/block-capture.ll15
-rw-r--r--test/DebugInfo/X86/byvalstruct.ll3
-rw-r--r--test/DebugInfo/X86/coff_debug_info_type.ll2
-rw-r--r--test/DebugInfo/X86/concrete_out_of_line.ll4
-rw-r--r--test/DebugInfo/X86/dbg-const-int.ll10
-rw-r--r--test/DebugInfo/X86/dbg-value-const-byref.ll3
-rw-r--r--test/DebugInfo/X86/dbg-value-inlined-parameter.ll24
-rw-r--r--test/DebugInfo/X86/dbg-value-isel.ll2
-rw-r--r--test/DebugInfo/X86/dbg-value-location.ll2
-rw-r--r--test/DebugInfo/X86/dbg-value-terminator.ll58
-rw-r--r--test/DebugInfo/X86/dbg_value_direct.ll3
-rw-r--r--test/DebugInfo/X86/debug-info-block-captured-self.ll24
-rw-r--r--test/DebugInfo/X86/debug-info-blocks.ll18
-rw-r--r--test/DebugInfo/X86/debug-loc-asan.ll186
-rw-r--r--test/DebugInfo/X86/debug-loc-offset.ll134
-rw-r--r--test/DebugInfo/X86/dwarf-public-names.ll16
-rw-r--r--test/DebugInfo/X86/elf-names.ll4
-rw-r--r--test/DebugInfo/X86/empty-and-one-elem-array.ll10
-rw-r--r--test/DebugInfo/X86/fission-ranges.ll7
-rw-r--r--test/DebugInfo/X86/formal_parameter.ll3
-rw-r--r--test/DebugInfo/X86/generate-odr-hash.ll8
-rw-r--r--test/DebugInfo/X86/gnu-public-names.ll347
-rw-r--r--test/DebugInfo/X86/lit.local.cfg3
-rw-r--r--test/DebugInfo/X86/misched-dbg-value.ll41
-rw-r--r--test/DebugInfo/X86/op_deref.ll20
-rw-r--r--test/DebugInfo/X86/parameters.ll6
-rw-r--r--test/DebugInfo/X86/pr12831.ll10
-rw-r--r--test/DebugInfo/X86/pr19307.ll147
-rw-r--r--test/DebugInfo/X86/sret.ll4
-rw-r--r--test/DebugInfo/X86/subregisters.ll2
-rw-r--r--test/DebugInfo/cross-cu-inlining.ll3
-rw-r--r--test/DebugInfo/cross-cu-linkonce-distinct.ll95
-rw-r--r--test/DebugInfo/dead-argument-order.ll81
-rw-r--r--test/DebugInfo/dwarf-public-names.ll16
-rw-r--r--test/DebugInfo/global.ll3
-rw-r--r--test/DebugInfo/incorrect-variable-debugloc.ll391
-rw-r--r--test/DebugInfo/inline-no-debug-info.ll69
-rw-r--r--test/DebugInfo/inlined-arguments.ll6
-rw-r--r--test/DebugInfo/llvm-symbolizer.test8
-rw-r--r--test/DebugInfo/missing-abstract-variable.ll191
-rw-r--r--test/DebugInfo/nodebug.ll51
-rw-r--r--test/ExecutionEngine/MCJIT/eh-lg-pic.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/lit.local.cfg2
-rw-r--r--test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s32
-rw-r--r--test/ExecutionEngine/RuntimeDyld/X86/lit.local.cfg3
-rw-r--r--test/ExecutionEngine/lit.local.cfg5
-rw-r--r--test/Feature/alias2.ll25
-rw-r--r--test/Feature/aliases.ll11
-rw-r--r--test/Feature/comdat.ll18
-rw-r--r--test/Feature/globalvars.ll2
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_attr.ll8
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_mov.ll111
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_mov.s28
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s4
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s30
-rw-r--r--test/Instrumentation/AddressSanitizer/X86/lit.local.cfg3
-rw-r--r--test/Instrumentation/AddressSanitizer/coverage-dbg.ll68
-rw-r--r--test/Instrumentation/AddressSanitizer/coverage.ll16
-rw-r--r--test/Instrumentation/AddressSanitizer/debug_info.ll7
-rw-r--r--test/Instrumentation/AddressSanitizer/do-not-instrument-llvm-metadata.ll2
-rw-r--r--test/Instrumentation/AddressSanitizer/global_metadata.ll63
-rw-r--r--test/Instrumentation/AddressSanitizer/instrument_global.ll4
-rw-r--r--test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll10
-rw-r--r--test/Instrumentation/AddressSanitizer/lifetime.ll2
-rw-r--r--test/Instrumentation/AddressSanitizer/stack-poisoning.ll2
-rw-r--r--test/Instrumentation/BoundsChecking/phi.ll2
-rw-r--r--test/Instrumentation/DataFlowSanitizer/prefix-rename.ll8
-rw-r--r--test/Instrumentation/MemorySanitizer/atomics.ll10
-rw-r--r--test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll6
-rw-r--r--test/Instrumentation/MemorySanitizer/missing_origin.ll19
-rw-r--r--test/Instrumentation/MemorySanitizer/msan_basic.ll45
-rw-r--r--test/Instrumentation/MemorySanitizer/mul_by_constant.ll94
-rw-r--r--test/Instrumentation/MemorySanitizer/store-origin.ll2
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_arith.ll65
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_pack.ll60
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_shift.ll10
-rw-r--r--test/Instrumentation/ThreadSanitizer/tsan_basic.ll6
-rw-r--r--test/LTO/jump-table-type.ll23
-rw-r--r--test/LTO/lit.local.cfg3
-rw-r--r--test/LTO/no-undefined-puts-when-implemented.ll2
-rw-r--r--test/Linker/Inputs/PR8300.b.ll2
-rw-r--r--test/Linker/Inputs/alias.ll2
-rw-r--r--test/Linker/Inputs/comdat.ll20
-rw-r--r--test/Linker/Inputs/comdat2.ll2
-rw-r--r--test/Linker/Inputs/comdat3.ll2
-rw-r--r--test/Linker/Inputs/comdat4.ll5
-rw-r--r--test/Linker/Inputs/comdat5.ll15
-rw-r--r--test/Linker/Inputs/cycle.ll2
-rw-r--r--test/Linker/alias.ll6
-rw-r--r--test/Linker/comdat.ll32
-rw-r--r--test/Linker/comdat2.ll7
-rw-r--r--test/Linker/comdat3.ll5
-rw-r--r--test/Linker/comdat4.ll5
-rw-r--r--test/Linker/comdat5.ll7
-rw-r--r--test/Linker/comdat6.ll13
-rw-r--r--test/Linker/comdat7.ll9
-rw-r--r--test/Linker/comdat8.ll10
-rw-r--r--test/Linker/cycle.ll7
-rw-r--r--test/Linker/unnamed-addr1-a.ll11
-rw-r--r--test/Linker/unnamed-addr1-b.ll6
-rw-r--r--test/MC/AArch64/alias-logicalimm.s41
-rw-r--r--test/MC/AArch64/arm64-leaf-compact-unwind.s8
-rw-r--r--test/MC/AArch64/arm64-system-encoding.s2
-rw-r--r--test/MC/AArch64/basic-a64-diagnostics.s51
-rw-r--r--test/MC/AArch64/basic-a64-instructions.s19
-rw-r--r--test/MC/AArch64/dot-req-case-insensitive.s18
-rw-r--r--test/MC/AArch64/dot-req-diagnostics.s37
-rw-r--r--test/MC/AArch64/dot-req.s37
-rw-r--r--test/MC/AArch64/ldr-pseudo-obj-errors.s13
-rw-r--r--test/MC/AArch64/ldr-pseudo.s231
-rw-r--r--test/MC/AArch64/lit.local.cfg3
-rw-r--r--test/MC/ARM/AlignedBundling/lit.local.cfg3
-rw-r--r--test/MC/ARM/Windows/multiple-text-sections.s58
-rw-r--r--test/MC/ARM/Windows/text-attributes.s30
-rw-r--r--test/MC/ARM/diagnostics.s18
-rw-r--r--test/MC/ARM/dwarf-asm-multiple-sections.s79
-rw-r--r--test/MC/ARM/dwarf-asm-no-code.s27
-rw-r--r--test/MC/ARM/dwarf-asm-nonstandard-section.s57
-rw-r--r--test/MC/ARM/dwarf-asm-single-section.s56
-rw-r--r--test/MC/ARM/gas-compl-copr-reg.s14
-rw-r--r--test/MC/ARM/lit.local.cfg3
-rw-r--r--test/MC/ARM/macho-relocs-with-addend.s34
-rw-r--r--test/MC/ARM/thumb-types.s26
-rw-r--r--test/MC/AsmParser/cfi-invalid-startproc.s16
-rw-r--r--test/MC/AsmParser/conditional_asm.s60
-rw-r--r--test/MC/AsmParser/directive_file.s1
-rw-r--r--test/MC/AsmParser/directive_line.s1
-rw-r--r--test/MC/AsmParser/directive_loc.s1
-rw-r--r--test/MC/AsmParser/directive_seh.s33
-rw-r--r--test/MC/AsmParser/if-diagnostics.s29
-rw-r--r--test/MC/AsmParser/lit.local.cfg3
-rw-r--r--test/MC/AsmParser/vararg.s10
-rw-r--r--test/MC/COFF/alias.s7
-rw-r--r--test/MC/COFF/basic-coff-64.s4
-rw-r--r--test/MC/COFF/basic-coff.s4
-rw-r--r--test/MC/COFF/early-dce.s16
-rw-r--r--test/MC/COFF/global_ctors_dtors.ll18
-rwxr-xr-xtest/MC/COFF/linker-options.ll2
-rw-r--r--test/MC/COFF/linkonce-invalid.s14
-rw-r--r--test/MC/COFF/linkonce.s57
-rw-r--r--test/MC/COFF/lit.local.cfg3
-rwxr-xr-xtest/MC/COFF/lset0.s2
-rw-r--r--test/MC/COFF/section-comdat-conflict.s13
-rw-r--r--test/MC/COFF/section-comdat-conflict2.s6
-rw-r--r--test/MC/COFF/section-comdat.s98
-rw-r--r--test/MC/COFF/section-name-encoding.s10
-rw-r--r--test/MC/COFF/seh-stackalloc-zero.s11
-rw-r--r--test/MC/COFF/seh.s4
-rw-r--r--test/MC/COFF/symbol-fragment-offset-64.s4
-rw-r--r--test/MC/COFF/symbol-fragment-offset.s4
-rw-r--r--test/MC/Disassembler/AArch64/basic-a64-instructions.txt24
-rw-r--r--test/MC/Disassembler/AArch64/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/ARM/hex-immediates.txt8
-rw-r--r--test/MC/Disassembler/ARM/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/Mips/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/Mips/mips32r6.txt51
-rw-r--r--test/MC/Disassembler/Mips/mips64r6.txt64
-rw-r--r--test/MC/Disassembler/PowerPC/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/Sparc/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/SystemZ/insns.txt54
-rw-r--r--test/MC/Disassembler/SystemZ/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/X86/avx-512.txt6
-rw-r--r--test/MC/Disassembler/X86/hex-immediates.txt2
-rw-r--r--test/MC/Disassembler/X86/lit.local.cfg3
-rw-r--r--test/MC/Disassembler/X86/moffs.txt6
-rw-r--r--test/MC/Disassembler/XCore/lit.local.cfg3
-rw-r--r--test/MC/ELF/ARM/bss-non-zero-value.s9
-rw-r--r--test/MC/ELF/ARM/gnu-type-hash-diagnostics.s9
-rw-r--r--test/MC/ELF/ARM/gnu-type-hash.s16
-rw-r--r--test/MC/ELF/ARM/lit.local.cfg3
-rw-r--r--test/MC/ELF/cfi-adjust-cfa-offset.s2
-rw-r--r--test/MC/ELF/cfi-advance-loc2.s2
-rw-r--r--test/MC/ELF/cfi-def-cfa-offset.s2
-rw-r--r--test/MC/ELF/cfi-def-cfa-register.s2
-rw-r--r--test/MC/ELF/cfi-def-cfa.s2
-rw-r--r--test/MC/ELF/cfi-escape.s2
-rw-r--r--test/MC/ELF/cfi-offset.s2
-rw-r--r--test/MC/ELF/cfi-register.s2
-rw-r--r--test/MC/ELF/cfi-rel-offset.s2
-rw-r--r--test/MC/ELF/cfi-rel-offset2.s2
-rw-r--r--test/MC/ELF/cfi-remember.s2
-rw-r--r--test/MC/ELF/cfi-restore.s2
-rw-r--r--test/MC/ELF/cfi-same-value.s2
-rw-r--r--test/MC/ELF/cfi-sections.s4
-rw-r--r--test/MC/ELF/cfi-signal-frame.s4
-rw-r--r--test/MC/ELF/cfi-undefined.s2
-rw-r--r--test/MC/ELF/cfi-version.ll45
-rw-r--r--test/MC/ELF/cfi-window-save.s2
-rw-r--r--test/MC/ELF/cfi-zero-addr-delta.s2
-rw-r--r--test/MC/ELF/cfi.s70
-rw-r--r--test/MC/ELF/gnu-type-diagnostics.s18
-rw-r--r--test/MC/ELF/gnu-type.s38
-rw-r--r--test/MC/ELF/lit.local.cfg3
-rw-r--r--test/MC/ELF/no-reloc.s19
-rw-r--r--test/MC/ELF/pr19430.s14
-rw-r--r--test/MC/MachO/AArch64/lit.local.cfg3
-rw-r--r--test/MC/MachO/ARM/aliased-symbols.s115
-rw-r--r--test/MC/MachO/ARM/lit.local.cfg3
-rw-r--r--test/MC/MachO/eh-frame-reloc.s7
-rw-r--r--test/MC/MachO/eh-symbols.s25
-rw-r--r--test/MC/MachO/eh_symbol.s2
-rw-r--r--test/MC/MachO/lit.local.cfg3
-rw-r--r--test/MC/MachO/pr19185.s6
-rw-r--r--test/MC/MachO/variable-exprs.s8
-rw-r--r--test/MC/Mips/cpsetup-bad.s14
-rw-r--r--test/MC/Mips/eh-frame.s8
-rw-r--r--test/MC/Mips/elf_eflags.s36
-rw-r--r--test/MC/Mips/lit.local.cfg3
-rw-r--r--test/MC/Mips/mips-abi-bad.s20
-rw-r--r--test/MC/Mips/mips-data-directives.s4
-rw-r--r--test/MC/Mips/mips-expansions-bad.s6
-rw-r--r--test/MC/Mips/mips-expansions.s13
-rw-r--r--test/MC/Mips/mips-noat.s11
-rw-r--r--test/MC/Mips/mips1/invalid-mips2.s1
-rw-r--r--test/MC/Mips/mips1/invalid-mips32.s10
-rw-r--r--test/MC/Mips/mips1/invalid-mips4.s19
-rw-r--r--test/MC/Mips/mips1/invalid-mips5.s19
-rw-r--r--test/MC/Mips/mips1/valid.s15
-rw-r--r--test/MC/Mips/mips2/invalid-mips3-wrong-error.s1
-rw-r--r--test/MC/Mips/mips2/invalid-mips3.s1
-rw-r--r--test/MC/Mips/mips2/invalid-mips32.s26
-rw-r--r--test/MC/Mips/mips2/invalid-mips32r2.s21
-rw-r--r--test/MC/Mips/mips2/invalid-mips4.s19
-rw-r--r--test/MC/Mips/mips2/invalid-mips5.s19
-rw-r--r--test/MC/Mips/mips2/valid.s24
-rw-r--r--test/MC/Mips/mips3/invalid-mips32.s10
-rw-r--r--test/MC/Mips/mips3/invalid-mips4.s23
-rw-r--r--test/MC/Mips/mips3/invalid-mips5.s19
-rw-r--r--test/MC/Mips/mips3/valid.s37
-rw-r--r--test/MC/Mips/mips32/abiflags.s37
-rw-r--r--test/MC/Mips/mips32/valid.s35
-rw-r--r--test/MC/Mips/mips32r2/abiflags.s38
-rw-r--r--test/MC/Mips/mips32r2/invalid.s10
-rw-r--r--test/MC/Mips/mips32r2/valid.s38
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s2
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips1.s16
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips2.s12
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s4
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips32.s25
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips32r2.s15
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s21
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips4.s11
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s11
-rw-r--r--test/MC/Mips/mips32r6/invalid-mips5.s9
-rw-r--r--test/MC/Mips/mips32r6/invalid.s14
-rw-r--r--test/MC/Mips/mips32r6/relocations.s25
-rw-r--r--test/MC/Mips/mips32r6/valid.s72
-rw-r--r--test/MC/Mips/mips4/invalid-mips32.s10
-rw-r--r--test/MC/Mips/mips4/valid.s42
-rw-r--r--test/MC/Mips/mips5/invalid-mips32.s10
-rw-r--r--test/MC/Mips/mips5/invalid-mips64.s3
-rw-r--r--test/MC/Mips/mips5/valid.s42
-rw-r--r--test/MC/Mips/mips64-expansions.s209
-rw-r--r--test/MC/Mips/mips64/abiflags.s37
-rw-r--r--test/MC/Mips/mips64/valid.s53
-rw-r--r--test/MC/Mips/mips64r2/abi-bad.s9
-rw-r--r--test/MC/Mips/mips64r2/abiflags.s37
-rw-r--r--test/MC/Mips/mips64r2/invalid.s10
-rw-r--r--test/MC/Mips/mips64r2/valid.s56
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s2
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips1.s19
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips2.s15
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips3.s19
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s20
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s21
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips4.s14
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s6
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips5.s12
-rw-r--r--test/MC/Mips/mips64r6/invalid-mips64.s54
-rw-r--r--test/MC/Mips/mips64r6/invalid.s12
-rw-r--r--test/MC/Mips/mips64r6/relocations.s33
-rw-r--r--test/MC/Mips/mips64r6/valid.s86
-rw-r--r--test/MC/Mips/mips_abi_flags_xx.s37
-rw-r--r--test/MC/Mips/mips_abi_flags_xx_set.s38
-rw-r--r--test/MC/Mips/msa/abiflags.s38
-rw-r--r--test/MC/Mips/nacl-mask.s34
-rw-r--r--test/MC/Mips/nooddspreg-cmdarg.s43
-rw-r--r--test/MC/Mips/nooddspreg-error.s14
-rw-r--r--test/MC/Mips/nooddspreg.s45
-rw-r--r--test/MC/Mips/oddspreg.s56
-rw-r--r--test/MC/PowerPC/lit.local.cfg3
-rw-r--r--test/MC/PowerPC/ppc64-initial-cfa.s8
-rw-r--r--test/MC/Sparc/lit.local.cfg3
-rw-r--r--test/MC/SystemZ/insn-bad-z196.s54
-rw-r--r--test/MC/SystemZ/insn-bad.s15
-rw-r--r--test/MC/SystemZ/insn-good-z196.s42
-rw-r--r--test/MC/SystemZ/lit.local.cfg3
-rw-r--r--test/MC/X86/AlignedBundling/lit.local.cfg3
-rw-r--r--test/MC/X86/avx512-encodings.s70
-rw-r--r--test/MC/X86/intel-syntax.s8
-rw-r--r--test/MC/X86/lit.local.cfg3
-rw-r--r--test/MC/X86/no-elf-compact-unwind.s1
-rw-r--r--test/MC/X86/x86_long_nop.s10
-rw-r--r--test/MC/X86/x86_nop.s1
-rw-r--r--test/Makefile9
-rw-r--r--test/Object/ARM/lit.local.cfg3
-rw-r--r--test/Object/Inputs/corrupt-archive.abin0 -> 2698 bytes
-rw-r--r--test/Object/Inputs/darwin-m-test1.mach0-armv7bin0 -> 432 bytes
-rw-r--r--test/Object/Inputs/darwin-m-test2.macho-i386bin0 -> 88 bytes
-rwxr-xr-xtest/Object/Inputs/darwin-m-test3.macho-x86-64bin0 -> 9216 bytes
-rwxr-xr-xtest/Object/Inputs/hello-world.macho-x86_64bin0 -> 8496 bytes
-rw-r--r--test/Object/Inputs/macho-archive-x86_64.abin0 -> 1304 bytes
-rwxr-xr-xtest/Object/Inputs/no-sections.elf-x86-64bin0 -> 2912 bytes
-rw-r--r--test/Object/Inputs/program-headers.mips64bin0 -> 790 bytes
-rw-r--r--test/Object/Inputs/trivial.ll4
-rw-r--r--test/Object/Mips/lit.local.cfg3
-rw-r--r--test/Object/X86/archive-ir-asm.ll20
-rw-r--r--test/Object/X86/lit.local.cfg3
-rw-r--r--test/Object/X86/nm-ir.ll (renamed from test/Object/nm-ir.ll)11
-rw-r--r--test/Object/archive-long-index.test2
-rw-r--r--test/Object/archive-symtab.test26
-rw-r--r--test/Object/coff-archive-short.test2
-rw-r--r--test/Object/coff-archive.test2
-rw-r--r--test/Object/directory.ll2
-rw-r--r--test/Object/mangle-ir.ll6
-rw-r--r--test/Object/nm-archive.test9
-rw-r--r--test/Object/nm-darwin-m.test53
-rw-r--r--test/Object/nm-trivial-object.test42
-rw-r--r--test/Object/nm-universal-binary.test26
-rw-r--r--test/Object/obj2yaml.test6
-rw-r--r--test/Object/objdump-no-sectionheaders.test6
-rw-r--r--test/Object/simple-archive.test2
-rw-r--r--test/Object/size-trivial-macho.test78
-rw-r--r--test/Object/yaml2obj-coff-multi-doc.test91
-rw-r--r--test/Object/yaml2obj-elf-multi-doc.test56
-rw-r--r--test/Object/yaml2obj-elf-symbol-visibility.yaml126
-rw-r--r--test/Other/X86/lit.local.cfg3
-rw-r--r--test/Other/constant-fold-gep.ll19
-rw-r--r--test/Other/llvm-nm-without-aliases.ll4
-rw-r--r--test/TableGen/ForeachLoop.td26
-rw-r--r--test/TableGen/if-empty-list-arg.td7
-rw-r--r--test/Transforms/ArgumentPromotion/basictest.ll32
-rw-r--r--test/Transforms/ArgumentPromotion/byval-2.ll37
-rw-r--r--test/Transforms/ArgumentPromotion/byval.ll33
-rw-r--r--test/Transforms/ArgumentPromotion/dbg.ll22
-rw-r--r--test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll68
-rw-r--r--test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll70
-rw-r--r--test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll97
-rw-r--r--test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg3
-rw-r--r--test/Transforms/BBVectorize/lit.local.cfg3
-rw-r--r--test/Transforms/CodeGenPrepare/X86/lit.local.cfg3
-rw-r--r--test/Transforms/ConstantHoisting/AArch64/lit.local.cfg3
-rw-r--r--test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg3
-rw-r--r--test/Transforms/ConstantHoisting/X86/large-immediate.ll9
-rw-r--r--test/Transforms/ConstantHoisting/X86/lit.local.cfg3
-rw-r--r--test/Transforms/ConstantHoisting/X86/stackmap.ll6
-rw-r--r--test/Transforms/FunctionAttrs/nocapture.ll2
-rw-r--r--test/Transforms/FunctionAttrs/readattrs.ll14
-rw-r--r--test/Transforms/GCOVProfiling/global-ctor.ll58
-rw-r--r--test/Transforms/GCOVProfiling/linezero.ll143
-rw-r--r--test/Transforms/GVN/calloc-load-removal.ll25
-rw-r--r--test/Transforms/GVN/invariant-load.ll31
-rw-r--r--test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll8
-rw-r--r--test/Transforms/GlobalMerge/AArch64/lit.local.cfg4
-rw-r--r--test/Transforms/GlobalMerge/ARM/lit.local.cfg4
-rw-r--r--test/Transforms/GlobalOpt/2009-02-15-BitcastAlias.ll2
-rw-r--r--test/Transforms/GlobalOpt/2009-03-06-Anonymous.ll18
-rw-r--r--test/Transforms/GlobalOpt/alias-resolve.ll8
-rw-r--r--test/Transforms/GlobalOpt/constantfold-initializers.ll38
-rw-r--r--test/Transforms/IndVarSimplify/2014-06-21-congruent-constant.ll57
-rw-r--r--test/Transforms/Inline/blockaddress.ll5
-rw-r--r--test/Transforms/Inline/debug-invoke.ll37
-rw-r--r--test/Transforms/Inline/null-function.ll9
-rw-r--r--test/Transforms/InstCombine/2010-03-03-ExtElim.ll4
-rw-r--r--test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll2
-rw-r--r--test/Transforms/InstCombine/AddOverFlow.ll118
-rw-r--r--test/Transforms/InstCombine/abs_abs.ll961
-rw-r--r--test/Transforms/InstCombine/add-shrink.ll10
-rw-r--r--test/Transforms/InstCombine/add-sitofp.ll2
-rw-r--r--test/Transforms/InstCombine/add2.ll237
-rw-r--r--test/Transforms/InstCombine/addrspacecast.ll80
-rw-r--r--test/Transforms/InstCombine/align-2d-gep.ll2
-rw-r--r--test/Transforms/InstCombine/bitcast-alias-function.ll24
-rw-r--r--test/Transforms/InstCombine/cast.ll2
-rw-r--r--test/Transforms/InstCombine/constant-fold-address-space-pointer.ll10
-rw-r--r--test/Transforms/InstCombine/descale-zero.ll21
-rw-r--r--test/Transforms/InstCombine/distribute.ll (renamed from test/Transforms/InstSimplify/2010-12-20-Distribute.ll)14
-rw-r--r--test/Transforms/InstCombine/ffs-1.ll6
-rw-r--r--test/Transforms/InstCombine/gepphigep.ll56
-rw-r--r--test/Transforms/InstCombine/getelementptr.ll17
-rw-r--r--test/Transforms/InstCombine/icmp.ll63
-rw-r--r--test/Transforms/InstCombine/intrinsics.ll13
-rw-r--r--test/Transforms/InstCombine/memcpy-from-global.ll3
-rw-r--r--test/Transforms/InstCombine/overflow-mul.ll11
-rw-r--r--test/Transforms/InstCombine/pr20059.ll16
-rw-r--r--test/Transforms/InstCombine/pr20079.ll9
-rw-r--r--test/Transforms/InstCombine/r600-intrinsics.ll47
-rw-r--r--test/Transforms/InstCombine/rem.ll2
-rw-r--r--test/Transforms/InstCombine/select.ll115
-rw-r--r--test/Transforms/InstCombine/sext.ll4
-rw-r--r--test/Transforms/InstCombine/sub.ll20
-rw-r--r--test/Transforms/InstCombine/vec_shuffle.ll9
-rw-r--r--test/Transforms/InstCombine/zext-bool-add-sub.ll2
-rw-r--r--test/Transforms/InstSimplify/apint-or.ll37
-rw-r--r--test/Transforms/InstSimplify/compare.ll30
-rw-r--r--test/Transforms/JumpThreading/pr15851_hang.ll22
-rw-r--r--test/Transforms/JumpThreading/select.ll2
-rw-r--r--test/Transforms/LICM/extra-copies.ll29
-rw-r--r--test/Transforms/LICM/hoist-bitcast-load.ll201
-rw-r--r--test/Transforms/LoadCombine/load-combine.ll190
-rw-r--r--test/Transforms/LoopIdiom/X86/lit.local.cfg3
-rw-r--r--test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg3
-rw-r--r--test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll2
-rw-r--r--test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg3
-rw-r--r--test/Transforms/LoopStrengthReduce/X86/lit.local.cfg3
-rw-r--r--test/Transforms/LoopUnroll/PowerPC/lit.local.cfg3
-rw-r--r--test/Transforms/LoopUnroll/X86/lit.local.cfg3
-rw-r--r--test/Transforms/LoopUnroll/pr18861.ll43
-rw-r--r--test/Transforms/LoopUnroll/runtime-loop.ll6
-rw-r--r--test/Transforms/LoopUnroll/unroll-pragmas.ll285
-rw-r--r--test/Transforms/LoopVectorize/AArch64/lit.local.cfg3
-rw-r--r--test/Transforms/LoopVectorize/ARM/lit.local.cfg3
-rw-r--r--test/Transforms/LoopVectorize/PowerPC/lit.local.cfg3
-rw-r--r--test/Transforms/LoopVectorize/X86/already-vectorized.ll4
-rw-r--r--test/Transforms/LoopVectorize/X86/avx512.ll35
-rw-r--r--test/Transforms/LoopVectorize/X86/lit.local.cfg3
-rw-r--r--test/Transforms/LoopVectorize/X86/metadata-enable.ll9
-rw-r--r--test/Transforms/LoopVectorize/X86/vect.omp.force.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll160
-rw-r--r--test/Transforms/LoopVectorize/X86/vectorization-remarks.ll17
-rw-r--r--test/Transforms/LoopVectorize/XCore/lit.local.cfg3
-rw-r--r--test/Transforms/LoopVectorize/control-flow.ll78
-rw-r--r--test/Transforms/LoopVectorize/if-conversion.ll2
-rw-r--r--test/Transforms/LoopVectorize/induction.ll61
-rw-r--r--test/Transforms/LoopVectorize/intrinsic.ll102
-rw-r--r--test/Transforms/LoopVectorize/metadata-unroll.ll2
-rw-r--r--test/Transforms/LoopVectorize/metadata-width.ll2
-rw-r--r--test/Transforms/LoopVectorize/no_switch.ll85
-rw-r--r--test/Transforms/LoopVectorize/runtime-check-readonly.ll1
-rw-r--r--test/Transforms/LoopVectorize/vect.omp.persistence.ll8
-rw-r--r--test/Transforms/LoopVectorize/vectorize-once.ll6
-rw-r--r--test/Transforms/LowerAtomic/atomic-swap.ll17
-rw-r--r--test/Transforms/LowerSwitch/2014-06-10-SwitchContiguousOpt.ll27
-rw-r--r--test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll41
-rw-r--r--test/Transforms/LowerSwitch/feature.ll114
-rw-r--r--test/Transforms/MergeFunc/functions.ll27
-rw-r--r--test/Transforms/MergeFunc/ranges.ll43
-rw-r--r--test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll15
-rw-r--r--test/Transforms/Reassociate/2002-05-15-MissedTree.ll14
-rw-r--r--test/Transforms/Reassociate/2002-05-15-SubReassociate.ll34
-rw-r--r--test/Transforms/Reassociate/2002-05-15-SubReassociate2.ll13
-rw-r--r--test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll43
-rw-r--r--test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll16
-rw-r--r--test/Transforms/Reassociate/basictest.ll201
-rw-r--r--test/Transforms/Reassociate/fp-commute.ll21
-rw-r--r--test/Transforms/Reassociate/inverses.ll12
-rw-r--r--test/Transforms/Reassociate/looptest.ll1
-rw-r--r--test/Transforms/Reassociate/mightymul.ll4
-rw-r--r--test/Transforms/Reassociate/multistep.ll1
-rw-r--r--test/Transforms/Reassociate/negation.ll36
-rw-r--r--test/Transforms/Reassociate/otherops.ll42
-rw-r--r--test/Transforms/Reassociate/shift-factor.ll20
-rw-r--r--test/Transforms/Reassociate/subtest.ll31
-rw-r--r--test/Transforms/Reassociate/subtest2.ll13
-rw-r--r--test/Transforms/SCCP/atomic.ll9
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/lit.local.cfg3
-rw-r--r--test/Transforms/SLPVectorizer/ARM/lit.local.cfg3
-rw-r--r--test/Transforms/SLPVectorizer/R600/lit.local.cfg3
-rw-r--r--test/Transforms/SLPVectorizer/X86/addsub.ll181
-rw-r--r--test/Transforms/SLPVectorizer/X86/gep.ll41
-rw-r--r--test/Transforms/SLPVectorizer/X86/intrinsic.ll267
-rw-r--r--test/Transforms/SLPVectorizer/X86/lit.local.cfg3
-rw-r--r--test/Transforms/SLPVectorizer/X86/pr19657.ll73
-rw-r--r--test/Transforms/SLPVectorizer/XCore/lit.local.cfg3
-rw-r--r--test/Transforms/SROA/slice-order-independence.ll37
-rw-r--r--test/Transforms/SROA/slice-width.ll25
-rw-r--r--test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg3
-rw-r--r--test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll92
-rw-r--r--test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll220
-rw-r--r--test/Transforms/SimplifyCFG/PR17073.ll73
-rw-r--r--test/Transforms/SimplifyCFG/SPARC/lit.local.cfg3
-rw-r--r--test/Transforms/SimplifyCFG/X86/lit.local.cfg3
-rw-r--r--test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll55
-rw-r--r--test/Transforms/SimplifyCFG/speculate-vector-ops.ll60
-rw-r--r--test/Transforms/TailDup/X86/lit.local.cfg3
-rw-r--r--test/Transforms/TailDup/lit.local.cfg3
-rw-r--r--test/Unit/lit.cfg5
-rw-r--r--test/Verifier/alias.ll15
-rw-r--r--test/Verifier/bitcast-alias-address-space.ll10
-rw-r--r--test/Verifier/comdat.ll5
-rw-r--r--test/Verifier/comdat2.ll5
-rw-r--r--test/Verifier/jumptable.ll9
-rw-r--r--test/Verifier/range-1.ll2
-rw-r--r--test/Verifier/range-2.ll30
-rw-r--r--test/lit.cfg9
-rw-r--r--test/tools/llvm-cov/copy_block_helper.m2
-rw-r--r--test/tools/llvm-cov/llvm-cov.test8
-rw-r--r--test/tools/llvm-cov/range_based_for.cpp2
-rw-r--r--test/tools/llvm-objdump/lit.local.cfg3
-rw-r--r--test/tools/llvm-readobj/ARM/lit.local.cfg3
-rwxr-xr-xtest/tools/llvm-readobj/Inputs/got-empty.exe.mipselbin0 -> 9400 bytes
-rwxr-xr-xtest/tools/llvm-readobj/Inputs/got-tls.so.elf-mips64elbin0 -> 7398 bytes
-rw-r--r--test/tools/llvm-readobj/mips-got.test331
-rw-r--r--test/tools/llvm-readobj/program-headers.test28
-rw-r--r--test/tools/llvm-readobj/relocations.test52
-rw-r--r--test/tools/llvm-readobj/sections-ext.test52
-rw-r--r--tools/CMakeLists.txt2
-rw-r--r--tools/Makefile6
-rw-r--r--tools/bugpoint/ExecutionDriver.cpp6
-rw-r--r--tools/bugpoint/ExtractFunction.cpp2
-rw-r--r--tools/bugpoint/Miscompilation.cpp8
-rw-r--r--tools/bugpoint/OptimizerDriver.cpp2
-rw-r--r--tools/bugpoint/ToolRunner.cpp8
-rw-r--r--tools/gold/CMakeLists.txt7
-rw-r--r--tools/gold/Makefile4
-rw-r--r--tools/gold/gold-plugin.cpp206
-rw-r--r--tools/lli/RemoteMemoryManager.cpp2
-rw-r--r--tools/lli/lli.cpp20
-rw-r--r--tools/llvm-ar/Android.mk48
-rw-r--r--tools/llvm-ar/CMakeLists.txt1
-rw-r--r--tools/llvm-ar/Makefile2
-rw-r--r--tools/llvm-ar/llvm-ar.cpp87
-rw-r--r--tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp12
-rw-r--r--tools/llvm-c-test/Android.mk1
-rw-r--r--tools/llvm-config/Android.mk2
-rw-r--r--tools/llvm-cov/llvm-cov.cpp72
-rw-r--r--tools/llvm-dis/llvm-dis.cpp4
-rw-r--r--tools/llvm-dwarfdump/Android.mk2
-rw-r--r--tools/llvm-dwarfdump/llvm-dwarfdump.cpp13
-rw-r--r--tools/llvm-lto/Android.mk4
-rw-r--r--tools/llvm-lto/llvm-lto.cpp2
-rw-r--r--tools/llvm-mc/llvm-mc.cpp60
-rw-r--r--tools/llvm-mcmarkup/llvm-mcmarkup.cpp11
-rw-r--r--tools/llvm-nm/Android.mk28
-rw-r--r--tools/llvm-nm/CMakeLists.txt1
-rw-r--r--tools/llvm-nm/Makefile2
-rw-r--r--tools/llvm-nm/llvm-nm.cpp552
-rw-r--r--tools/llvm-objdump/Android.mk4
-rw-r--r--tools/llvm-objdump/CMakeLists.txt1
-rw-r--r--tools/llvm-objdump/COFFDump.cpp43
-rw-r--r--tools/llvm-objdump/LLVMBuild.txt2
-rw-r--r--tools/llvm-objdump/MachODump.cpp21
-rw-r--r--tools/llvm-objdump/Makefile2
-rw-r--r--tools/llvm-objdump/llvm-objdump.cpp24
-rw-r--r--tools/llvm-objdump/llvm-objdump.h4
-rw-r--r--tools/llvm-profdata/llvm-profdata.cpp7
-rw-r--r--tools/llvm-readobj/ARMWinEHPrinter.cpp744
-rw-r--r--tools/llvm-readobj/ARMWinEHPrinter.h119
-rw-r--r--tools/llvm-readobj/Android.mk3
-rw-r--r--tools/llvm-readobj/CMakeLists.txt1
-rw-r--r--tools/llvm-readobj/COFFDumper.cpp65
-rw-r--r--tools/llvm-readobj/ELFDumper.cpp336
-rw-r--r--tools/llvm-readobj/Error.cpp22
-rw-r--r--tools/llvm-readobj/Error.h39
-rw-r--r--tools/llvm-readobj/MachODumper.cpp32
-rw-r--r--tools/llvm-readobj/ObjDumper.h23
-rw-r--r--tools/llvm-readobj/StreamWriter.h4
-rw-r--r--tools/llvm-readobj/Win64EHDumper.cpp17
-rw-r--r--tools/llvm-readobj/Win64EHDumper.h5
-rw-r--r--tools/llvm-readobj/llvm-readobj.cpp41
-rw-r--r--tools/llvm-readobj/llvm-readobj.h5
-rw-r--r--tools/llvm-rtdyld/Android.mk29
-rw-r--r--tools/llvm-rtdyld/CMakeLists.txt2
-rw-r--r--tools/llvm-rtdyld/llvm-rtdyld.cpp150
-rw-r--r--tools/llvm-size/Android.mk10
-rw-r--r--tools/llvm-size/llvm-size.cpp599
-rw-r--r--tools/llvm-symbolizer/LLVMSymbolize.cpp21
-rw-r--r--tools/lto/CMakeLists.txt6
-rw-r--r--tools/lto/lto.cpp40
-rw-r--r--tools/macho-dump/Android.mk2
-rw-r--r--tools/macho-dump/macho-dump.cpp17
-rw-r--r--tools/obj2yaml/Android.mk2
-rw-r--r--tools/obj2yaml/Error.cpp21
-rw-r--r--tools/obj2yaml/Error.h33
-rw-r--r--tools/obj2yaml/coff2yaml.cpp6
-rw-r--r--tools/obj2yaml/elf2yaml.cpp88
-rw-r--r--tools/obj2yaml/obj2yaml.cpp8
-rw-r--r--tools/obj2yaml/obj2yaml.h10
-rw-r--r--tools/opt/Android.mk1
-rw-r--r--tools/opt/opt.cpp1
-rw-r--r--tools/yaml2obj/Android.mk2
-rw-r--r--tools/yaml2obj/yaml2coff.cpp3
-rw-r--r--tools/yaml2obj/yaml2elf.cpp6
-rw-r--r--tools/yaml2obj/yaml2obj.cpp42
-rw-r--r--tools/yaml2obj/yaml2obj.h10
-rw-r--r--unittests/ADT/APFloatTest.cpp54
-rw-r--r--unittests/ADT/ArrayRefTest.cpp7
-rw-r--r--unittests/ADT/CMakeLists.txt1
-rw-r--r--unittests/ADT/DenseMapTest.cpp6
-rw-r--r--unittests/ADT/DenseSetTest.cpp2
-rw-r--r--unittests/ADT/HashingTest.cpp2
-rw-r--r--unittests/ADT/ImmutableMapTest.cpp4
-rw-r--r--unittests/ADT/OwningPtrTest.cpp273
-rw-r--r--unittests/ADT/PointerUnionTest.cpp2
-rw-r--r--unittests/ADT/SCCIteratorTest.cpp2
-rw-r--r--unittests/ADT/SmallVectorTest.cpp210
-rw-r--r--unittests/ADT/StringMapTest.cpp48
-rw-r--r--unittests/ADT/ilistTest.cpp4
-rw-r--r--unittests/Analysis/CFGTest.cpp17
-rw-r--r--unittests/Analysis/MixedTBAATest.cpp2
-rw-r--r--unittests/Analysis/ScalarEvolutionTest.cpp4
-rw-r--r--unittests/CMakeLists.txt1
-rw-r--r--unittests/ExecutionEngine/CMakeLists.txt7
-rw-r--r--unittests/ExecutionEngine/ExecutionEngineTest.cpp22
-rw-r--r--unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp12
-rw-r--r--unittests/ExecutionEngine/JIT/JITTest.cpp16
-rw-r--r--unittests/ExecutionEngine/JIT/MultiJITTest.cpp26
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp22
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp24
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp18
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITTest.cpp2
-rw-r--r--unittests/IR/ConstantRangeTest.cpp8
-rw-r--r--unittests/IR/ConstantsTest.cpp10
-rw-r--r--unittests/IR/DominatorTreeTest.cpp2
-rw-r--r--unittests/IR/IRBuilderTest.cpp10
-rw-r--r--unittests/IR/InstructionsTest.cpp16
-rw-r--r--unittests/IR/LegacyPassManagerTest.cpp14
-rw-r--r--unittests/IR/MDBuilderTest.cpp14
-rw-r--r--unittests/IR/MetadataTest.cpp2
-rw-r--r--unittests/IR/PassManagerTest.cpp2
-rw-r--r--unittests/IR/PatternMatch.cpp12
-rw-r--r--unittests/IR/TypeBuilderTest.cpp6
-rw-r--r--unittests/IR/UserTest.cpp2
-rw-r--r--unittests/IR/ValueHandleTest.cpp32
-rw-r--r--unittests/IR/ValueMapTest.cpp38
-rw-r--r--unittests/IR/ValueTest.cpp6
-rw-r--r--unittests/IR/VerifierTest.cpp17
-rw-r--r--unittests/IR/WaymarkTest.cpp2
-rw-r--r--unittests/Linker/LinkModulesTest.cpp10
-rw-r--r--unittests/MC/CMakeLists.txt10
-rw-r--r--unittests/MC/MCAtomTest.cpp4
-rw-r--r--unittests/MC/Makefile2
-rw-r--r--unittests/MC/StringTableBuilderTest.cpp (renamed from unittests/Object/StringTableBuilderTest.cpp)2
-rw-r--r--unittests/MC/YAMLTest.cpp (renamed from unittests/Object/YAMLTest.cpp)4
-rw-r--r--unittests/Makefile2
-rw-r--r--unittests/Object/CMakeLists.txt9
-rw-r--r--unittests/Support/CMakeLists.txt3
-rw-r--r--unittests/Support/Casting.cpp8
-rw-r--r--unittests/Support/CommandLineTest.cpp2
-rw-r--r--unittests/Support/ConvertUTFTest.cpp1599
-rw-r--r--unittests/Support/DataExtractorTest.cpp2
-rw-r--r--unittests/Support/ErrorOrTest.cpp11
-rw-r--r--unittests/Support/FileOutputBufferTest.cpp25
-rw-r--r--unittests/Support/LockFileManagerTest.cpp6
-rw-r--r--unittests/Support/ManagedStatic.cpp12
-rw-r--r--unittests/Support/MemoryBufferTest.cpp40
-rw-r--r--unittests/Support/MemoryTest.cpp174
-rw-r--r--unittests/Support/Path.cpp47
-rw-r--r--unittests/Support/ProgramTest.cpp60
-rw-r--r--unittests/Support/ScaledNumberTest.cpp536
-rw-r--r--unittests/Support/SpecialCaseListTest.cpp126
-rw-r--r--unittests/Support/StringPool.cpp31
-rw-r--r--unittests/Support/SwapByteOrderTest.cpp108
-rw-r--r--unittests/Support/ThreadLocalTest.cpp4
-rw-r--r--unittests/Support/TimeValueTest.cpp2
-rw-r--r--unittests/Support/YAMLIOTest.cpp96
-rw-r--r--unittests/Support/raw_ostream_test.cpp4
-rw-r--r--unittests/Transforms/DebugIR/DebugIR.cpp5
-rw-r--r--unittests/Transforms/Utils/CMakeLists.txt1
-rw-r--r--unittests/Transforms/Utils/Cloning.cpp4
-rw-r--r--unittests/Transforms/Utils/SpecialCaseList.cpp232
-rw-r--r--utils/FileCheck/FileCheck.cpp29
-rw-r--r--utils/FileUpdate/Android.mk33
-rw-r--r--utils/FileUpdate/CMakeLists.txt5
-rw-r--r--utils/FileUpdate/FileUpdate.cpp87
-rw-r--r--utils/FileUpdate/Makefile21
-rw-r--r--utils/KillTheDoctor/KillTheDoctor.cpp29
-rw-r--r--utils/Makefile4
-rw-r--r--utils/PerfectShuffle/PerfectShuffle.cpp4
-rw-r--r--utils/TableGen/Android.mk1
-rw-r--r--utils/TableGen/AsmMatcherEmitter.cpp47
-rw-r--r--utils/TableGen/AsmWriterEmitter.cpp66
-rw-r--r--utils/TableGen/CMakeLists.txt1
-rw-r--r--utils/TableGen/CTagsEmitter.cpp4
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.cpp8
-rw-r--r--utils/TableGen/CodeGenIntrinsics.h1
-rw-r--r--utils/TableGen/CodeGenRegisters.h2
-rw-r--r--utils/TableGen/CodeGenSchedule.h2
-rw-r--r--utils/TableGen/CodeGenTarget.cpp2
-rw-r--r--utils/TableGen/IntrinsicEmitter.cpp56
-rw-r--r--utils/TableGen/TableGen.cpp2
-rw-r--r--utils/emacs/tablegen-mode.el1
-rw-r--r--utils/lit/lit/discovery.py6
-rw-r--r--utils/lit/lit/util.py17
-rwxr-xr-xutils/llvm-compilers-check20
-rwxr-xr-xutils/llvm-lit/llvm-lit.in2
-rw-r--r--utils/llvm.natvis12
-rw-r--r--utils/not/not.cpp11
-rw-r--r--utils/yaml-bench/YAMLBench.cpp8
2155 files changed, 86375 insertions, 29104 deletions
diff --git a/.gitignore b/.gitignore
index 8225aa6..b59a311 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,8 @@ tools/clang
tools/lldb
# lld, which is tracked independently.
tools/lld
+# Polly, which is tracked independently.
+tools/polly
# Sphinx build tree, if building in-source dir.
docs/_build
diff --git a/Android.mk b/Android.mk
index d456977..75dd2a0 100644
--- a/Android.mk
+++ b/Android.mk
@@ -25,6 +25,7 @@ subdirs := \
lib/Linker \
lib/LTO \
lib/MC \
+ lib/MC/MCAnalysis \
lib/MC/MCDisassembler \
lib/MC/MCParser \
lib/Object \
@@ -113,7 +114,6 @@ subdirs += \
subdirs += \
utils/count \
utils/FileCheck \
- utils/FileUpdate \
utils/not \
utils/TableGen \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d6eead..b9fca2a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,6 +114,12 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
# They are used as destination of target generators.
set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib)
+if(WIN32 OR CYGWIN)
+ # DLL platform -- put DLLs into bin.
+ set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
+else()
+ set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
+endif()
# Each of them corresponds to llvm-config's.
set(LLVM_TOOLS_BINARY_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) # --bindir
@@ -234,7 +240,7 @@ option(LLVM_USE_OPROFILE
# If enabled, verify we are on a platform that supports oprofile.
if( LLVM_USE_OPROFILE )
if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
- message(FATAL_ERROR "OProfile support is available on Linux only.")
+ message(FATAL_ERROR "OProfile support is available on Linux only.")
endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
endif( LLVM_USE_OPROFILE )
@@ -244,6 +250,9 @@ set(LLVM_USE_SANITIZER "" CACHE STRING
option(LLVM_USE_SPLIT_DWARF
"Use -gsplit-dwarf when compiling llvm." OFF)
+option(WITH_POLLY "Build LLVM with Polly" ON)
+option(LINK_POLLY_INTO_TOOLS "Static link Polly into tools" OFF)
+
# Define an option controlling whether we should build for 32-bit on 64-bit
# platforms, where supported.
if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
@@ -496,7 +505,6 @@ add_subdirectory(lib)
if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/FileCheck)
- add_subdirectory(utils/FileUpdate)
add_subdirectory(utils/PerfectShuffle)
add_subdirectory(utils/count)
add_subdirectory(utils/not)
@@ -515,9 +523,6 @@ endif()
add_subdirectory(projects)
-option(WITH_POLLY "Build LLVM with Polly" ON)
-option(LINK_POLLY_INTO_TOOLS "Static link Polly into tools" OFF)
-
if(WITH_POLLY)
if(NOT EXISTS ${LLVM_MAIN_SRC_DIR}/tools/polly/CMakeLists.txt)
set(WITH_POLLY OFF)
@@ -583,6 +588,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
PATTERN "*.inc"
# Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def"
PATTERN "CMakeFiles" EXCLUDE
+ PATTERN "config.h" EXCLUDE
PATTERN ".svn" EXCLUDE
)
endif()
diff --git a/CREDITS.TXT b/CREDITS.TXT
index 311a661..0447c40 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -107,6 +107,10 @@ N: Rafael Avila de Espindola
E: rafael.espindola@gmail.com
D: The ARM backend
+N: Dave Estes
+E: cestes@codeaurora.org
+D: AArch64 machine description for Cortex-A53
+
N: Alkis Evlogimenos
E: alkis@evlogimenos.com
D: Linear scan register allocator, many codegen improvements, Java frontend
@@ -162,10 +166,12 @@ D: Improvements for space efficiency
N: James Grosbach
E: grosbach@apple.com
+I: grosbach
D: SjLj exception handling support
D: General fixes and improvements for the ARM back-end
D: MCJIT
D: ARM integrated assembler and assembly parser
+D: Led effort for the backend formerly known as ARM64
N: Lang Hames
E: lhames@gmail.com
@@ -339,6 +345,10 @@ D: LTO tool, PassManager rewrite, Loop Pass Manager, Loop Rotate
D: GCC PCH Integration (llvm-gcc), llvm-gcc improvements
D: Optimizer improvements, Loop Index Split
+N: Ana Pazos
+E: apazos@codeaurora.org
+D: Fixes and improvements to the AArch64 backend
+
N: Wesley Peck
E: peckw@wesleypeck.com
W: http://wesleypeck.com/
@@ -368,8 +378,10 @@ D: ARM calling conventions rewrite, hard float support
N: Chad Rosier
E: mcrosier@codeaurora.org
-D: ARM fast-isel improvements
-D: Performance monitoring
+I: mcrosier
+D: AArch64 fast instruction selection pass
+D: Fixes and improvements to the ARM fast-isel pass
+D: Fixes and improvements to the AArch64 backend
N: Nadav Rotem
E: nrotem@apple.com
diff --git a/Makefile.rules b/Makefile.rules
index 9417971..ebebc0a 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -377,6 +377,7 @@ ifeq ($(ENABLE_COVERAGE),1)
BuildMode := $(BuildMode)+Coverage
CXX.Flags += -ftest-coverage -fprofile-arcs
C.Flags += -ftest-coverage -fprofile-arcs
+ LD.Flags += -ftest-coverage -fprofile-arcs
endif
# If DISABLE_ASSERTIONS=1 is specified (make command line or configured),
@@ -1724,7 +1725,7 @@ $(ObjDir)/%GenDFAPacketizer.inc.tmp : %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
# Dump all the records to <target>.td.expanded. This is useful for debugging.
$(TARGET:%=%.td.expanded): \
-%.td.expanded : %.td $(LLVM_TBLGEN)
+%.td.expanded : %.td $(LLVM_TBLGEN) $(TDFiles)
$(Echo) "Building a fully expanded version of $(<F)"
$(Verb) $(LLVMTableGen) -o $(call SYSPATH, $@) $<
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 08f756c..a1c2ac5 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -1282,16 +1282,6 @@ AC_PATH_PROG(TAR, [tar], [gtar])
AC_PATH_PROG(BINPWD,[pwd], [pwd])
dnl Looking for misc. graph plotting software
-AC_PATH_PROG(GRAPHVIZ, [Graphviz], [echo Graphviz])
-if test "$GRAPHVIZ" != "echo Graphviz" ; then
- AC_DEFINE([HAVE_GRAPHVIZ],[1],[Define if the Graphviz program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- GRAPHVIZ=`echo $GRAPHVIZ | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_GRAPHVIZ],"$GRAPHVIZ${EXEEXT}",
- [Define to path to Graphviz program if found or 'echo Graphviz' otherwise])
-fi
AC_PATH_PROG(DOT, [dot], [echo dot])
if test "$DOT" != "echo dot" ; then
AC_DEFINE([HAVE_DOT],[1],[Define if the dot program is available])
@@ -1302,76 +1292,6 @@ if test "$DOT" != "echo dot" ; then
AC_DEFINE_UNQUOTED([LLVM_PATH_DOT],"$DOT${EXEEXT}",
[Define to path to dot program if found or 'echo dot' otherwise])
fi
-AC_PATH_PROG(FDP, [fdp], [echo fdp])
-if test "$FDP" != "echo fdp" ; then
- AC_DEFINE([HAVE_FDP],[1],[Define if the neat program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- FDP=`echo $FDP | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_FDP],"$FDP${EXEEXT}",
- [Define to path to fdp program if found or 'echo fdp' otherwise])
-fi
-AC_PATH_PROG(NEATO, [neato], [echo neato])
-if test "$NEATO" != "echo neato" ; then
- AC_DEFINE([HAVE_NEATO],[1],[Define if the neat program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- NEATO=`echo $NEATO | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_NEATO],"$NEATO${EXEEXT}",
- [Define to path to neato program if found or 'echo neato' otherwise])
-fi
-AC_PATH_PROG(TWOPI, [twopi], [echo twopi])
-if test "$TWOPI" != "echo twopi" ; then
- AC_DEFINE([HAVE_TWOPI],[1],[Define if the neat program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- TWOPI=`echo $TWOPI | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_TWOPI],"$TWOPI${EXEEXT}",
- [Define to path to twopi program if found or 'echo twopi' otherwise])
-fi
-AC_PATH_PROG(CIRCO, [circo], [echo circo])
-if test "$CIRCO" != "echo circo" ; then
- AC_DEFINE([HAVE_CIRCO],[1],[Define if the neat program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- CIRCO=`echo $CIRCO | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_CIRCO],"$CIRCO${EXEEXT}",
- [Define to path to circo program if found or 'echo circo' otherwise])
-fi
-AC_PATH_PROGS(GV, [gv gsview32], [echo gv])
-if test "$GV" != "echo gv" ; then
- AC_DEFINE([HAVE_GV],[1],[Define if the gv program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- GV=`echo $GV | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_GV],"$GV${EXEEXT}",
- [Define to path to gv program if found or 'echo gv' otherwise])
-fi
-AC_PATH_PROG(DOTTY, [dotty], [echo dotty])
-if test "$DOTTY" != "echo dotty" ; then
- AC_DEFINE([HAVE_DOTTY],[1],[Define if the dotty program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- DOTTY=`echo $DOTTY | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_DOTTY],"$DOTTY${EXEEXT}",
- [Define to path to dotty program if found or 'echo dotty' otherwise])
-fi
-AC_PATH_PROGS(XDOT, [xdot xdot.py], [echo xdot])
-if test "$XDOT" != "echo xdot" ; then
- AC_DEFINE([HAVE_XDOT],[1],[Define if the xdot program is available])
- dnl If we're targeting for mingw we should emit windows paths, not msys
- if test "$llvm_cv_os_type" = "MingW" ; then
- XDOT=`echo $XDOT | sed 's/^\/\([[A-Za-z]]\)\//\1:\//' `
- fi
- AC_DEFINE_UNQUOTED([LLVM_PATH_XDOT],"$XDOT${EXEEXT}",
- [Define to path to xdot program if found or 'echo xdot' otherwise])
-fi
dnl Find the install program
AC_PROG_INSTALL
diff --git a/autoconf/m4/link_options.m4 b/autoconf/m4/link_options.m4
index b58d617..abf6596 100644
--- a/autoconf/m4/link_options.m4
+++ b/autoconf/m4/link_options.m4
@@ -6,7 +6,7 @@
AC_DEFUN([AC_LINK_GET_VERSION],
[AC_CACHE_CHECK([for linker version],[llvm_cv_link_version],
[
- version_string="$(ld -v 2>&1 | head -1)"
+ version_string="$(${LD:-ld} -v 2>&1 | head -1)"
# Check for ld64.
if (echo "$version_string" | grep -q "ld64"); then
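# Aside (illustrative, not part of this patch): ${LD:-ld} is POSIX
# default-value expansion: use $LD when it is set and non-empty, otherwise
# fall back to "ld". For example:
#   sh -c 'echo "${LD:-ld}"'              # LD unset: prints "ld"
#   LD=ld.gold sh -c 'echo "${LD:-ld}"'   # prints "ld.gold"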
diff --git a/autoconf/m4/path_tclsh.m4 b/autoconf/m4/path_tclsh.m4
deleted file mode 100644
index 85433de..0000000
--- a/autoconf/m4/path_tclsh.m4
+++ /dev/null
@@ -1,39 +0,0 @@
-dnl This macro checks for tclsh which is required to run dejagnu. On some
-dnl platforms (notably FreeBSD), tclsh is named tclshX.Y - this handles
-dnl that for us so we can get the latest installed tclsh version.
-dnl
-AC_DEFUN([DJ_AC_PATH_TCLSH], [
-no_itcl=true
-AC_MSG_CHECKING(for the tclsh program in tclinclude directory)
-AC_ARG_WITH(tclinclude,
- AS_HELP_STRING([--with-tclinclude],
- [directory where tcl headers are]),
- [with_tclinclude=${withval}],[with_tclinclude=''])
-AC_CACHE_VAL(ac_cv_path_tclsh,[
-dnl first check to see if --with-itclinclude was specified
-if test x"${with_tclinclude}" != x ; then
- if test -f ${with_tclinclude}/tclsh ; then
- ac_cv_path_tclsh=`(cd ${with_tclinclude}; pwd)`
- elif test -f ${with_tclinclude}/src/tclsh ; then
- ac_cv_path_tclsh=`(cd ${with_tclinclude}/src; pwd)`
- else
- AC_MSG_ERROR([${with_tclinclude} directory doesn't contain tclsh])
- fi
-fi])
-
-dnl see if one is installed
-if test x"${ac_cv_path_tclsh}" = x ; then
- AC_MSG_RESULT(none)
- AC_PATH_PROGS([TCLSH],[tclsh8.4 tclsh8.4.8 tclsh8.4.7 tclsh8.4.6 tclsh8.4.5 tclsh8.4.4 tclsh8.4.3 tclsh8.4.2 tclsh8.4.1 tclsh8.4.0 tclsh8.3 tclsh8.3.5 tclsh8.3.4 tclsh8.3.3 tclsh8.3.2 tclsh8.3.1 tclsh8.3.0 tclsh])
- if test x"${TCLSH}" = x ; then
- ac_cv_path_tclsh='';
- else
- ac_cv_path_tclsh="${TCLSH}";
- fi
-else
- AC_MSG_RESULT(${ac_cv_path_tclsh})
- TCLSH="${ac_cv_path_tclsh}"
- AC_SUBST(TCLSH)
-fi
-])
-
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index 16f0893..74a6062 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli
@@ -151,4 +151,6 @@ module ExecutionEngine: sig
val data_layout : t -> Llvm_target.DataLayout.t
end
+(** [initialize_native_target ()] initializes the native target corresponding
+ to the host. Returns [true] if initialization is {b not} done. *)
val initialize_native_target : unit -> bool
diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml
index d36f360..39875a5 100644
--- a/bindings/ocaml/llvm/llvm.ml
+++ b/bindings/ocaml/llvm/llvm.ml
@@ -16,6 +16,7 @@ type lluse
type llbasicblock
type llbuilder
type llmemorybuffer
+type llmdkind
module TypeKind = struct
type t =
@@ -299,7 +300,7 @@ type ('a, 'b) llrev_pos =
external create_context : unit -> llcontext = "llvm_create_context"
external dispose_context : llcontext -> unit = "llvm_dispose_context"
external global_context : unit -> llcontext = "llvm_global_context"
-external mdkind_id : llcontext -> string -> int = "llvm_mdkind_id"
+external mdkind_id : llcontext -> string -> llmdkind = "llvm_mdkind_id"
(*===-- Modules -----------------------------------------------------------===*)
external create_module : llcontext -> string -> llmodule = "llvm_create_module"
@@ -442,9 +443,9 @@ external constexpr_opcode : llvalue -> Opcode.t = "llvm_constexpr_get_opcode"
(*--... Operations on instructions .........................................--*)
external has_metadata : llvalue -> bool = "llvm_has_metadata"
-external metadata : llvalue -> int -> llvalue option = "llvm_metadata"
-external set_metadata : llvalue -> int -> llvalue -> unit = "llvm_set_metadata"
-external clear_metadata : llvalue -> int -> unit = "llvm_clear_metadata"
+external metadata : llvalue -> llmdkind -> llvalue option = "llvm_metadata"
+external set_metadata : llvalue -> llmdkind -> llvalue -> unit = "llvm_set_metadata"
+external clear_metadata : llvalue -> llmdkind -> unit = "llvm_clear_metadata"
(*--... Operations on metadata ............................................--*)
external mdstring : llcontext -> string -> llvalue = "llvm_mdstring"
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index e996121..f5f5b53 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -48,6 +48,9 @@ type llbuilder
See the [llvm::MemoryBuffer] class. *)
type llmemorybuffer
+(** The kind id of metadata attached to an instruction. *)
+type llmdkind
+
(** The kind of an [lltype], the result of [classify_type ty]. See the
[llvm::Type::TypeID] enumeration. *)
module TypeKind : sig
@@ -154,38 +157,40 @@ end
See the [llvm::ICmpInst::Predicate] enumeration. *)
module Icmp : sig
type t =
- | Eq
- | Ne
- | Ugt
- | Uge
- | Ult
- | Ule
- | Sgt
- | Sge
- | Slt
- | Sle
+ | Eq (* Equal *)
+ | Ne (* Not equal *)
+ | Ugt (* Unsigned greater than *)
+ | Uge (* Unsigned greater or equal *)
+ | Ult (* Unsigned less than *)
+ | Ule (* Unsigned less or equal *)
+ | Sgt (* Signed greater than *)
+ | Sge (* Signed greater or equal *)
+ | Slt (* Signed less than *)
+ | Sle (* Signed less or equal *)
end
(** The predicate for a floating-point comparison ([fcmp]) instruction.
+ Ordered means that neither operand is a QNAN while unordered means
+ that either operand may be a QNAN.
See the [llvm::FCmpInst::Predicate] enumeration. *)
module Fcmp : sig
type t =
- | False
- | Oeq
- | Ogt
- | Oge
- | Olt
- | Ole
- | One
- | Ord
- | Uno
- | Ueq
- | Ugt
- | Uge
- | Ult
- | Ule
- | Une
- | True
+ | False (* Always false *)
+ | Oeq (* Ordered and equal *)
+ | Ogt (* Ordered and greater than *)
+ | Oge (* Ordered and greater or equal *)
+ | Olt (* Ordered and less than *)
+ | Ole (* Ordered and less or equal *)
+ | One (* Ordered and not equal *)
+ | Ord (* Ordered (no operand is NaN) *)
+ | Uno (* Unordered (one operand at least is NaN) *)
+ | Ueq (* Unordered and equal *)
+ | Ugt (* Unordered and greater than *)
+ | Uge (* Unordered and greater or equal *)
+ | Ult (* Unordered and less than *)
+ | Ule (* Unordered and less or equal *)
+ | Une (* Unordered and not equal *)
+ | True (* Always true *)
end
(** The opcodes for LLVM instructions and constant expressions. *)
@@ -392,7 +397,7 @@ val global_context : unit -> llcontext
(** [mdkind_id context name] returns the MDKind ID that corresponds to the
name [name] in the context [context]. See the function
[llvm::LLVMContext::getMDKindID]. *)
-val mdkind_id : llcontext -> string -> int
+val mdkind_id : llcontext -> string -> llmdkind
(** {6 Modules} *)
@@ -770,15 +775,15 @@ val has_metadata : llvalue -> bool
(** [metadata i kind] optionally returns the metadata associated with the
    kind [kind] in the instruction [i]. See the function
[llvm::Instruction::getMetadata]. *)
-val metadata : llvalue -> int -> llvalue option
+val metadata : llvalue -> llmdkind -> llvalue option
(** [set_metadata i kind md] sets the metadata [md] of kind [kind] in the
instruction [i]. See the function [llvm::Instruction::setMetadata]. *)
-val set_metadata : llvalue -> int -> llvalue -> unit
+val set_metadata : llvalue -> llmdkind -> llvalue -> unit
(** [clear_metadata i kind] clears the metadata of kind [kind] in the
instruction [i]. See the function [llvm::Instruction::setMetadata]. *)
-val clear_metadata : llvalue -> int -> unit
+val clear_metadata : llvalue -> llmdkind -> unit
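+(* Illustrative use of the abstract [llmdkind] type (a sketch; assumes a
+   context [ctx] and an instruction [insn]):
+     let k = mdkind_id ctx "range" in
+     match metadata insn k with
+     | Some md -> set_metadata insn k md
+     | None -> clear_metadata insn k *)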
(** {7 Operations on metadata} *)
@@ -1048,12 +1053,12 @@ val const_lshr : llvalue -> llvalue -> llvalue
See the method [llvm::ConstantExpr::getAShr]. *)
val const_ashr : llvalue -> llvalue -> llvalue
-(** [const_gep pc indices] returns the constant [getElementPtr] of [p1] with the
+(** [const_gep pc indices] returns the constant [getElementPtr] of [pc] with the
constant integers indices from the array [indices].
See the method [llvm::ConstantExpr::getGetElementPtr]. *)
val const_gep : llvalue -> llvalue array -> llvalue
-(** [const_in_bounds_gep pc indices] returns the constant [getElementPtr] of [p1]
+(** [const_in_bounds_gep pc indices] returns the constant [getElementPtr] of [pc]
with the constant integers indices from the array [indices].
See the method [llvm::ConstantExpr::getInBoundsGetElementPtr]. *)
val const_in_bounds_gep : llvalue -> llvalue array -> llvalue
@@ -2357,7 +2362,7 @@ val build_insertelement : llvalue -> llvalue -> llvalue -> string ->
val build_shufflevector : llvalue -> llvalue -> llvalue -> string ->
llbuilder -> llvalue
-(** [build_insertvalue agg idx name b] creates a
+(** [build_extractvalue agg idx name b] creates a
[%name = extractvalue %agg, %idx]
instruction at the position specified by the instruction builder [b].
See the method [llvm::LLVMBuilder::CreateExtractValue]. *)
diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c
index 9e8778a..74e8185 100644
--- a/bindings/ocaml/target/target_ocaml.c
+++ b/bindings/ocaml/target/target_ocaml.c
@@ -352,8 +352,8 @@ CAMLprim value llvm_targetmachine_data_layout(value Machine) {
CAMLreturn(DataLayout);
}
-/* TargetMachine.t -> bool -> unit */
-CAMLprim value llvm_targetmachine_set_verbose_asm(value Machine, value Verb) {
+/* bool -> TargetMachine.t -> unit */
+CAMLprim value llvm_targetmachine_set_verbose_asm(value Verb, value Machine) {
LLVMSetTargetMachineAsmVerbosity(TargetMachine_val(Machine), Bool_val(Verb));
return Val_unit;
}
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 1325e79..b862ceb 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -252,15 +252,9 @@ function(llvm_find_program name)
endif(LLVM_PATH_${NAME})
endfunction()
-llvm_find_program(gv)
-llvm_find_program(circo)
-llvm_find_program(twopi)
-llvm_find_program(neato)
-llvm_find_program(fdp)
-llvm_find_program(dot)
-llvm_find_program(dotty)
-llvm_find_program(xdot xdot.py)
-llvm_find_program(Graphviz)
+if (LLVM_ENABLE_DOXYGEN)
+ llvm_find_program(dot)
+endif ()
if( LLVM_ENABLE_FFI )
find_path(FFI_INCLUDE_PATH ffi.h PATHS ${FFI_INCLUDE_DIR})
@@ -304,25 +298,6 @@ else()
set(ENABLE_PIC 0)
endif()
-find_package(LibXml2)
-if (LIBXML2_FOUND)
- set(CLANG_HAVE_LIBXML 1)
- # When cross-compiling, liblzma is not detected as a dependency for libxml2,
- # which makes linking c-index-test fail. But for native builds, all libraries
- # are installed and checked by CMake before Makefiles are generated and everything
- # works according to the plan. However, if a -llzma is added to native builds,
- # an additional requirement on the static liblzma.a is required, but will not
- # be checked by CMake, breaking native compilation.
- # Since this is only pertinent to cross-compilations, and there's no way CMake
- # can check for every foreign library on every OS, we add the dep and warn the dev.
- if ( CMAKE_CROSSCOMPILING )
- if (NOT PC_LIBXML_VERSION VERSION_LESS "2.8.0")
- message(STATUS "Adding LZMA as a dep to XML2 for cross-compilation, make sure liblzma.a is available.")
- set(LIBXML2_LIBRARIES ${LIBXML2_LIBRARIES} "-llzma")
- endif ()
- endif ()
-endif ()
-
check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG)
set(USE_NO_MAYBE_UNINITIALIZED 0)
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 69ffa5b..31540d9 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -150,6 +150,11 @@ endfunction(add_dead_strip)
# Note: Don't set variables CMAKE_*_OUTPUT_DIRECTORY any more,
# or a certain builder, for example, msbuild.exe, would be confused.
function(set_output_directory target bindir libdir)
+ # Do nothing if *_OUTPUT_INTDIR is empty.
+ if("${bindir}" STREQUAL "")
+ return()
+ endif()
+
if(NOT "${CMAKE_CFG_INTDIR}" STREQUAL ".")
foreach(build_mode ${CMAKE_CONFIGURATION_TYPES})
string(TOUPPER "${build_mode}" CONFIG_SUFFIX)
@@ -205,7 +210,7 @@ function(llvm_add_library name)
if(ARG_SHARED OR ARG_STATIC)
message(WARNING "MODULE with SHARED|STATIC doesn't make sense.")
endif()
- if(NOT LLVM_ON_UNIX OR CYGWIN)
+ if(NOT LLVM_ENABLE_PLUGINS)
message(STATUS "${name} ignored -- Loadable modules not supported on this platform.")
return()
endif()
@@ -570,12 +575,6 @@ function(configure_lit_site_cfg input output)
set(SHLIBEXT "${LTDL_SHLIB_EXT}")
- if(BUILD_SHARED_LIBS)
- set(LLVM_SHARED_LIBS_ENABLED "1")
- else()
- set(LLVM_SHARED_LIBS_ENABLED "0")
- endif(BUILD_SHARED_LIBS)
-
# Configuration-time: See Unit/lit.site.cfg.in
if (CMAKE_CFG_INTDIR STREQUAL ".")
set(LLVM_BUILD_MODE ".")
@@ -590,10 +589,16 @@ function(configure_lit_site_cfg input output)
string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLVM_LIBS_DIR ${LLVM_LIBRARY_DIR})
# SHLIBDIR points the build tree.
- string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} SHLIBDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
+ string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} SHLIBDIR "${LLVM_SHLIB_OUTPUT_INTDIR}")
set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
- set(ENABLE_SHARED ${LLVM_SHARED_LIBS_ENABLED})
+ # FIXME: "ENABLE_SHARED" doesn't make sense, since it is used just for
+ # plugins. We may rename it.
+ if(LLVM_ENABLE_PLUGINS)
+ set(ENABLE_SHARED "1")
+ else()
+ set(ENABLE_SHARED "0")
+ endif()
if(LLVM_ENABLE_ASSERTIONS AND NOT MSVC_IDE)
set(ENABLE_ASSERTIONS "1")
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 447ba52..b8577f7 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -67,12 +67,6 @@ if( LLVM_ENABLE_ASSERTIONS )
"${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
endforeach()
endif()
-else()
- if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" )
- if( NOT MSVC_IDE AND NOT XCODE )
- add_definitions( -DNDEBUG )
- endif()
- endif()
endif()
if(WIN32)
@@ -113,18 +107,6 @@ if(APPLE)
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace -Wl,-undefined -Wl,suppress")
endif()
-function(add_flag_or_print_warning flag)
- check_c_compiler_flag(${flag} C_SUPPORTS_FLAG)
- check_cxx_compiler_flag(${flag} CXX_SUPPORTS_FLAG)
- if (C_SUPPORTS_FLAG AND CXX_SUPPORTS_FLAG)
- message(STATUS "Building with ${flag}")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE)
- else()
- message(WARNING "${flag} is not supported.")
- endif()
-endfunction()
-
function(append value)
foreach(variable ${ARGN})
set(${variable} "${${variable}} ${value}" PARENT_SCOPE)
@@ -139,13 +121,25 @@ function(append_if condition value)
endif()
endfunction()
-macro(add_flag_if_supported flag)
- check_c_compiler_flag(${flag} C_SUPPORTS_FLAG)
- append_if(C_SUPPORTS_FLAG "${flag}" CMAKE_C_FLAGS)
- check_cxx_compiler_flag(${flag} CXX_SUPPORTS_FLAG)
- append_if(CXX_SUPPORTS_FLAG "${flag}" CMAKE_CXX_FLAGS)
+macro(add_flag_if_supported flag name)
+ check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}")
+ append_if("C_SUPPORTS_${name}" "${flag}" CMAKE_C_FLAGS)
+ check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}")
+ append_if("CXX_SUPPORTS_${name}" "${flag}" CMAKE_CXX_FLAGS)
endmacro()
+function(add_flag_or_print_warning flag name)
+ check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}")
+ check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}")
+ if ("C_SUPPORTS_${name}" AND "CXX_SUPPORTS_${name}")
+ message(STATUS "Building with ${flag}")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE)
+ else()
+ message(WARNING "${flag} is not supported.")
+ endif()
+endfunction()
+
if( LLVM_ENABLE_PIC )
if( XCODE )
# Xcode has -mdynamic-no-pic on by default, which overrides -fPIC. I don't
@@ -154,7 +148,7 @@ if( LLVM_ENABLE_PIC )
elseif( WIN32 OR CYGWIN)
# On Windows all code is PIC. MinGW warns if -fPIC is used.
else()
- add_flag_or_print_warning("-fPIC")
+ add_flag_or_print_warning("-fPIC" FPIC)
if( WIN32 OR CYGWIN)
# MinGW warns if -fvisibility-inlines-hidden is used.
@@ -290,10 +284,7 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
endif()
append_if(LLVM_ENABLE_PEDANTIC "-pedantic -Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- check_cxx_compiler_flag("-Werror -Wcovered-switch-default" CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG)
- append_if(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG "-Wcovered-switch-default" CMAKE_CXX_FLAGS)
- check_c_compiler_flag("-Werror -Wcovered-switch-default" C_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG)
- append_if(C_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG "-Wcovered-switch-default" CMAKE_C_FLAGS)
+ add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
check_cxx_compiler_flag("-Werror -Wnon-virtual-dtor" CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG)
@@ -311,6 +302,9 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
endif()
endif (LLVM_ENABLE_WARNINGS)
append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ if (NOT LLVM_ENABLE_TIMESTAMPS)
+ add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME)
+ endif ()
if (LLVM_ENABLE_CXX1Y)
check_cxx_compiler_flag("-std=c++1y" CXX_SUPPORTS_CXX1Y)
append_if(CXX_SUPPORTS_CXX1Y "-std=c++1y" CMAKE_CXX_FLAGS)
@@ -333,14 +327,14 @@ endif( MSVC )
macro(append_common_sanitizer_flags)
# Append -fno-omit-frame-pointer and turn on debug info to get better
# stack traces.
- add_flag_if_supported("-fno-omit-frame-pointer")
+ add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER)
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND
NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
- add_flag_if_supported("-gline-tables-only")
+ add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY)
endif()
# Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.
if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
- add_flag_if_supported("-O1")
+ add_flag_if_supported("-O1" O1)
endif()
endmacro()
@@ -349,12 +343,12 @@ if(LLVM_USE_SANITIZER)
if (LLVM_ON_UNIX)
if (LLVM_USE_SANITIZER STREQUAL "Address")
append_common_sanitizer_flags()
- add_flag_or_print_warning("-fsanitize=address")
+ append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
elseif (LLVM_USE_SANITIZER MATCHES "Memory(WithOrigins)?")
append_common_sanitizer_flags()
- add_flag_or_print_warning("-fsanitize=memory")
+ append("-fsanitize=memory" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
if(LLVM_USE_SANITIZER STREQUAL "MemoryWithOrigins")
- add_flag_or_print_warning("-fsanitize-memory-track-origins")
+ append("-fsanitize-memory-track-origins" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
else()
message(WARNING "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}")
@@ -390,15 +384,9 @@ if(NOT CYGWIN AND NOT WIN32)
if (C_SUPPORTS_FNO_FUNCTION_SECTIONS)
# Don't add -ffunction-section if it can be disabled with -fno-function-sections.
# Doing so will break sanitizers.
- check_c_compiler_flag("-Werror -ffunction-sections" C_SUPPORTS_FFUNCTION_SECTIONS)
- check_cxx_compiler_flag("-Werror -ffunction-sections" CXX_SUPPORTS_FFUNCTION_SECTIONS)
- append_if(C_SUPPORTS_FFUNCTION_SECTIONS "-ffunction-sections" CMAKE_C_FLAGS)
- append_if(CXX_SUPPORTS_FFUNCTION_SECTIONS "-ffunction-sections" CMAKE_CXX_FLAGS)
+ add_flag_if_supported("-ffunction-sections" FFUNCTION_SECTIONS)
endif()
- check_c_compiler_flag("-Werror -fdata-sections" C_SUPPORTS_FDATA_SECTIONS)
- check_cxx_compiler_flag("-Werror -fdata-sections" CXX_SUPPORTS_FDATA_SECTIONS)
- append_if(C_SUPPORTS_FDATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS)
- append_if(CXX_SUPPORTS_FDATA_SECTIONS "-fdata-sections" CMAKE_CXX_FLAGS)
+ add_flag_if_supported("-fdata-sections" FDATA_SECTIONS)
endif()
endif()
@@ -419,3 +407,12 @@ if(MSVC)
string(REGEX REPLACE "(^| ) */EH[-cs]+ *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "(^| ) */GR-? *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()
+
+# Plugin support
+# FIXME: Make this configurable.
+if(WIN32 OR CYGWIN)
+ # DLL platform(s) don't support plugins.
+ set(LLVM_ENABLE_PLUGINS OFF)
+else()
+ set(LLVM_ENABLE_PLUGINS ON)
+endif()
diff --git a/configure b/configure
index e1959df..e9aba06 100755
--- a/configure
+++ b/configure
@@ -14,7 +14,7 @@
## M4sh Initialization. ##
## --------------------- ##
-# Be Bourne compatible
+# Be Bourne compatible.
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
emulate sh
NULLCMD=:
@@ -7612,7 +7612,7 @@ if test "${llvm_cv_link_version+set}" = set; then
echo $ECHO_N "(cached) $ECHO_C" >&6
else
- version_string="$(ld -v 2>&1 | head -1)"
+ version_string="$(${LD:-ld} -v 2>&1 | head -1)"
# Check for ld64.
if (echo "$version_string" | grep -q "ld64"); then
diff --git a/docs/Atomics.rst b/docs/Atomics.rst
index 1243f34..5f17c61 100644
--- a/docs/Atomics.rst
+++ b/docs/Atomics.rst
@@ -110,8 +110,7 @@ where threads and signals are involved.
``cmpxchg`` and ``atomicrmw`` are essentially like an atomic load followed by an
atomic store (where the store is conditional for ``cmpxchg``), but no other
-memory operation can happen on any thread between the load and store. Note that
-LLVM's cmpxchg does not provide quite as many options as the C++0x version.
+memory operation can happen on any thread between the load and store.
A ``fence`` provides Acquire and/or Release ordering which is not part of
another operation; it is normally used along with Monotonic memory operations.
@@ -430,10 +429,9 @@ other ``atomicrmw`` operations generate a loop with ``LOCK CMPXCHG``. Depending
on the users of the result, some ``atomicrmw`` operations can be translated into
operations like ``LOCK AND``, but that does not work in general.
-On ARM, MIPS, and many other RISC architectures, Acquire, Release, and
-SequentiallyConsistent semantics require barrier instructions for every such
+On ARM (before v8), MIPS, and many other RISC architectures, Acquire, Release,
+and SequentiallyConsistent semantics require barrier instructions for every such
operation. Loads and stores generate normal instructions. ``cmpxchg`` and
``atomicrmw`` can be represented using a loop with LL/SC-style instructions
which take some sort of exclusive lock on a cache line (``LDREX`` and ``STREX``
-on ARM, etc.). At the moment, the IR does not provide any way to represent a
-weak ``cmpxchg`` which would not require a loop.
+on ARM, etc.).
diff --git a/docs/CMake.rst b/docs/CMake.rst
index fed283d..bfc9cb9 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -132,7 +132,7 @@ write the variable and the type on the CMake command line:
Frequently-used CMake variables
-------------------------------
-Here are listed some of the CMake variables that are used often, along with a
+Here are some of the CMake variables that are used often, along with a
brief explanation and LLVM-specific notes. For full documentation, check the
CMake docs or execute ``cmake --help-variable VARIABLE_NAME``.
@@ -157,8 +157,8 @@ CMake docs or execute ``cmake --help-variable VARIABLE_NAME``.
Extra flags to use when compiling C++ source files.
**BUILD_SHARED_LIBS**:BOOL
- Flag indicating is shared libraries will be built. Its default value is
- OFF. Shared libraries are not supported on Windows and not recommended in the
+ Flag indicating if shared libraries will be built. Its default value is
+ OFF. Shared libraries are not supported on Windows and not recommended on the
other OSes.
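+
+  For example, to turn it on at configure time (a sketch)::
+
+    cmake -DBUILD_SHARED_LIBS:BOOL=ON <path to llvm sources>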
.. _LLVM-specific variables:
@@ -487,7 +487,7 @@ into LLVM source tree. You can achieve it in two easy steps:
#. Adding ``add_subdirectory(<pass name>)`` line into
``<LLVM root>/lib/Transform/CMakeLists.txt``.
-Compiler/Platform specific topics
+Compiler/Platform-specific topics
=================================
Notes for specific compilers and/or platforms.
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index cc09946..5736e43 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -1228,7 +1228,7 @@ used. Each virtual register can only be mapped to physical registers of a
particular class. For instance, in the X86 architecture, some virtuals can only
be allocated to 8 bit registers. A register class is described by
``TargetRegisterClass`` objects. To discover if a virtual register is
-compatible with a given physical, this code can be used:</p>
+compatible with a given physical, this code can be used:
.. code-block:: c++
@@ -1683,7 +1683,7 @@ ones supported by the matcher), through a Requires clause:
def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>;
-In this example, the mnemonic gets mapped into different a new one depending on
+In this example, the mnemonic gets mapped into a different one depending on
the current instruction set.
Instruction Aliases
@@ -2027,7 +2027,7 @@ supported on x86/x86-64 and PowerPC. It is performed if:
* Option ``-tailcallopt`` is enabled.
-* Platform specific constraints are met.
+* Platform-specific constraints are met.
x86/x86-64 constraints:
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index edbef3a..3cfa1f6 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -107,10 +107,7 @@ unlikely to be supported by our host compilers.
* Trailing return types: N2541_
* Lambdas: N2927_
- * But *not* ``std::function``, until Clang implements `MSVC-compatible RTTI`_.
- In many cases, you may be able to use ``llvm::function_ref`` instead, and it
- is a superior choice in those cases.
- * And *not* lambdas with default arguments.
+ * But *not* lambdas with default arguments.
* ``decltype``: N2343_
* Nested closing right angle brackets: N1757_
diff --git a/docs/CommandGuide/bugpoint.rst b/docs/CommandGuide/bugpoint.rst
index e4663e5..f11585d 100644
--- a/docs/CommandGuide/bugpoint.rst
+++ b/docs/CommandGuide/bugpoint.rst
@@ -124,10 +124,6 @@ OPTIONS
do not use this option, **bugpoint** will attempt to generate a reference output
by compiling the program with the "safe" backend and running it.
-**--profile-info-file** *filename*
-
- Profile file loaded by **--profile-loader**.
-
**--run-{int,jit,llc,custom}**
Whenever the test program is compiled, **bugpoint** should generate code for it
diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst
index 3fed684..ad5b62c 100644
--- a/docs/CommandGuide/opt.rst
+++ b/docs/CommandGuide/opt.rst
@@ -99,10 +99,6 @@ OPTIONS
:option:`-std-compile-opts` and :option:`-verify-each` can quickly track down
this kind of problem.
-.. option:: -profile-info-file <filename>
-
- Specify the name of the file loaded by the ``-profile-loader`` option.
-
.. option:: -stats
Print statistics.
diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index a49485c..271c085 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -76,7 +76,7 @@ the target. It corresponds to the COFF relocation types
Syntax:
- ``.linkonce [ comdat type [ section identifier ] ]``
+ ``.linkonce [ comdat type ]``
Supported COMDAT types:
@@ -95,16 +95,6 @@ Supported COMDAT types:
Duplicates are discarded, but the linker issues an error if any duplicates
do not have exactly the same content.
-``associative``
- Links the section if a certain other COMDAT section is linked. This other
- section is indicated by its section identifier following the comdat type.
- The following restrictions apply to the associated section:
-
- 1. It must be the name of a section already defined.
- 2. It must differ from the current section.
- 3. It must be a COMDAT section.
- 4. It cannot be another associative COMDAT section.
-
``largest``
Links the largest section from among the duplicates.
@@ -118,10 +108,6 @@ Supported COMDAT types:
.linkonce
...
- .section .xdata$foo
- .linkonce associative .text$foo
- ...
-
``.section`` Directive
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -160,6 +146,25 @@ different COMDATs:
Symbol2:
.long 1
+In addition to the types allowed with ``.linkonce``, ``.section`` also accepts
+``associative``. The meaning is that the section is linked if a certain other
+COMDAT section is linked. This other section is indicated by the comdat symbol
+in this directive. It can be any symbol defined in the associated section, but
+is usually the associated section's comdat.
+
+The following restrictions apply to the associated section:
+
+1. It must be a COMDAT section.
+2. It cannot be another associative COMDAT section.
+
+In the following example, the symbol ``sym`` is the comdat symbol of ``.foo``
+and ``.bar`` is associated with ``.foo``.
+
+.. code-block:: gas
+
+ .section .foo,"bw",discard, "sym"
+ .section .bar,"rd",associative, "sym"
+
Target Specific Behaviour
=========================
@@ -190,3 +195,17 @@ range via a slight deviation. It will generate an indirect jump as follows:
blx r12
sub.w sp, sp, r4
+Variable Length Arrays
+^^^^^^^^^^^^^^^^^^^^^^
+
+The reference implementation (Microsoft Visual Studio 2012) does not permit the
+emission of Variable Length Arrays (VLAs).
+
+The Windows ARM Itanium ABI extends the base ABI by adding support for emitting
+a dynamic stack allocation. When emitting a variable stack allocation, a call
+to ``__chkstk`` is emitted unconditionally to ensure that guard pages are set
+up properly. The emission of this stack probe is handled similarly to the
+standard stack probe emission.
+
+The MSVC environment does not emit code for VLAs currently.
+
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index 323a6ea..dc6dab1 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -633,7 +633,7 @@ Threaded
Denotes a multithreaded mutator; the collector must still stop the mutator
("stop the world") before beginning reachability analysis. Stopping a
multithreaded mutator is a complicated problem. It generally requires highly
- platform specific code in the runtime, and the production of carefully
+ platform-specific code in the runtime, and the production of carefully
designed machine code at safe points.
Concurrent
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 3d2ec1e..6de9b90 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -87,9 +87,10 @@ Here's the short story for getting up and running quickly with LLVM:
* ``make check-all`` --- This runs the regression tests to ensure everything
is in working order.
- * It is also possible to use CMake instead of the makefiles. With CMake it is
- possible to generate project files for several IDEs: Xcode, Eclipse CDT4,
- CodeBlocks, Qt-Creator (use the CodeBlocks generator), KDevelop3.
+ * It is also possible to use `CMake <CMake.html>`_ instead of the makefiles.
+ With CMake it is possible to generate project files for several IDEs:
+ Xcode, Eclipse CDT4, CodeBlocks, Qt-Creator (use the CodeBlocks
+ generator), KDevelop3.
* If you get an "internal compiler error (ICE)" or test failures, see
`below`.
@@ -680,7 +681,7 @@ The following options can be used to set or enable LLVM specific options:
Enables optimized compilation (debugging symbols are removed and GCC
optimization flags are enabled). Note that this is the default setting if you
- are using the LLVM distribution. The default behavior of an Subversion
+ are using the LLVM distribution. The default behavior of a Subversion
checkout is to use an unoptimized build (also known as a debug build).
``--enable-debug-runtime``
@@ -698,14 +699,12 @@ The following options can be used to set or enable LLVM specific options:
Controls which targets will be built and linked into llc. The default value
for ``target_options`` is "all" which builds and links all available targets.
- The value "host-only" can be specified to build only a native compiler (no
- cross-compiler targets available). The "native" target is selected as the
- target of the build host. You can also specify a comma separated list of
- target names that you want available in llc. The target names use all lower
- case. The current set of targets is:
+ The "host" target is selected as the target of the build host. You can also
+ specify a comma separated list of target names that you want available in llc.
+ The target names use all lower case. The current set of targets is:
- ``arm, cpp, hexagon, mips, mipsel, msp430, powerpc, ptx, sparc, spu,
- systemz, x86, x86_64, xcore``.
+ ``aarch64, arm, arm64, cpp, hexagon, mips, mipsel, mips64, mips64el, msp430,
+ powerpc, nvptx, r600, sparc, systemz, x86, x86_64, xcore``.
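+
+  For example (a sketch)::
+
+    ./configure --enable-targets=x86,x86_64,arm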
``--enable-doxygen``
@@ -743,7 +742,7 @@ builds:
Debug Builds
- These builds are the default when one is using an Subversion checkout and
+ These builds are the default when one is using a Subversion checkout and
types ``gmake`` (unless the ``--enable-optimized`` option was used during
configuration). The build system will compile the tools and libraries with
debugging information. To get a Debug Build using the LLVM distribution the
diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst
index aa980d2..d914cc1 100644
--- a/docs/GettingStartedVS.rst
+++ b/docs/GettingStartedVS.rst
@@ -99,6 +99,9 @@ Here's the short story for getting up and running quickly with LLVM:
build.
* See the :doc:`LLVM CMake guide <CMake>` for detailed information about
how to configure the LLVM build.
+ * CMake generates project files for all build types. To select a specific
+ build type, use the Configuration manager from the VS IDE or the
+ ``/property:Configuration`` command line option when using MSBuild.
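+     For example (a sketch; the generated solution name can vary)::
+
+       msbuild LLVM.sln /property:Configuration=Release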
6. Start Visual Studio
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index fa40363..cc9656a 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -117,8 +117,8 @@ And the hard way:
.. code-block:: llvm
- %0 = add i32 %X, %X ; yields {i32}:%0
- %1 = add i32 %0, %0 ; yields {i32}:%1
+ %0 = add i32 %X, %X ; yields i32:%0
+ %1 = add i32 %0, %0 ; yields i32:%1
%result = add i32 %1, %1
This last way of multiplying ``%X`` by 8 illustrates several important
@@ -464,6 +464,34 @@ DLL storage class:
exists for defining a dll interface, the compiler, assembler and linker know
it is externally referenced and must refrain from deleting the symbol.
+.. _tls_model:
+
+Thread Local Storage Models
+---------------------------
+
+A variable may be defined as ``thread_local``, which means that it will
+not be shared by threads (each thread will have a separate copy of the
+variable). Not all targets support thread-local variables. Optionally, a
+TLS model may be specified:
+
+``localdynamic``
+ For variables that are only used within the current shared library.
+``initialexec``
+ For variables in modules that will not be loaded dynamically.
+``localexec``
+ For variables defined in the executable and only used within it.
+
+If no explicit model is given, the "general dynamic" model is used.
+
+The models correspond to the ELF TLS models; see `ELF Handling For
+Thread-Local Storage <http://people.redhat.com/drepper/tls.pdf>`_ for
+more information on under which circumstances the different models may
+be used. The target may choose a different TLS model if the specified
+model is not supported, or if a better choice of model can be made.
+
+A model can also be specified in an alias, but then it only governs how
+the alias is accessed. It will not have any effect on the aliasee.
+
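+For example, a minimal sketch (the variable names are illustrative):
+
+.. code-block:: llvm
+
+    @gd = thread_local global i32 0                 ; general dynamic model
+    @ld = thread_local(localdynamic) global i32 0
+    @ie = thread_local(initialexec) global i32 0
+    @le = thread_local(localexec) global i32 0
+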
.. _namedtypes:
Structure Types
@@ -491,29 +519,13 @@ Global Variables
Global variables define regions of memory allocated at compilation time
instead of run-time.
-Global variables definitions must be initialized, may have an explicit section
-to be placed in, and may have an optional explicit alignment specified.
+Global variable definitions must be initialized.
Global variables in other translation units can also be declared, in which
case they don't have an initializer.
-A variable may be defined as ``thread_local``, which means that it will
-not be shared by threads (each thread will have a separated copy of the
-variable). Not all targets support thread-local variables. Optionally, a
-TLS model may be specified:
-
-``localdynamic``
- For variables that are only used within the current shared library.
-``initialexec``
- For variables in modules that will not be loaded dynamically.
-``localexec``
- For variables defined in the executable and only used within it.
-
-The models correspond to the ELF TLS models; see `ELF Handling For
-Thread-Local Storage <http://people.redhat.com/drepper/tls.pdf>`_ for
-more information on under which circumstances the different models may
-be used. The target may choose a different TLS model if the specified
-model is not supported, or if a better choice of model can be made.
+Either global variable definitions or declarations may have an explicit section
+to be placed in and may have an optional explicit alignment specified.
A variable may be defined as a global ``constant``, which indicates that
the contents of the variable will **never** be modified (enabling better
@@ -550,6 +562,8 @@ is zero. The address space qualifier must precede any other attributes.
LLVM allows an explicit section to be specified for globals. If the
target supports it, it will emit globals to the section specified.
+Additionally, the global can be placed in a comdat if the target has the necessary
+support.
By default, global initializers are optimized by assuming that global
variables defined within the module are not modified from their
@@ -572,11 +586,14 @@ iteration.
Globals can also have a :ref:`DLL storage class <dllstorageclass>`.
+Variables and aliases can have a
+:ref:`Thread Local Storage Model <tls_model>`.
+
Syntax::
[@<GlobalVarName> =] [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
- [AddrSpace] [unnamed_addr] [ExternallyInitialized]
- <global | constant> <Type>
+ [unnamed_addr] [AddrSpace] [ExternallyInitialized]
+ <global | constant> <Type> [<InitializerConstant>]
[, section "name"] [, align <Alignment>]
For example, the following defines a global in a numbered address space
@@ -612,8 +629,9 @@ an optional ``unnamed_addr`` attribute, a return type, an optional
:ref:`parameter attribute <paramattrs>` for the return type, a function
name, a (possibly empty) argument list (each with optional :ref:`parameter
attributes <paramattrs>`), optional :ref:`function attributes <fnattrs>`,
-an optional section, an optional alignment, an optional :ref:`garbage
-collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
+an optional section, an optional alignment,
+an optional :ref:`comdat <langref_comdats>`,
+an optional :ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
curly brace, a list of basic blocks, and a closing curly brace.
LLVM function declarations consist of the "``declare``" keyword, an
@@ -643,6 +661,7 @@ predecessors, it also cannot have any :ref:`PHI nodes <i_phi>`.
LLVM allows an explicit section to be specified for functions. If the
target supports it, it will emit functions to the section specified.
+Additionally, the function can be placed in a COMDAT.
An explicit alignment may be specified for a function. If not present,
or if the alignment is set to zero, the alignment of the function is set
@@ -658,37 +677,131 @@ Syntax::
define [linkage] [visibility] [DLLStorageClass]
[cconv] [ret attrs]
<ResultType> @<FunctionName> ([argument list])
- [unnamed_addr] [fn Attrs] [section "name"] [align N]
- [gc] [prefix Constant] { ... }
+ [unnamed_addr] [fn Attrs] [section "name"] [comdat $<ComdatName>]
+ [align N] [gc] [prefix Constant] { ... }
.. _langref_aliases:
Aliases
-------
-Aliases act as "second name" for the aliasee value (which can be either
-function, global variable, another alias or bitcast of global value).
+Aliases, unlike functions or variables, don't create any new data. They
+are just a new symbol and metadata for an existing position.
+
+Aliases have a name and an aliasee that is either a global value or a
+constant expression.
+
Aliases may have an optional :ref:`linkage type <linkage>`, an optional
-:ref:`visibility style <visibility>`, and an optional :ref:`DLL storage class
-<dllstorageclass>`.
+:ref:`visibility style <visibility>`, an optional :ref:`DLL storage class
+<dllstorageclass>` and an optional :ref:`tls model <tls_model>`.
Syntax::
- @<Name> = [Visibility] [DLLStorageClass] alias [Linkage] <AliaseeTy> @<Aliasee>
+ @<Name> = [Visibility] [DLLStorageClass] [ThreadLocal] [unnamed_addr] alias [Linkage] <AliaseeTy> @<Aliasee>
The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``,
``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
-might not correctly handle dropping a weak symbol that is aliased by a non-weak
-alias.
+might not correctly handle dropping a weak symbol that is aliased.
Aliases that are not ``unnamed_addr`` are guaranteed to have the same address as
-the aliasee.
+the aliasee expression. ``unnamed_addr`` ones are only guaranteed to point
+to the same content.
+
+Since aliases are only a second name, some restrictions apply, some of
+which can only be checked when producing an object file:
+
+* The expression defining the aliasee must be computable at assembly
+ time. Since it is just a name, no relocations can be used.
+
+* No alias in the expression can be weak as the possibility of the
+ intermediate alias being overridden cannot be represented in an
+ object file.
-The aliasee must be a definition.
+* No global value in the expression can be a declaration, since that
+ would require a relocation, which is not possible.
-Aliases are not allowed to point to aliases with linkages that can be
-overridden. Since they are only a second name, the possibility of the
-intermediate alias being overridden cannot be represented in an object file.
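+
+For example, a minimal alias to a defined global (a sketch):
+
+.. code-block:: llvm
+
+    @g = global i32 42
+    @a = alias i32* @g
+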
+.. _langref_comdats:
+
+Comdats
+-------
+
+Comdat IR provides access to COFF and ELF object file COMDAT functionality.
+
+Comdats have a name which represents the COMDAT key. All global objects which
+specify this key will only end up in the final object file if the linker chooses
+that key over some other key. Aliases are placed in the same COMDAT that their
+aliasee computes to, if any.
+
+Comdats have a selection kind to provide input on how the linker should
+choose between keys in two different object files.
+
+Syntax::
+
+ $<Name> = comdat SelectionKind
+
+The selection kind must be one of the following:
+
+``any``
+    The linker may choose any COMDAT key; the choice is arbitrary.
+``exactmatch``
+ The linker may choose any COMDAT key but the sections must contain the
+ same data.
+``largest``
+ The linker will choose the section containing the largest COMDAT key.
+``noduplicates``
+    The linker requires that only one section with this COMDAT key exist.
+``samesize``
+ The linker may choose any COMDAT key but the sections must contain the
+ same amount of data.
+
+Note that the Mach-O platform doesn't support COMDATs and ELF only supports
+``any`` as a selection kind.
+
+Here is an example of a COMDAT group where a function will only be selected if
+the COMDAT key's section is the largest:
+
+.. code-block:: llvm
+
+ $foo = comdat largest
+ @foo = global i32 2, comdat $foo
+
+ define void @bar() comdat $foo {
+ ret void
+ }
+
+In a COFF object file, this will create a COMDAT section with selection kind
+``IMAGE_COMDAT_SELECT_LARGEST`` containing the contents of the ``@foo`` symbol
+and another COMDAT section with selection kind
+``IMAGE_COMDAT_SELECT_ASSOCIATIVE`` which is associated with the first COMDAT
+section and contains the contents of the ``@baz`` symbol.
+
+There are some restrictions on the properties of the global object.
+It, or an alias to it, must have the same name as the COMDAT group when
+targeting COFF.
+The contents and size of this object may be used during link-time to determine
+which COMDAT groups get selected depending on the selection kind.
+Because the name of the object must match the name of the COMDAT group, the
+linkage of the global object must not be local; local symbols can get renamed
+if a collision occurs in the symbol table.
+
+The combined use of COMDATs and section attributes may yield surprising results.
+For example:
+
+.. code-block:: llvm
+
+ $foo = comdat any
+ $bar = comdat any
+ @g1 = global i32 42, section "sec", comdat $foo
+ @g2 = global i32 42, section "sec", comdat $bar
+
+From the object file perspective, this requires the creation of two sections
+with the same name. This is necessary because both globals belong to different
+COMDAT groups and COMDATs, at the object file level, are represented by
+sections.
+
+Note that certain IR constructs like global variables and functions may create
+COMDATs in the object file in addition to any which are specified using COMDAT
+IR. This arises, for example, when a global variable has linkonce_odr linkage.
.. _namedmetadatastructure:
@@ -997,6 +1110,14 @@ example:
inlining this function is desirable (such as the "inline" keyword in
C/C++). It is just a hint; it imposes no requirements on the
inliner.
+``jumptable``
+ This attribute indicates that the function should be added to a
+ jump-instruction table at code-generation time, and that all address-taken
+ references to this function should be replaced with a reference to the
+ appropriate jump-instruction-table function pointer. Note that this creates
+ a new pointer for the original function, which means that code that depends
+ on function-pointer identity can break. So, any function annotated with
+ ``jumptable`` must also be ``unnamed_addr``.
``minsize``
This attribute suggests that optimization passes and code generator
passes make choices that keep the code size of this function as small
@@ -2715,11 +2836,12 @@ number representing the maximum relative error, for example:
'``range``' Metadata
^^^^^^^^^^^^^^^^^^^^
-``range`` metadata may be attached only to loads of integer types. It
-expresses the possible ranges the loaded value is in. The ranges are
-represented with a flattened list of integers. The loaded value is known
-to be in the union of the ranges defined by each consecutive pair. Each
-pair has the following properties:
+``range`` metadata may be attached only to ``load``, ``call`` and ``invoke`` of
+integer types. It expresses the possible ranges the loaded value or the value
+returned by the called function at this call site is in. The ranges are
+represented with a flattened list of integers. The loaded value or the value
+returned is known to be in the union of the ranges defined by each consecutive
+pair. Each pair has the following properties:
- The type must match the type loaded by the instruction.
- The pair ``a,b`` represents the range ``[a,b)``.
@@ -2737,8 +2859,9 @@ Examples:
%a = load i8* %x, align 1, !range !0 ; Can only be 0 or 1
%b = load i8* %y, align 1, !range !1 ; Can only be 255 (-1), 0 or 1
- %c = load i8* %z, align 1, !range !2 ; Can only be 0, 1, 3, 4 or 5
- %d = load i8* %z, align 1, !range !3 ; Can only be -2, -1, 3, 4 or 5
+ %c = call i8 @foo(), !range !2 ; Can only be 0, 1, 3, 4 or 5
+ %d = invoke i8 @bar() to label %cont
+ unwind label %lpad, !range !3 ; Can only be -2, -1, 3, 4 or 5
...
!0 = metadata !{ i8 0, i8 2 }
!1 = metadata !{ i8 255, i8 2 }
@@ -2768,7 +2891,7 @@ constructs:
The loop identifier metadata can be used to specify additional per-loop
metadata. Any operands after the first operand can be treated as user-defined
-metadata. For example the ``llvm.vectorizer.unroll`` metadata is understood
+metadata. For example the ``llvm.loop.vectorize.unroll`` metadata is understood
by the loop vectorizer to indicate how many times to unroll the loop:
.. code-block:: llvm
@@ -2776,7 +2899,7 @@ by the loop vectorizer to indicate how many times to unroll the loop:
br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
...
!0 = metadata !{ metadata !0, metadata !1 }
- !1 = metadata !{ metadata !"llvm.vectorizer.unroll", i32 2 }
+ !1 = metadata !{ metadata !"llvm.loop.vectorize.unroll", i32 2 }
'``llvm.mem``'
^^^^^^^^^^^^^^^
@@ -2796,7 +2919,7 @@ with the same loop identifier.
Precisely, given two instructions ``m1`` and ``m2`` that both have the
``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the
set of loops associated with that metadata, respectively, then there is no loop
-carried dependence between ``m1`` and ``m2`` for loops ``L1`` or
+carried dependence between ``m1`` and ``m2`` for loops in both ``L1`` and
``L2``.
As a special case, if all memory accessing instructions in a loop have
@@ -2861,54 +2984,54 @@ the loop identifier metadata node directly:
!1 = metadata !{ metadata !1 } ; an identifier for the inner loop
!2 = metadata !{ metadata !2 } ; an identifier for the outer loop
-'``llvm.vectorizer``'
-^^^^^^^^^^^^^^^^^^^^^
+'``llvm.loop.vectorize``'
+^^^^^^^^^^^^^^^^^^^^^^^^^
-Metadata prefixed with ``llvm.vectorizer`` is used to control per-loop
+Metadata prefixed with ``llvm.loop.vectorize`` is used to control per-loop
vectorization parameters such as vectorization factor and unroll factor.
-``llvm.vectorizer`` metadata should be used in conjunction with ``llvm.loop``
-loop identification metadata.
+``llvm.loop.vectorize`` metadata should be used in conjunction with
+``llvm.loop`` loop identification metadata.
-'``llvm.vectorizer.unroll``' Metadata
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.loop.vectorize.unroll``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata instructs the loop vectorizer to unroll the specified
loop exactly ``N`` times.
-The first operand is the string ``llvm.vectorizer.unroll`` and the second
+The first operand is the string ``llvm.loop.vectorize.unroll`` and the second
operand is an integer specifying the unroll factor. For example:
.. code-block:: llvm
- !0 = metadata !{ metadata !"llvm.vectorizer.unroll", i32 4 }
+ !0 = metadata !{ metadata !"llvm.loop.vectorize.unroll", i32 4 }
-Note that setting ``llvm.vectorizer.unroll`` to 1 disables unrolling of the
-loop.
+Note that setting ``llvm.loop.vectorize.unroll`` to 1 disables
+unrolling of the loop.
-If ``llvm.vectorizer.unroll`` is set to 0 then the amount of unrolling will be
-determined automatically.
+If ``llvm.loop.vectorize.unroll`` is set to 0 then the amount of
+unrolling will be determined automatically.
-'``llvm.vectorizer.width``' Metadata
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.loop.vectorize.width``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata sets the target width of the vectorizer to ``N``. Without
this metadata, the vectorizer will choose a width automatically.
Regardless of this metadata, the vectorizer will only vectorize loops if
it believes it is valid to do so.
-The first operand is the string ``llvm.vectorizer.width`` and the second
-operand is an integer specifying the width. For example:
+The first operand is the string ``llvm.loop.vectorize.width`` and the
+second operand is an integer specifying the width. For example:
.. code-block:: llvm
- !0 = metadata !{ metadata !"llvm.vectorizer.width", i32 4 }
+ !0 = metadata !{ metadata !"llvm.loop.vectorize.width", i32 4 }
-Note that setting ``llvm.vectorizer.width`` to 1 disables vectorization of the
-loop.
+Note that setting ``llvm.loop.vectorize.width`` to 1 disables
+vectorization of the loop.
-If ``llvm.vectorizer.width`` is set to 0 then the width will be determined
-automatically.
+If ``llvm.loop.vectorize.width`` is set to 0 then the width will be
+determined automatically.
Module Flags Metadata
=====================
@@ -3110,6 +3233,42 @@ Each individual option is required to be either a valid option for the target's
linker, or an option that is reserved by the target specific assembly writer or
object file emitter. No other aspect of these options is defined by the IR.
+C type width Module Flags Metadata
+----------------------------------
+
+The ARM backend emits a section into each generated object file describing the
+options that it was compiled with (in a compiler-independent way) to prevent
+linking incompatible objects, and to allow automatic library selection. Some
+of these options are not visible at the IR level, namely wchar_t width and enum
+width.
+
+To pass this information to the backend, these options are encoded in module
+flags metadata, using the following key-value pairs:
+
+.. list-table::
+ :header-rows: 1
+ :widths: 30 70
+
+ * - Key
+ - Value
+
+ * - short_wchar
+ - * 0 --- sizeof(wchar_t) == 4
+ * 1 --- sizeof(wchar_t) == 2
+
+ * - short_enum
+ - * 0 --- Enums are at least as large as an ``int``.
+ * 1 --- Enums are stored in the smallest integer type which can
+        represent all of their values.
+
+For example, the following metadata section specifies that the module was
+compiled with a ``wchar_t`` width of 2 bytes, and with enums that are at
+least as large as an ``int``::
+
+ !llvm.module.flags = !{!0, !1}
+ !0 = metadata !{i32 1, metadata !"short_wchar", i32 1}
+ !1 = metadata !{i32 1, metadata !"short_enum", i32 0}
+
.. _intrinsicglobalvariables:
Intrinsic Global Variables
@@ -3543,9 +3702,9 @@ Example:
.. code-block:: llvm
%retval = invoke i32 @Test(i32 15) to label %Continue
- unwind label %TestCleanup ; {i32}:retval set
+ unwind label %TestCleanup ; i32:retval set
%retval = invoke coldcc i32 %Testfnptr(i32 15) to label %Continue
- unwind label %TestCleanup ; {i32}:retval set
+ unwind label %TestCleanup ; i32:retval set
.. _i_resume:
@@ -3634,10 +3793,10 @@ Syntax:
::
- <result> = add <ty> <op1>, <op2> ; yields {ty}:result
- <result> = add nuw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = add nsw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = add nuw nsw <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = add <ty> <op1>, <op2> ; yields ty:result
+ <result> = add nuw <ty> <op1>, <op2> ; yields ty:result
+ <result> = add nsw <ty> <op1>, <op2> ; yields ty:result
+ <result> = add nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3673,7 +3832,7 @@ Example:
.. code-block:: llvm
- <result> = add i32 4, %var ; yields {i32}:result = 4 + %var
+ <result> = add i32 4, %var ; yields i32:result = 4 + %var
.. _i_fadd:
@@ -3685,7 +3844,7 @@ Syntax:
::
- <result> = fadd [fast-math flags]* <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = fadd [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3712,7 +3871,7 @@ Example:
.. code-block:: llvm
- <result> = fadd float 4.0, %var ; yields {float}:result = 4.0 + %var
+ <result> = fadd float 4.0, %var ; yields float:result = 4.0 + %var
'``sub``' Instruction
^^^^^^^^^^^^^^^^^^^^^
@@ -3722,10 +3881,10 @@ Syntax:
::
- <result> = sub <ty> <op1>, <op2> ; yields {ty}:result
- <result> = sub nuw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = sub nsw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = sub nuw nsw <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = sub <ty> <op1>, <op2> ; yields ty:result
+ <result> = sub nuw <ty> <op1>, <op2> ; yields ty:result
+ <result> = sub nsw <ty> <op1>, <op2> ; yields ty:result
+ <result> = sub nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3764,8 +3923,8 @@ Example:
.. code-block:: llvm
- <result> = sub i32 4, %var ; yields {i32}:result = 4 - %var
- <result> = sub i32 0, %val ; yields {i32}:result = -%var
+ <result> = sub i32 4, %var ; yields i32:result = 4 - %var
+ <result> = sub i32 0, %val ; yields i32:result = -%var
.. _i_fsub:
@@ -3777,7 +3936,7 @@ Syntax:
::
- <result> = fsub [fast-math flags]* <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = fsub [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3807,8 +3966,8 @@ Example:
.. code-block:: llvm
- <result> = fsub float 4.0, %var ; yields {float}:result = 4.0 - %var
- <result> = fsub float -0.0, %val ; yields {float}:result = -%var
+ <result> = fsub float 4.0, %var ; yields float:result = 4.0 - %var
+ <result> = fsub float -0.0, %val ; yields float:result = -%var
'``mul``' Instruction
^^^^^^^^^^^^^^^^^^^^^
@@ -3818,10 +3977,10 @@ Syntax:
::
- <result> = mul <ty> <op1>, <op2> ; yields {ty}:result
- <result> = mul nuw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = mul nsw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = mul nuw nsw <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = mul <ty> <op1>, <op2> ; yields ty:result
+ <result> = mul nuw <ty> <op1>, <op2> ; yields ty:result
+ <result> = mul nsw <ty> <op1>, <op2> ; yields ty:result
+ <result> = mul nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3861,7 +4020,7 @@ Example:
.. code-block:: llvm
- <result> = mul i32 4, %var ; yields {i32}:result = 4 * %var
+ <result> = mul i32 4, %var ; yields i32:result = 4 * %var
.. _i_fmul:
@@ -3873,7 +4032,7 @@ Syntax:
::
- <result> = fmul [fast-math flags]* <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = fmul [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3900,7 +4059,7 @@ Example:
.. code-block:: llvm
- <result> = fmul float 4.0, %var ; yields {float}:result = 4.0 * %var
+ <result> = fmul float 4.0, %var ; yields float:result = 4.0 * %var
'``udiv``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
@@ -3910,8 +4069,8 @@ Syntax:
::
- <result> = udiv <ty> <op1>, <op2> ; yields {ty}:result
- <result> = udiv exact <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = udiv <ty> <op1>, <op2> ; yields ty:result
+ <result> = udiv exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3944,7 +4103,7 @@ Example:
.. code-block:: llvm
- <result> = udiv i32 4, %var ; yields {i32}:result = 4 / %var
+ <result> = udiv i32 4, %var ; yields i32:result = 4 / %var
'``sdiv``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
@@ -3954,8 +4113,8 @@ Syntax:
::
- <result> = sdiv <ty> <op1>, <op2> ; yields {ty}:result
- <result> = sdiv exact <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = sdiv <ty> <op1>, <op2> ; yields ty:result
+ <result> = sdiv exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -3990,7 +4149,7 @@ Example:
.. code-block:: llvm
- <result> = sdiv i32 4, %var ; yields {i32}:result = 4 / %var
+ <result> = sdiv i32 4, %var ; yields i32:result = 4 / %var
.. _i_fdiv:
@@ -4002,7 +4161,7 @@ Syntax:
::
- <result> = fdiv [fast-math flags]* <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = fdiv [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4029,7 +4188,7 @@ Example:
.. code-block:: llvm
- <result> = fdiv float 4.0, %var ; yields {float}:result = 4.0 / %var
+ <result> = fdiv float 4.0, %var ; yields float:result = 4.0 / %var
'``urem``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
@@ -4039,7 +4198,7 @@ Syntax:
::
- <result> = urem <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = urem <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4071,7 +4230,7 @@ Example:
.. code-block:: llvm
- <result> = urem i32 4, %var ; yields {i32}:result = 4 % %var
+ <result> = urem i32 4, %var ; yields i32:result = 4 % %var
'``srem``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
@@ -4081,7 +4240,7 @@ Syntax:
::
- <result> = srem <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = srem <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4126,7 +4285,7 @@ Example:
.. code-block:: llvm
- <result> = srem i32 4, %var ; yields {i32}:result = 4 % %var
+ <result> = srem i32 4, %var ; yields i32:result = 4 % %var
.. _i_frem:
@@ -4138,7 +4297,7 @@ Syntax:
::
- <result> = frem [fast-math flags]* <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = frem [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4166,7 +4325,7 @@ Example:
.. code-block:: llvm
- <result> = frem float 4.0, %var ; yields {float}:result = 4.0 % %var
+ <result> = frem float 4.0, %var ; yields float:result = 4.0 % %var
.. _bitwiseops:
@@ -4187,10 +4346,10 @@ Syntax:
::
- <result> = shl <ty> <op1>, <op2> ; yields {ty}:result
- <result> = shl nuw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = shl nsw <ty> <op1>, <op2> ; yields {ty}:result
- <result> = shl nuw nsw <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = shl <ty> <op1>, <op2> ; yields ty:result
+ <result> = shl nuw <ty> <op1>, <op2> ; yields ty:result
+ <result> = shl nsw <ty> <op1>, <op2> ; yields ty:result
+ <result> = shl nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4228,9 +4387,9 @@ Example:
.. code-block:: llvm
- <result> = shl i32 4, %var ; yields {i32}: 4 << %var
- <result> = shl i32 4, 2 ; yields {i32}: 16
- <result> = shl i32 1, 10 ; yields {i32}: 1024
+ <result> = shl i32 4, %var ; yields i32: 4 << %var
+ <result> = shl i32 4, 2 ; yields i32: 16
+ <result> = shl i32 1, 10 ; yields i32: 1024
<result> = shl i32 1, 32 ; undefined
<result> = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 2, i32 4>
@@ -4242,8 +4401,8 @@ Syntax:
::
- <result> = lshr <ty> <op1>, <op2> ; yields {ty}:result
- <result> = lshr exact <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = lshr <ty> <op1>, <op2> ; yields ty:result
+ <result> = lshr exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4277,10 +4436,10 @@ Example:
.. code-block:: llvm
- <result> = lshr i32 4, 1 ; yields {i32}:result = 2
- <result> = lshr i32 4, 2 ; yields {i32}:result = 1
- <result> = lshr i8 4, 3 ; yields {i8}:result = 0
- <result> = lshr i8 -2, 1 ; yields {i8}:result = 0x7F
+ <result> = lshr i32 4, 1 ; yields i32:result = 2
+ <result> = lshr i32 4, 2 ; yields i32:result = 1
+ <result> = lshr i8 4, 3 ; yields i8:result = 0
+ <result> = lshr i8 -2, 1 ; yields i8:result = 0x7F
<result> = lshr i32 1, 32 ; undefined
<result> = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1>
@@ -4292,8 +4451,8 @@ Syntax:
::
- <result> = ashr <ty> <op1>, <op2> ; yields {ty}:result
- <result> = ashr exact <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = ashr <ty> <op1>, <op2> ; yields ty:result
+ <result> = ashr exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4328,10 +4487,10 @@ Example:
.. code-block:: llvm
- <result> = ashr i32 4, 1 ; yields {i32}:result = 2
- <result> = ashr i32 4, 2 ; yields {i32}:result = 1
- <result> = ashr i8 4, 3 ; yields {i8}:result = 0
- <result> = ashr i8 -2, 1 ; yields {i8}:result = -1
+ <result> = ashr i32 4, 1 ; yields i32:result = 2
+ <result> = ashr i32 4, 2 ; yields i32:result = 1
+ <result> = ashr i8 4, 3 ; yields i8:result = 0
+ <result> = ashr i8 -2, 1 ; yields i8:result = -1
<result> = ashr i32 1, 32 ; undefined
<result> = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3> ; yields: result=<2 x i32> < i32 -1, i32 0>
@@ -4343,7 +4502,7 @@ Syntax:
::
- <result> = and <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = and <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4380,9 +4539,9 @@ Example:
.. code-block:: llvm
- <result> = and i32 4, %var ; yields {i32}:result = 4 & %var
- <result> = and i32 15, 40 ; yields {i32}:result = 8
- <result> = and i32 4, 8 ; yields {i32}:result = 0
+ <result> = and i32 4, %var ; yields i32:result = 4 & %var
+ <result> = and i32 15, 40 ; yields i32:result = 8
+ <result> = and i32 4, 8 ; yields i32:result = 0
'``or``' Instruction
^^^^^^^^^^^^^^^^^^^^
@@ -4392,7 +4551,7 @@ Syntax:
::
- <result> = or <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = or <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4429,9 +4588,9 @@ Example:
::
- <result> = or i32 4, %var ; yields {i32}:result = 4 | %var
- <result> = or i32 15, 40 ; yields {i32}:result = 47
- <result> = or i32 4, 8 ; yields {i32}:result = 12
+ <result> = or i32 4, %var ; yields i32:result = 4 | %var
+ <result> = or i32 15, 40 ; yields i32:result = 47
+ <result> = or i32 4, 8 ; yields i32:result = 12
'``xor``' Instruction
^^^^^^^^^^^^^^^^^^^^^
@@ -4441,7 +4600,7 @@ Syntax:
::
- <result> = xor <ty> <op1>, <op2> ; yields {ty}:result
+ <result> = xor <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
@@ -4479,10 +4638,10 @@ Example:
.. code-block:: llvm
- <result> = xor i32 4, %var ; yields {i32}:result = 4 ^ %var
- <result> = xor i32 15, 40 ; yields {i32}:result = 39
- <result> = xor i32 4, 8 ; yields {i32}:result = 12
- <result> = xor i32 %V, -1 ; yields {i32}:result = ~%V
+ <result> = xor i32 4, %var ; yields i32:result = 4 ^ %var
+ <result> = xor i32 15, 40 ; yields i32:result = 39
+ <result> = xor i32 4, 8 ; yields i32:result = 12
+ <result> = xor i32 %V, -1 ; yields i32:result = ~%V
Vector Operations
-----------------
@@ -4748,7 +4907,7 @@ Syntax:
::
- <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>] ; yields {type*}:result
+ <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>] ; yields type*:result
Overview:
"""""""""
@@ -4790,10 +4949,10 @@ Example:
.. code-block:: llvm
- %ptr = alloca i32 ; yields {i32*}:ptr
- %ptr = alloca i32, i32 4 ; yields {i32*}:ptr
- %ptr = alloca i32, i32 4, align 1024 ; yields {i32*}:ptr
- %ptr = alloca i32, align 1024 ; yields {i32*}:ptr
+ %ptr = alloca i32 ; yields i32*:ptr
+ %ptr = alloca i32, i32 4 ; yields i32*:ptr
+ %ptr = alloca i32, i32 4, align 1024 ; yields i32*:ptr
+ %ptr = alloca i32, align 1024 ; yields i32*:ptr
.. _i_load:
@@ -4876,9 +5035,9 @@ Examples:
.. code-block:: llvm
- %ptr = alloca i32 ; yields {i32*}:ptr
- store i32 3, i32* %ptr ; yields {void}
- %val = load i32* %ptr ; yields {i32}:val = i32 3
+ %ptr = alloca i32 ; yields i32*:ptr
+ store i32 3, i32* %ptr ; yields void
+ %val = load i32* %ptr ; yields i32:val = i32 3
.. _i_store:
@@ -4890,8 +5049,8 @@ Syntax:
::
- store [volatile] <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>] ; yields {void}
- store atomic [volatile] <ty> <value>, <ty>* <pointer> [singlethread] <ordering>, align <alignment> ; yields {void}
+ store [volatile] <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>] ; yields void
+ store atomic [volatile] <ty> <value>, <ty>* <pointer> [singlethread] <ordering>, align <alignment> ; yields void
Overview:
"""""""""
@@ -4955,9 +5114,9 @@ Example:
.. code-block:: llvm
- %ptr = alloca i32 ; yields {i32*}:ptr
- store i32 3, i32* %ptr ; yields {void}
- %val = load i32* %ptr ; yields {i32}:val = i32 3
+ %ptr = alloca i32 ; yields i32*:ptr
+ store i32 3, i32* %ptr ; yields void
+ %val = load i32* %ptr ; yields i32:val = i32 3
.. _i_fence:
@@ -4969,7 +5128,7 @@ Syntax:
::
- fence [singlethread] <ordering> ; yields {void}
+ fence [singlethread] <ordering> ; yields void
Overview:
"""""""""
@@ -5012,8 +5171,8 @@ Example:
.. code-block:: llvm
- fence acquire ; yields {void}
- fence singlethread seq_cst ; yields {void}
+ fence acquire ; yields void
+ fence singlethread seq_cst ; yields void
.. _i_cmpxchg:
@@ -5025,14 +5184,14 @@ Syntax:
::
- cmpxchg [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <success ordering> <failure ordering> ; yields {ty}
+ cmpxchg [weak] [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <success ordering> <failure ordering> ; yields { ty, i1 }
Overview:
"""""""""
The '``cmpxchg``' instruction is used to atomically modify memory. It
loads a value in memory and compares it to a given value. If they are
-equal, it stores a new value into the memory.
+equal, it tries to store a new value into the memory.
Arguments:
""""""""""
@@ -5049,10 +5208,10 @@ to modify the number or order of execution of this ``cmpxchg`` with
other :ref:`volatile operations <volatile>`.
The success and failure :ref:`ordering <ordering>` arguments specify how this
-``cmpxchg`` synchronizes with other atomic operations. The both ordering
-parameters must be at least ``monotonic``, the ordering constraint on failure
-must be no stronger than that on success, and the failure ordering cannot be
-either ``release`` or ``acq_rel``.
+``cmpxchg`` synchronizes with other atomic operations. Both ordering parameters
+must be at least ``monotonic``, the ordering constraint on failure must be no
+stronger than that on success, and the failure ordering cannot be either
+``release`` or ``acq_rel``.
The optional "``singlethread``" argument declares that the ``cmpxchg``
is only atomic with respect to code (usually signal handlers) running in
@@ -5065,10 +5224,17 @@ equal to the size in memory of the operand.
Semantics:
""""""""""
-The contents of memory at the location specified by the '``<pointer>``'
-operand is read and compared to '``<cmp>``'; if the read value is the
-equal, '``<new>``' is written. The original value at the location is
-returned.
+The contents of memory at the location specified by the '``<pointer>``' operand
+are read and compared to '``<cmp>``'; if the read value is equal, '``<new>``' is
+written. The original value at the location is returned, together with a flag
+indicating success (true) or failure (false).
+
+If the cmpxchg operation is marked as ``weak``, then a spurious failure is
+permitted: the operation may not write ``<new>`` even if the comparison
+matched.
+
+If the cmpxchg operation is strong (the default), the ``i1`` value is 1 if and
+only if the value loaded equals '``<cmp>``'.
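+
+For example, a minimal sketch of a ``weak`` exchange (the value names are
+hypothetical):
+
+.. code-block:: llvm
+
+ %pair = cmpxchg weak i32* %ptr, i32 %desired, i32 %new acq_rel monotonic ; yields { i32, i1 }
+ %loaded = extractvalue { i32, i1 } %pair, 0 ; the original value at %ptr
+ %ok = extractvalue { i32, i1 } %pair, 1 ; may be false even if %loaded equals %desired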
A successful ``cmpxchg`` is a read-modify-write instruction for the purpose of
identifying release sequences. A failed ``cmpxchg`` is equivalent to an atomic
@@ -5080,14 +5246,15 @@ Example:
.. code-block:: llvm
entry:
- %orig = atomic load i32* %ptr unordered ; yields {i32}
+ %orig = load atomic i32* %ptr unordered, align 4 ; yields i32
br label %loop
loop:
- %cmp = phi i32 [ %orig, %entry ], [%old, %loop]
+ %cmp = phi i32 [ %orig, %entry ], [%value_loaded, %loop]
%squared = mul i32 %cmp, %cmp
- %old = cmpxchg i32* %ptr, i32 %cmp, i32 %squared acq_rel monotonic ; yields {i32}
- %success = icmp eq i32 %cmp, %old
+ %val_success = cmpxchg i32* %ptr, i32 %cmp, i32 %squared acq_rel monotonic ; yields { i32, i1 }
+ %value_loaded = extractvalue { i32, i1 } %val_success, 0
+ %success = extractvalue { i32, i1 } %val_success, 1
br i1 %success, label %done, label %loop
done:
@@ -5103,7 +5270,7 @@ Syntax:
::
- atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [singlethread] <ordering> ; yields {ty}
+ atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [singlethread] <ordering> ; yields ty
Overview:
"""""""""
@@ -5164,7 +5331,7 @@ Example:
.. code-block:: llvm
- %old = atomicrmw add i32* %ptr, i32 1 acquire ; yields {i32}
+ %old = atomicrmw add i32* %ptr, i32 1 acquire ; yields i32
.. _i_getelementptr:
@@ -5898,7 +6065,7 @@ Syntax:
::
- <result> = icmp <cond> <ty> <op1>, <op2> ; yields {i1} or {<N x i1>}:result
+ <result> = icmp <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
Overview:
"""""""""
@@ -5989,7 +6156,7 @@ Syntax:
::
- <result> = fcmp <cond> <ty> <op1>, <op2> ; yields {i1} or {<N x i1>}:result
+ <result> = fcmp <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
Overview:
"""""""""
@@ -6241,7 +6408,7 @@ This instruction requires several arguments:
uses value of call or is void).
- Option ``-tailcallopt`` is enabled, or
``llvm::GuaranteedTailCallOpt`` is ``true``.
- - `Platform specific constraints are
+ - `Platform-specific constraints are
met. <CodeGenerator.html#tailcallopt>`_
#. The optional "cconv" marker indicates which :ref:`calling
@@ -6294,7 +6461,7 @@ Example:
call void %foo(i8 97 signext)
%struct.A = type { i32, i8 }
- %r = call %struct.A @foo() ; yields { 32, i8 }
+ %r = call %struct.A @foo() ; yields { i32, i8 }
%gr = extractvalue %struct.A %r, 0 ; yields i32
%gr1 = extractvalue %struct.A %r, 1 ; yields i8
%Z = call void @foo() noreturn ; indicates that %foo never returns normally
@@ -8456,7 +8623,7 @@ Examples:
.. code-block:: llvm
- %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields {float}:r2 = (a * b) + c
+ %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
Half Precision Floating Point Intrinsics
----------------------------------------
@@ -8484,7 +8651,7 @@ Syntax:
::
- declare i16 @llvm.convert.to.fp16(f32 %a)
+ declare i16 @llvm.convert.to.fp16(float %a)
Overview:
"""""""""
@@ -8512,7 +8679,7 @@ Examples:
.. code-block:: llvm
- %res = call i16 @llvm.convert.to.fp16(f32 %a)
+ %res = call i16 @llvm.convert.to.fp16(float %a)
store i16 %res, i16* @x, align 2
.. _int_convert_from_fp16:
@@ -8525,7 +8692,7 @@ Syntax:
::
- declare f32 @llvm.convert.from.fp16(i16 %a)
+ declare float @llvm.convert.from.fp16(i16 %a)
Overview:
"""""""""
@@ -8554,7 +8721,7 @@ Examples:
.. code-block:: llvm
%a = load i16* @x, align 2
- %res = call f32 @llvm.convert.from.fp16(i16 %a)
+ %res = call float @llvm.convert.from.fp16(i16 %a)
Debugger Intrinsics
-------------------
@@ -8675,7 +8842,7 @@ Semantics:
""""""""""
On some architectures the address of the code to be executed needs to be
-different to the address where the trampoline is actually stored. This
+different from the address where the trampoline is actually stored. This
intrinsic returns the executable address corresponding to ``tramp``
after performing the required machine specific adjustments. The pointer
returned can then be :ref:`bitcast and executed <int_trampoline>`.
@@ -8683,7 +8850,7 @@ returned can then be :ref:`bitcast and executed <int_trampoline>`.
Memory Use Markers
------------------
-This class of intrinsics exists to information about the lifetime of
+This class of intrinsics provides information about the lifetime of
memory objects and ranges where variables are immutable.
.. _int_lifestart:
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 11f1341..fccfd5f 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -50,7 +50,7 @@ C
Common Subexpression Elimination. An optimization that removes common
- subexpression compuation. For example ``(a+b)*(a+b)`` has two subexpressions
+ subexpression computation. For example ``(a+b)*(a+b)`` has two subexpressions
that are the same: ``(a+b)``. This optimization would perform the addition
- only once and then perform the multiply (but only if it's compulationally
+ only once and then perform the multiply (but only if it's computationally
correct/safe).
D
diff --git a/docs/Passes.rst b/docs/Passes.rst
index b51829d..9f40092 100644
--- a/docs/Passes.rst
+++ b/docs/Passes.rst
@@ -261,12 +261,6 @@ returns "I don't know" for alias queries. NoAA is unlike other alias analysis
implementations, in that it does not chain to a previous analysis. As such it
doesn't follow many of the rules that other alias analyses must.
-``-no-profile``: No Profile Information
----------------------------------------
-
-The default "no profile" implementation of the abstract ``ProfileInfo``
-interface.
-
``-postdomfrontier``: Post-Dominance Frontier Construction
----------------------------------------------------------
@@ -336,23 +330,6 @@ This pass is used to seek out all of the types in use by the program. Note
that this analysis explicitly does not include types only used by the symbol
table.
-``-profile-estimator``: Estimate profiling information
-------------------------------------------------------
-
-Profiling information that estimates the profiling information in a very crude
-and unimaginative way.
-
-``-profile-loader``: Load profile information from ``llvmprof.out``
--------------------------------------------------------------------
-
-A concrete implementation of profiling information that loads the information
-from a profile dump file.
-
-``-profile-verifier``: Verify profiling information
----------------------------------------------------
-
-Pass that checks profiling information for plausibility.
-
``-regions``: Detect single entry single exit regions
-----------------------------------------------------
@@ -626,24 +603,6 @@ where it is profitable, the loop could be transformed to count down to zero
Bottom-up inlining of functions into callees.
-``-insert-edge-profiling``: Insert instrumentation for edge profiling
----------------------------------------------------------------------
-
-This pass instruments the specified program with counters for edge profiling.
-Edge profiling can give a reasonable approximation of the hot paths through a
-program, and is used for a wide variety of program transformations.
-
-Note that this implementation is very naïve. It inserts a counter for *every*
-edge in the program, instead of using control flow information to prune the
-number of counters inserted.
-
-``-insert-optimal-edge-profiling``: Insert optimal instrumentation for edge profiling
--------------------------------------------------------------------------------------
-
-This pass instruments the specified program with counters for edge profiling.
-Edge profiling can give a reasonable approximation of the hot paths through a
-program, and is used for a wide variety of program transformations.
-
.. _passes-instcombine:
``-instcombine``: Combine redundant instructions
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 18b2817..8ac9afe 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -5,18 +5,29 @@ Code Reviews with Phabricator
.. contents::
:local:
-If you prefer to use a web user interface for code reviews,
-you can now submit your patches for Clang and LLVM at
-`LLVM's Phabricator`_.
+If you prefer to use a web user interface for code reviews, you can now submit
+your patches for Clang and LLVM at `LLVM's Phabricator`_ instance.
+
+While Phabricator is a useful tool for some, the relevant -commits mailing list
+is the system of record for all LLVM code review. The mailing list should be
+added as a subscriber on all reviews, and Phabricator users should be prepared
+to respond to free-form comments in mail sent to the commits list.
Sign up
-------
+To get started with Phabricator, navigate to `http://reviews.llvm.org`_ and
+click the power icon in the top right. You can register with a GitHub account,
+a Google account, or you can create your own profile.
+
+Make *sure* that the email address registered with Phabricator is subscribed
+to the relevant -commits mailing list. If you are not subscribed to the
+-commits list, all mail sent by Phabricator on your behalf will be held for
+moderation.
+
Note that if you use your Subversion user name as Phabricator user name,
Phabricator will automatically connect your submits to your Phabricator user in
the `Code Repository Browser`_.
-
Requesting a review via the command line
----------------------------------------
@@ -90,6 +101,15 @@ a change from Phabricator.
Committing a change
-------------------
+Arcanist can manage the commit transparently. It will retrieve the description,
+reviewers, the ``Differential Revision``, etc., from the review and commit it
+to the repository.
+
+::
+
+ arc patch D<Revision>
+ arc commit --revision D<Revision>
+
+
When committing an LLVM change that has been reviewed using
Phabricator, the convention is for the commit message to end with the
line:
@@ -113,6 +133,7 @@ Status
Please let us know whether you like it and what could be improved!
.. _LLVM's Phabricator: http://reviews.llvm.org
+.. _`http://reviews.llvm.org`: http://reviews.llvm.org
.. _Code Repository Browser: http://reviews.llvm.org/diffusion/
.. _Arcanist Quick Start: http://www.phabricator.com/docs/phabricator/article/Arcanist_Quick_Start.html
.. _Arcanist User Guide: http://www.phabricator.com/docs/phabricator/article/Arcanist_User_Guide.html
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 7e46ac4..a7b28b3 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -387,7 +387,8 @@ Fine grained debug info with ``DEBUG_TYPE`` and the ``-debug-only`` option
Sometimes you may find yourself in a situation where enabling ``-debug`` just
turns on **too much** information (such as when working on the code generator).
If you want to enable debug information with more fine-grained control, you
-define the ``DEBUG_TYPE`` macro and the ``-debug`` only option as follows:
+can define the ``DEBUG_TYPE`` macro and use the ``-debug-only`` option as
+follows:
.. code-block:: c++
@@ -545,14 +546,15 @@ methods. Within GDB, for example, you can usually use something like ``call
DAG.viewGraph()`` to pop up a window. Alternatively, you can sprinkle calls to
these functions in your code in places you want to debug.
-Getting this to work requires a small amount of configuration. On Unix systems
+Getting this to work requires a small amount of setup. On Unix systems
with X11, install the `graphviz <http://www.graphviz.org>`_ toolkit, and make
sure 'dot' and 'gv' are in your path. If you are running on Mac OS X, download
and install the Mac OS X `Graphviz program
<http://www.pixelglow.com/graphviz/>`_ and add
``/Applications/Graphviz.app/Contents/MacOS/`` (or wherever you install it) to
-your path. Once in your system and path are set up, rerun the LLVM configure
-script and rebuild LLVM to enable this functionality.
+your path. The programs need not be present when configuring, building, or
+running LLVM and can simply be installed when needed during an active debug
+session.
``SelectionDAG`` has been extended to make it easier to locate *interesting*
nodes in large complex graphs. From gdb, if you ``call DAG.setGraphColor(node,
@@ -1916,7 +1918,7 @@ which is a pointer to an integer on the run time stack.
*Inserting instructions*
-There are essentially two ways to insert an ``Instruction`` into an existing
+There are essentially three ways to insert an ``Instruction`` into an existing
sequence of instructions that form a ``BasicBlock``:
* Insertion into an explicit instruction list
@@ -1986,6 +1988,41 @@ sequence of instructions that form a ``BasicBlock``:
which is much cleaner, especially if you're creating a lot of instructions and
adding them to ``BasicBlock``\ s.
+* Insertion using an instance of ``IRBuilder``
+
+ Inserting several ``Instruction``\ s can be quite laborious using the previous
+ methods. The ``IRBuilder`` is a convenience class that can be used to add
+ several instructions to the end of a ``BasicBlock`` or before a particular
+ ``Instruction``. It also supports constant folding and renaming named
+ registers (see ``IRBuilder``'s template arguments).
+
+ The example below demonstrates a very simple use of the ``IRBuilder`` where
+ three instructions are inserted before the instruction ``pi``. The first two
+ instructions are Call instructions and the third instruction multiplies the
+ return values of the two calls.
+
+ .. code-block:: c++
+
+ Instruction *pi = ...;
+ IRBuilder<> Builder(pi);
+ CallInst* callOne = Builder.CreateCall(...);
+ CallInst* callTwo = Builder.CreateCall(...);
+ Value* result = Builder.CreateMul(callOne, callTwo);
+
+ The example below is similar to the above example except that the created
+ ``IRBuilder`` inserts instructions at the end of the ``BasicBlock`` ``pb``.
+
+ .. code-block:: c++
+
+ BasicBlock *pb = ...;
+ IRBuilder<> Builder(pb);
+ CallInst* callOne = Builder.CreateCall(...);
+ CallInst* callTwo = Builder.CreateCall(...);
+ Value* result = Builder.CreateMul(callOne, callTwo);
+
+ See :doc:`tutorial/LangImpl3` for a practical use of the ``IRBuilder``.
+
+
.. _schanges_deleting:
Deleting Instructions
@@ -2133,46 +2170,13 @@ compiler, consider compiling LLVM and LLVM-GCC in single-threaded mode, and
using the resultant compiler to build a copy of LLVM with multithreading
support.
-.. _startmultithreaded:
-
-Entering and Exiting Multithreaded Mode
----------------------------------------
-
-In order to properly protect its internal data structures while avoiding
-excessive locking overhead in the single-threaded case, the LLVM must intialize
-certain data structures necessary to provide guards around its internals. To do
-so, the client program must invoke ``llvm_start_multithreaded()`` before making
-any concurrent LLVM API calls. To subsequently tear down these structures, use
-the ``llvm_stop_multithreaded()`` call. You can also use the
-``llvm_is_multithreaded()`` call to check the status of multithreaded mode.
-
-Note that both of these calls must be made *in isolation*. That is to say that
-no other LLVM API calls may be executing at any time during the execution of
-``llvm_start_multithreaded()`` or ``llvm_stop_multithreaded``. It is the
-client's responsibility to enforce this isolation.
-
-The return value of ``llvm_start_multithreaded()`` indicates the success or
-failure of the initialization. Failure typically indicates that your copy of
-LLVM was built without multithreading support, typically because GCC atomic
-intrinsics were not found in your system compiler. In this case, the LLVM API
-will not be safe for concurrent calls. However, it *will* be safe for hosting
-threaded applications in the JIT, though :ref:`care must be taken
-<jitthreading>` to ensure that side exits and the like do not accidentally
-result in concurrent LLVM API calls.
-
.. _shutdown:
Ending Execution with ``llvm_shutdown()``
-----------------------------------------
When you are done using the LLVM APIs, you should call ``llvm_shutdown()`` to
-deallocate memory used for internal structures. This will also invoke
-``llvm_stop_multithreaded()`` if LLVM is operating in multithreaded mode. As
-such, ``llvm_shutdown()`` requires the same isolation guarantees as
-``llvm_stop_multithreaded()``.
-
-Note that, if you use scope-based shutdown, you can use the
-``llvm_shutdown_obj`` class, which calls ``llvm_shutdown()`` in its destructor.
+deallocate memory used for internal structures.
.. _managedstatic:
@@ -2180,20 +2184,11 @@ Lazy Initialization with ``ManagedStatic``
------------------------------------------
``ManagedStatic`` is a utility class in LLVM used to implement static
-initialization of static resources, such as the global type tables. Before the
-invocation of ``llvm_shutdown()``, it implements a simple lazy initialization
-scheme. Once ``llvm_start_multithreaded()`` returns, however, it uses
+initialization of static resources, such as the global type tables. In a
+single-threaded environment, it implements a simple lazy initialization scheme.
+When LLVM is compiled with support for multi-threading, however, it uses
double-checked locking to implement thread-safe lazy initialization.
-Note that, because no other threads are allowed to issue LLVM API calls before
-``llvm_start_multithreaded()`` returns, it is possible to have
-``ManagedStatic``\ s of ``llvm::sys::Mutex``\ s.
-
-The ``llvm_acquire_global_lock()`` and ``llvm_release_global_lock`` APIs provide
-access to the global lock used to implement the double-checked locking for lazy
-initialization. These should only be used internally to LLVM, and only if you
-know what you're doing!
-
.. _llvmcontext:
Achieving Isolation with ``LLVMContext``
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 8dc1681..fb2e248 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -50,11 +50,19 @@ Non-comprehensive list of changes in this release
the ``-no-integrated-as`` option,
* llvm-ar now handles IR files like regular object files. In particular, a
- regular symbol table is created for symbols defined in IR files.
+ regular symbol table is created for symbols defined in IR files, including
+ those in file scope inline assembly.
* LLVM now always uses cfi directives for producing most stack
unwinding information.
+* The prefix for loop vectorizer hint metadata has been changed from
+ ``llvm.vectorizer`` to ``llvm.loop.vectorize``; see the metadata sketch
+ after this list.
+
+* Some backends previously implemented atomic NAND(x,y) as ``x & ~y``. Now
+ all backends implement it as ``~(x & y)``, matching the semantics of GCC 4.4
+ and later; see the IR sketch after this list.
+
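+As an illustration of the renamed prefix, here is a minimal metadata sketch
+(the metadata node numbers are hypothetical):
+
+.. code-block:: llvm
+
+ br i1 %cond, label %loop, label %exit, !llvm.loop !0
+
+ !0 = metadata !{metadata !0, metadata !1}
+ !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 4}
+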
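+As an illustration of the new NAND semantics, here is a minimal IR sketch
+(the names ``%ptr`` and ``%mask`` are hypothetical):
+
+.. code-block:: llvm
+
+ %old = atomicrmw nand i32* %ptr, i32 %mask acquire ; yields i32
+ ; memory at %ptr now holds ~(%old & %mask), not %old & ~%mask
+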
.. NOTE
For small 1-3 sentence descriptions, just add an entry at the end of
this list. If your description won't fit comfortably in one bullet
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index f957a7d..869d3a3 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -235,8 +235,8 @@ File descriptors
.. code-block:: llvm
!0 = metadata !{
- i32, ;; Tag = 41 (DW_TAG_file_type)
- metadata, ;; Source directory (including trailing slash) & file pair
+ i32, ;; Tag = 41 (DW_TAG_file_type)
+ metadata, ;; Source directory (including trailing slash) & file pair
}
These descriptors contain information for a file. Global variables and top
@@ -269,7 +269,7 @@ Global variable descriptors
metadata, ;; The static member declaration, if any
}
-These descriptors provide debug information about globals variables. They
+These descriptors provide debug information about global variables. They
provide details such as name, type and where the variable is defined. All
global variables are collected inside the named metadata ``!llvm.dbg.cu``.
@@ -297,7 +297,7 @@ Subprogram descriptors
;; derived class
i32, ;; Flags - Artificial, Private, Protected, Explicit, Prototyped.
i1, ;; isOptimized
- Function * , ;; Pointer to LLVM function
+ {}*, ;; Reference to the LLVM function
metadata, ;; Lists function template parameters
metadata, ;; Function declaration descriptor
metadata, ;; List of function variables
@@ -314,13 +314,13 @@ Block descriptors
.. code-block:: llvm
!3 = metadata !{
- i32, ;; Tag = 11 (DW_TAG_lexical_block)
- metadata,;; Source directory (including trailing slash) & file pair
- metadata,;; Reference to context descriptor
- i32, ;; Line number
- i32, ;; Column number
- i32, ;; DWARF path discriminator value
- i32 ;; Unique ID to identify blocks from a template function
+ i32, ;; Tag = 11 (DW_TAG_lexical_block)
+ metadata, ;; Source directory (including trailing slash) & file pair
+ metadata, ;; Reference to context descriptor
+ i32, ;; Line number
+ i32, ;; Column number
+ i32, ;; DWARF path discriminator value
+ i32 ;; Unique ID to identify blocks from a template function
}
This descriptor provides debug information about nested blocks within a
@@ -330,9 +330,9 @@ lexical blocks at same depth.
.. code-block:: llvm
!3 = metadata !{
- i32, ;; Tag = 11 (DW_TAG_lexical_block)
- metadata,;; Source directory (including trailing slash) & file pair
- metadata ;; Reference to the scope we're annotating with a file change
+ i32, ;; Tag = 11 (DW_TAG_lexical_block)
+ metadata, ;; Source directory (including trailing slash) & file pair
+ metadata ;; Reference to the scope we're annotating with a file change
}
This descriptor provides a wrapper around a lexical scope to handle file
@@ -528,9 +528,9 @@ Subrange descriptors
.. code-block:: llvm
!42 = metadata !{
- i32, ;; Tag = 33 (DW_TAG_subrange_type)
- i64, ;; Low value
- i64 ;; High value
+ i32, ;; Tag = 33 (DW_TAG_subrange_type)
+ i64, ;; Low value
+ i64 ;; High value
}
These descriptors are used to define ranges of array subscripts for an array
@@ -570,6 +570,7 @@ Local variables
metadata, ;; Reference to the type descriptor
i32, ;; flags
- metadata ;; (optional) Reference to inline location
+ metadata, ;; (optional) Reference to inline location
+ metadata ;; (optional) Reference to a complex expression (see below)
}
These descriptors are used to define variables local to a sub program. The
diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst
index f9222372..481be55 100644
--- a/docs/TestingGuide.rst
+++ b/docs/TestingGuide.rst
@@ -304,8 +304,7 @@ For instance, on ``test/CodeGen/ARM``, the ``lit.local.cfg`` is:
.. code-block:: python
config.suffixes = ['.ll', '.c', '.cpp', '.test']
- targets = set(config.root.targets_to_build.split())
- if not 'ARM' in targets:
+ if 'ARM' not in config.root.targets:
config.unsupported = True
Other platform-specific tests are those that depend on a specific feature
diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst
index 887ccaa..2b70217 100644
--- a/docs/Vectorizers.rst
+++ b/docs/Vectorizers.rst
@@ -51,6 +51,89 @@ Users can control the unroll factor using the command line flag "-force-vector-u
$ clang -mllvm -force-vector-unroll=2 ...
$ opt -loop-vectorize -force-vector-unroll=2 ...
+Pragma loop hint directives
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``#pragma clang loop`` directive allows loop vectorization hints to be
+specified for the subsequent for, while, do-while, or C++11 range-based for
+loop. The directive allows vectorization and interleaving to be enabled or
+disabled. Vector width as well as interleave count can also be manually
+specified. The following example explicitly enables vectorization and
+interleaving:
+
+.. code-block:: c++
+
+ #pragma clang loop vectorize(enable) interleave(enable)
+ while(...) {
+ ...
+ }
+
+The following example implicitly enables vectorization and interleaving by
+specifying a vector width and interleave count:
+
+.. code-block:: c++
+
+ #pragma clang loop vectorize_width(2) interleave_count(2)
+ for(...) {
+ ...
+ }
+
+See the Clang
+`language extensions
+<http://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations>`_
+for details.
+
+Diagnostics
+-----------
+
+Many loops cannot be vectorized, including loops with complicated control flow,
+unvectorizable types, and unvectorizable calls. The loop vectorizer generates
+optimization remarks which can be queried using command line options to identify
+and diagnose loops that are skipped by the loop vectorizer.
+
+Optimization remarks are enabled using:
+
+``-Rpass=loop-vectorize`` identifies loops that were successfully vectorized.
+
+``-Rpass-missed=loop-vectorize`` identifies loops that failed vectorization and
+indicates if vectorization was specified.
+
+``-Rpass-analysis=loop-vectorize`` identifies the statements that caused
+vectorization to fail.
+
+Consider the following loop:
+
+.. code-block:: c++
+
+ #pragma clang loop vectorize(enable)
+ for (int i = 0; i < Length; i++) {
+ switch(A[i]) {
+ case 0: A[i] = i*2; break;
+ case 1: A[i] = i; break;
+ default: A[i] = 0;
+ }
+ }
+
+The command line option ``-Rpass-missed=loop-vectorize`` prints the remark:
+
+.. code-block:: console
+
+ no_switch.cpp:4:5: remark: loop not vectorized: vectorization is explicitly enabled [-Rpass-missed=loop-vectorize]
+
+And the command line option ``-Rpass-analysis=loop-vectorize`` indicates that
+the switch statement cannot be vectorized:
+
+.. code-block:: console
+
+ no_switch.cpp:4:5: remark: loop not vectorized: loop contains a switch statement [-Rpass-analysis=loop-vectorize]
+ switch(A[i]) {
+ ^
+
+To ensure line and column numbers are produced, include the command line
+options ``-gline-tables-only`` and ``-gcolumn-info``. See the Clang `user manual
+<http://clang.llvm.org/docs/UsersManual.html#options-to-emit-optimization-reports>`_
+for details.
+
Features
--------
diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index f9cb4fe..cfbda04 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst
@@ -259,7 +259,6 @@ To see what happened to the other string you registered, try running
-hello - Hello World Pass
-indvars - Induction Variable Simplification
-inline - Function Integration/Inlining
- -insert-edge-profiling - Insert instrumentation for edge profiling
...
The pass name gets added as the information string for your pass, giving some
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index f37e3f8..8693a30 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -166,7 +166,8 @@ typedef enum {
LLVMCold = 1ULL << 34,
LLVMOptimizeNone = 1ULL << 35,
LLVMInAllocaAttribute = 1ULL << 36,
- LLVMNonNullAttribute = 1ULL << 37
+ LLVMNonNullAttribute = 1ULL << 37,
+ LLVMJumpTableAttribute = 1ULL << 38,
*/
} LLVMAttribute;
@@ -2847,16 +2848,13 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM);
* @{
*/
-/** Allocate and initialize structures needed to make LLVM safe for
- multithreading. The return value indicates whether multithreaded
- initialization succeeded. Must be executed in isolation from all
- other LLVM api calls.
- @see llvm::llvm_start_multithreaded */
+/** Deprecated: Multi-threading can only be enabled/disabled with the compile
+ time define LLVM_ENABLE_THREADS. This function always returns
+ LLVMIsMultithreaded(). */
LLVMBool LLVMStartMultithreaded(void);
-/** Deallocate structures necessary to make LLVM safe for multithreading.
- Must be executed in isolation from all other LLVM api calls.
- @see llvm::llvm_stop_multithreaded */
+/** Deprecated: Multi-threading can only be enabled/disabled with the compile
+ time define LLVM_ENABLE_THREADS. */
void LLVMStopMultithreaded(void);
/** Check whether LLVM is executing in thread-safe mode or not.
diff --git a/include/llvm-c/module.modulemap b/include/llvm-c/module.modulemap
index 2bcdbc1..a456119 100644
--- a/include/llvm-c/module.modulemap
+++ b/include/llvm-c/module.modulemap
@@ -1,5 +1,4 @@
module LLVM_C {
- requires cplusplus
umbrella "."
module * { export * }
}
diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index 053deff..ee34e9b 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h
@@ -56,7 +56,7 @@ public:
APInt::toString(Str, Radix, isSigned());
}
/// toString - Converts an APInt to a std::string. This is an inefficient
- /// method, your should prefer passing in a SmallString instead.
+ /// method; you should prefer passing in a SmallString instead.
std::string toString(unsigned Radix) const {
return APInt::toString(Radix, isSigned());
}
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index 1b64fee..0fff505 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -147,6 +147,12 @@ namespace llvm {
return ArrayRef<T>(data()+N, M);
}
+ /// \brief Drop the last \p N elements of the array.
+ ArrayRef<T> drop_back(unsigned N = 1) const {
+ assert(size() >= N && "Dropping more elements than exist");
+ return slice(0, size() - N);
+ }
+
/// @}
/// @name Operator Overloads
/// @{
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index da2b3ad..34e2284 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -34,6 +34,7 @@ class BitVector {
unsigned Capacity; // Size of allocated memory in BitWord.
public:
+ typedef unsigned size_type;
// Encapsulation of a single bit.
class reference {
friend class BitVector;
@@ -111,10 +112,10 @@ public:
bool empty() const { return Size == 0; }
/// size - Returns the number of bits in this bitvector.
- unsigned size() const { return Size; }
+ size_type size() const { return Size; }
/// count - Returns the number of bits which are set.
- unsigned count() const {
+ size_type count() const {
unsigned NumBits = 0;
for (unsigned i = 0; i < NumBitWords(size()); ++i)
if (sizeof(BitWord) == 4)
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 8269132..85f37b9 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -43,6 +43,7 @@ protected:
typedef std::pair<KeyT, ValueT> BucketT;
public:
+ typedef unsigned size_type;
typedef KeyT key_type;
typedef ValueT mapped_type;
typedef BucketT value_type;
@@ -70,7 +71,7 @@ public:
unsigned size() const { return getNumEntries(); }
/// Grow the densemap so that it has at least Size buckets. Does not shrink
- void resize(size_t Size) {
+ void resize(size_type Size) {
if (Size > getNumBuckets())
grow(Size);
}
@@ -99,10 +100,10 @@ public:
setNumTombstones(0);
}
- /// count - Return true if the specified key is in the map.
- bool count(const KeyT &Val) const {
+ /// Return 1 if the specified key is in the map, 0 otherwise.
+ size_type count(const KeyT &Val) const {
const BucketT *TheBucket;
- return LookupBucketFor(Val, TheBucket);
+ return LookupBucketFor(Val, TheBucket) ? 1 : 0;
}
iterator find(const KeyT &Val) {
diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 1d8c39c..37a81b0 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -29,11 +29,12 @@ class DenseSet {
public:
typedef ValueT key_type;
typedef ValueT value_type;
+ typedef unsigned size_type;
explicit DenseSet(unsigned NumInitBuckets = 0) : TheMap(NumInitBuckets) {}
bool empty() const { return TheMap.empty(); }
- unsigned size() const { return TheMap.size(); }
+ size_type size() const { return TheMap.size(); }
size_t getMemorySize() const { return TheMap.getMemorySize(); }
/// Grow the DenseSet so that it has at least Size buckets. Will not shrink
@@ -44,7 +45,8 @@ public:
TheMap.clear();
}
- bool count(const ValueT &V) const {
+ /// Return 1 if the specified key is in the set, 0 otherwise.
+ size_type count(const ValueT &V) const {
return TheMap.count(V);
}
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 9b7ee85..14c5933 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -794,6 +794,14 @@ template<typename T> struct FoldingSetTrait<T*> {
ID.AddPointer(X);
}
};
+template <typename T1, typename T2>
+struct FoldingSetTrait<std::pair<T1, T2>> {
+ static inline void Profile(const std::pair<T1, T2> &P,
+ llvm::FoldingSetNodeID &ID) {
+ ID.Add(P.first);
+ ID.Add(P.second);
+ }
+};
} // End of namespace llvm.
#endif
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index b11e3c1..abf02b8 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -152,7 +152,7 @@ inline uint64_t fetch64(const char *p) {
uint64_t result;
memcpy(&result, p, sizeof(result));
if (sys::IsBigEndianHost)
- return sys::SwapByteOrder(result);
+ sys::swapByteOrder(result);
return result;
}
@@ -160,7 +160,7 @@ inline uint32_t fetch32(const char *p) {
uint32_t result;
memcpy(&result, p, sizeof(result));
if (sys::IsBigEndianHost)
- return sys::SwapByteOrder(result);
+ sys::swapByteOrder(result);
return result;
}
diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index cd1946c..f9df378 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -154,13 +154,13 @@ public:
}
template <class X>
- IntrusiveRefCntPtr(IntrusiveRefCntPtr<X>&& S) : Obj(S.getPtr()) {
+ IntrusiveRefCntPtr(IntrusiveRefCntPtr<X>&& S) : Obj(S.get()) {
S.Obj = 0;
}
template <class X>
IntrusiveRefCntPtr(const IntrusiveRefCntPtr<X>& S)
- : Obj(S.getPtr()) {
+ : Obj(S.get()) {
retain();
}
@@ -175,12 +175,9 @@ public:
T* operator->() const { return Obj; }
- T* getPtr() const { return Obj; }
+ T* get() const { return Obj; }
- typedef T* (IntrusiveRefCntPtr::*unspecified_bool_type) () const;
- operator unspecified_bool_type() const {
- return Obj ? &IntrusiveRefCntPtr::getPtr : nullptr;
- }
+ LLVM_EXPLICIT operator bool() const { return Obj; }
void swap(IntrusiveRefCntPtr& other) {
T* tmp = other.Obj;
@@ -206,42 +203,62 @@ public:
inline bool operator==(const IntrusiveRefCntPtr<T>& A,
const IntrusiveRefCntPtr<U>& B)
{
- return A.getPtr() == B.getPtr();
+ return A.get() == B.get();
}
template<class T, class U>
inline bool operator!=(const IntrusiveRefCntPtr<T>& A,
const IntrusiveRefCntPtr<U>& B)
{
- return A.getPtr() != B.getPtr();
+ return A.get() != B.get();
}
template<class T, class U>
inline bool operator==(const IntrusiveRefCntPtr<T>& A,
U* B)
{
- return A.getPtr() == B;
+ return A.get() == B;
}
template<class T, class U>
inline bool operator!=(const IntrusiveRefCntPtr<T>& A,
U* B)
{
- return A.getPtr() != B;
+ return A.get() != B;
}
template<class T, class U>
inline bool operator==(T* A,
const IntrusiveRefCntPtr<U>& B)
{
- return A == B.getPtr();
+ return A == B.get();
}
template<class T, class U>
inline bool operator!=(T* A,
const IntrusiveRefCntPtr<U>& B)
{
- return A != B.getPtr();
+ return A != B.get();
+ }
+
+ template <class T>
+ bool operator==(std::nullptr_t A, const IntrusiveRefCntPtr<T> &B) {
+ return !B;
+ }
+
+ template <class T>
+ bool operator==(const IntrusiveRefCntPtr<T> &A, std::nullptr_t B) {
+ return B == A;
+ }
+
+ template <class T>
+ bool operator!=(std::nullptr_t A, const IntrusiveRefCntPtr<T> &B) {
+ return !(A == B);
+ }
+
+ template <class T>
+ bool operator!=(const IntrusiveRefCntPtr<T> &A, std::nullptr_t B) {
+ return !(A == B);
}
//===----------------------------------------------------------------------===//
@@ -251,14 +268,14 @@ public:
template<class T> struct simplify_type<IntrusiveRefCntPtr<T> > {
typedef T* SimpleType;
static SimpleType getSimplifiedValue(IntrusiveRefCntPtr<T>& Val) {
- return Val.getPtr();
+ return Val.get();
}
};
template<class T> struct simplify_type<const IntrusiveRefCntPtr<T> > {
typedef /*const*/ T* SimpleType;
static SimpleType getSimplifiedValue(const IntrusiveRefCntPtr<T>& Val) {
- return Val.getPtr();
+ return Val.get();
}
};
diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
index 7fd1570..2eae22c 100644
--- a/include/llvm/ADT/MapVector.h
+++ b/include/llvm/ADT/MapVector.h
@@ -29,7 +29,7 @@ template<typename KeyT, typename ValueT,
typename MapType = llvm::DenseMap<KeyT, unsigned>,
typename VectorType = std::vector<std::pair<KeyT, ValueT> > >
class MapVector {
- typedef typename VectorType::size_type SizeType;
+ typedef typename VectorType::size_type size_type;
MapType Map;
VectorType Vector;
@@ -38,7 +38,7 @@ public:
typedef typename VectorType::iterator iterator;
typedef typename VectorType::const_iterator const_iterator;
- SizeType size() const {
+ size_type size() const {
return Vector.size();
}
@@ -100,7 +100,7 @@ public:
return std::make_pair(begin() + I, false);
}
- unsigned count(const KeyT &Key) const {
+ size_type count(const KeyT &Key) const {
typename MapType::const_iterator Pos = Map.find(Key);
return Pos == Map.end()? 0 : 1;
}
@@ -123,6 +123,15 @@ public:
Map.erase(Pos);
Vector.pop_back();
}
+
+ /// \brief Remove the element given by Iterator.
+ /// Returns an iterator to the element following the one which was removed,
+ /// which may be end().
+ typename VectorType::iterator erase(typename VectorType::iterator Iterator) {
+ typename MapType::iterator MapIterator = Map.find(Iterator->first);
+ Map.erase(MapIterator);
+ return Vector.erase(Iterator);
+ }
};
}
diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h
deleted file mode 100644
index 5e83358..0000000
--- a/include/llvm/ADT/OwningPtr.h
+++ /dev/null
@@ -1,165 +0,0 @@
-//===- llvm/ADT/OwningPtr.h - Smart ptr that owns the pointee ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines and implements the OwningPtr class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ADT_OWNINGPTR_H
-#define LLVM_ADT_OWNINGPTR_H
-
-#include "llvm/Support/Compiler.h"
-#include <cassert>
-#include <cstddef>
-#include <memory>
-
-namespace llvm {
-
-/// OwningPtr smart pointer - OwningPtr mimics a built-in pointer except that it
-/// guarantees deletion of the object pointed to, either on destruction of the
-/// OwningPtr or via an explicit reset(). Once created, ownership of the
-/// pointee object can be taken away from OwningPtr by using the take method.
-template<class T>
-class OwningPtr {
- OwningPtr(OwningPtr const &) LLVM_DELETED_FUNCTION;
- OwningPtr &operator=(OwningPtr const &) LLVM_DELETED_FUNCTION;
- T *Ptr;
-public:
- explicit OwningPtr(T *P = 0) : Ptr(P) {}
-
- OwningPtr(OwningPtr &&Other) : Ptr(Other.take()) {}
-
- OwningPtr &operator=(OwningPtr &&Other) {
- reset(Other.take());
- return *this;
- }
-
- OwningPtr(std::unique_ptr<T> Other) : Ptr(Other.release()) {}
-
- OwningPtr &operator=(std::unique_ptr<T> Other) {
- reset(Other.release());
- return *this;
- }
-
-#if LLVM_HAS_RVALUE_REFERENCE_THIS
- operator std::unique_ptr<T>() && { return std::unique_ptr<T>(take()); }
-#endif
-
- ~OwningPtr() {
- delete Ptr;
- }
-
- /// reset - Change the current pointee to the specified pointer. Note that
- /// calling this with any pointer (including a null pointer) deletes the
- /// current pointer.
- void reset(T *P = 0) {
- if (P == Ptr) return;
- T *Tmp = Ptr;
- Ptr = P;
- delete Tmp;
- }
-
- /// take - Reset the owning pointer to null and return its pointer. This does
- /// not delete the pointer before returning it.
- T *take() {
- T *Tmp = Ptr;
- Ptr = nullptr;
- return Tmp;
- }
-
- T *release() { return take(); }
-
- std::unique_ptr<T> take_unique() { return std::unique_ptr<T>(take()); }
-
- T &operator*() const {
- assert(Ptr && "Cannot dereference null pointer");
- return *Ptr;
- }
-
- T *operator->() const { return Ptr; }
- T *get() const { return Ptr; }
- LLVM_EXPLICIT operator bool() const { return Ptr != nullptr; }
- bool operator!() const { return Ptr == nullptr; }
- bool isValid() const { return Ptr != nullptr; }
-
- void swap(OwningPtr &RHS) {
- T *Tmp = RHS.Ptr;
- RHS.Ptr = Ptr;
- Ptr = Tmp;
- }
-};
-
-template<class T>
-inline void swap(OwningPtr<T> &a, OwningPtr<T> &b) {
- a.swap(b);
-}
-
-/// OwningArrayPtr smart pointer - OwningArrayPtr provides the same
-/// functionality as OwningPtr, except that it works for array types.
-template<class T>
-class OwningArrayPtr {
- OwningArrayPtr(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
- OwningArrayPtr &operator=(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
- T *Ptr;
-public:
- explicit OwningArrayPtr(T *P = 0) : Ptr(P) {}
-
- OwningArrayPtr(OwningArrayPtr &&Other) : Ptr(Other.take()) {}
-
- OwningArrayPtr &operator=(OwningArrayPtr &&Other) {
- reset(Other.take());
- return *this;
- }
-
- ~OwningArrayPtr() {
- delete [] Ptr;
- }
-
- /// reset - Change the current pointee to the specified pointer. Note that
- /// calling this with any pointer (including a null pointer) deletes the
- /// current pointer.
- void reset(T *P = 0) {
- if (P == Ptr) return;
- T *Tmp = Ptr;
- Ptr = P;
- delete [] Tmp;
- }
-
- /// take - Reset the owning pointer to null and return its pointer. This does
- /// not delete the pointer before returning it.
- T *take() {
- T *Tmp = Ptr;
- Ptr = 0;
- return Tmp;
- }
-
- T &operator[](std::ptrdiff_t i) const {
- assert(Ptr && "Cannot dereference null pointer");
- return Ptr[i];
- }
-
- T *get() const { return Ptr; }
- LLVM_EXPLICIT operator bool() const { return Ptr != 0; }
- bool operator!() const { return Ptr == nullptr; }
-
- void swap(OwningArrayPtr &RHS) {
- T *Tmp = RHS.Ptr;
- RHS.Ptr = Ptr;
- Ptr = Tmp;
- }
-};
-
-template<class T>
-inline void swap(OwningArrayPtr<T> &a, OwningArrayPtr<T> &b) {
- a.swap(b);
-}
-
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index 3cc7738..02a6ea3 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -148,6 +148,7 @@ public:
/// ScopeTy - This is a helpful typedef that allows clients to get easy access
/// to the name of the scope for this hash table.
typedef ScopedHashTableScope<K, V, KInfo, AllocatorTy> ScopeTy;
+ typedef unsigned size_type;
private:
typedef ScopedHashTableVal<K, V> ValTy;
DenseMap<K, ValTy*, KInfo> TopLevelMap;
@@ -170,7 +171,8 @@ public:
AllocatorTy &getAllocator() { return Allocator; }
const AllocatorTy &getAllocator() const { return Allocator; }
- bool count(const K &Key) const {
+ /// Return 1 if the specified key is in the table, 0 otherwise.
+ size_type count(const K &Key) const {
return TopLevelMap.count(Key);
}
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index e965bc4..0922017 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -54,6 +54,7 @@ class SmallBitVector {
};
public:
+ typedef unsigned size_type;
// Encapsulation of a single bit.
class reference {
SmallBitVector &TheVector;
@@ -173,7 +174,7 @@ public:
}
/// count - Returns the number of bits which are set.
- unsigned count() const {
+ size_type count() const {
if (isSmall()) {
uintptr_t Bits = getSmallBits();
if (NumBaseBits == 32)
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 67104f3..74f3fd4 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -73,8 +73,9 @@ protected:
~SmallPtrSetImplBase();
public:
+ typedef unsigned size_type;
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return size() == 0; }
- unsigned size() const { return NumElements; }
+ size_type size() const { return NumElements; }
void clear() {
// If the capacity of the array is huge, and the # elements used is small,
@@ -263,7 +264,7 @@ public:
}
/// count - Return 1 if the specified pointer is in the set, 0 otherwise.
- unsigned count(PtrType Ptr) const {
+ size_type count(PtrType Ptr) const {
return count_imp(PtrTraits::getAsVoidPointer(Ptr)) ? 1 : 0;
}
diff --git a/include/llvm/ADT/SmallSet.h b/include/llvm/ADT/SmallSet.h
index 6f36234..bb1971e 100644
--- a/include/llvm/ADT/SmallSet.h
+++ b/include/llvm/ADT/SmallSet.h
@@ -37,18 +37,19 @@ class SmallSet {
typedef typename SmallVector<T, N>::const_iterator VIterator;
typedef typename SmallVector<T, N>::iterator mutable_iterator;
public:
+ typedef size_t size_type;
SmallSet() {}
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const {
return Vector.empty() && Set.empty();
}
- unsigned size() const {
+ size_type size() const {
return isSmall() ? Vector.size() : Set.size();
}
/// count - Return 1 if the element is in the set, 0 otherwise.
- unsigned count(const T &V) const {
+ size_type count(const T &V) const {
if (isSmall()) {
// Since the collection is small, just do a linear search.
return vfind(V) == Vector.end() ? 0 : 1;
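
Since count() is documented to return only 0 or 1 for the set-like containers in this patch, it doubles as a membership test; a small usage sketch:

    #include "llvm/ADT/SmallSet.h"
    using namespace llvm;

    bool recordOnce(SmallSet<int, 8> &Seen, int V) {
      if (Seen.count(V)) // count() returns 0 or 1 for set-like containers
        return false;    // already present
      Seen.insert(V);
      return true;
    }
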
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index dcf0354..82538e9 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -380,7 +380,8 @@ public:
} else if (N > this->size()) {
if (this->capacity() < N)
this->grow(N);
- std::uninitialized_fill(this->end(), this->begin()+N, T());
+ for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
+ new (&*I) T();
this->setEnd(this->begin()+N);
}
}
@@ -488,9 +489,9 @@ public:
}
::new ((void*) this->end()) T(::std::move(this->back()));
- this->setEnd(this->end()+1);
// Push everything else over.
this->move_backward(I, this->end()-1, this->end());
+ this->setEnd(this->end()+1);
// If we just moved the element we're inserting, be sure to update
// the reference.
@@ -516,10 +517,10 @@ public:
this->grow();
I = this->begin()+EltNo;
}
- ::new ((void*) this->end()) T(this->back());
- this->setEnd(this->end()+1);
+ ::new ((void*) this->end()) T(std::move(this->back()));
// Push everything else over.
this->move_backward(I, this->end()-1, this->end());
+ this->setEnd(this->end()+1);
// If we just moved the element we're inserting, be sure to update
// the reference.
@@ -555,7 +556,8 @@ public:
// reallocate the vector.
if (size_t(this->end()-I) >= NumToInsert) {
T *OldEnd = this->end();
- append(this->end()-NumToInsert, this->end());
+ append(std::move_iterator<iterator>(this->end() - NumToInsert),
+ std::move_iterator<iterator>(this->end()));
// Copy the existing elements that get replaced.
this->move_backward(I, OldEnd-NumToInsert, OldEnd);
@@ -608,7 +610,8 @@ public:
// reallocate the vector.
if (size_t(this->end()-I) >= NumToInsert) {
T *OldEnd = this->end();
- append(this->end()-NumToInsert, this->end());
+ append(std::move_iterator<iterator>(this->end() - NumToInsert),
+ std::move_iterator<iterator>(this->end()));
// Copy the existing elements that get replaced.
this->move_backward(I, OldEnd-NumToInsert, OldEnd);
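
Two things changed in this file: resize() now value-initializes new elements with placement new instead of std::uninitialized_fill (so move-only element types no longer need a copy constructor), and the insert() paths append via std::move_iterator instead of copying. Outside of SmallVector the moving-append idiom looks like this (illustrative sketch using std::vector as a stand-in):

    #include <iterator>
    #include <string>
    #include <vector>

    void moveAppend(std::vector<std::string> &Dst, std::vector<std::string> &Src) {
      // Wrapping the iterators turns the append into a move of each string
      // rather than a copy, mirroring the append() change above.
      Dst.insert(Dst.end(), std::make_move_iterator(Src.begin()),
                 std::make_move_iterator(Src.end()));
      Src.clear(); // moved-from strings are valid but unspecified
    }

The setEnd() reorderings matter too: end() must still point at the old end while move_backward runs, so the element just move-constructed into the new last slot is not shifted again and overwritten with a moved-from value.
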
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 706f248..36754d6 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -45,6 +45,7 @@ struct SparseBitVectorElement
: public ilist_node<SparseBitVectorElement<ElementSize> > {
public:
typedef unsigned long BitWord;
+ typedef unsigned size_type;
enum {
BITWORD_SIZE = sizeof(BitWord) * CHAR_BIT,
BITWORDS_PER_ELEMENT = (ElementSize + BITWORD_SIZE - 1) / BITWORD_SIZE,
@@ -120,7 +121,7 @@ public:
return Bits[Idx / BITWORD_SIZE] & (1L << (Idx % BITWORD_SIZE));
}
- unsigned count() const {
+ size_type count() const {
unsigned NumBits = 0;
for (unsigned i = 0; i < BITWORDS_PER_ELEMENT; ++i)
if (sizeof(BitWord) == 4)
diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h
index d2b2f8d..dc1273e 100644
--- a/include/llvm/ADT/SparseMultiSet.h
+++ b/include/llvm/ADT/SparseMultiSet.h
@@ -185,6 +185,7 @@ public:
typedef const ValueT &const_reference;
typedef ValueT *pointer;
typedef const ValueT *const_pointer;
+ typedef unsigned size_type;
SparseMultiSet()
: Sparse(nullptr), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) {}
@@ -327,7 +328,7 @@ public:
/// This is not the same as BitVector::size() which returns the size of the
/// universe.
///
- unsigned size() const {
+ size_type size() const {
assert(NumFree <= Dense.size() && "Out-of-bounds free entries");
return Dense.size() - NumFree;
}
@@ -378,7 +379,7 @@ public:
/// Returns the number of elements identified by Key. This will be linear in
/// the number of elements of that key.
- unsigned count(const KeyT &Key) const {
+ size_type count(const KeyT &Key) const {
unsigned Ret = 0;
for (const_iterator It = find(Key); It != end(); ++It)
++Ret;
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 899f2e4..632d52a 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -124,6 +124,7 @@ class SparseSet {
typedef typename KeyFunctorT::argument_type KeyT;
typedef SmallVector<ValueT, 8> DenseT;
+ typedef unsigned size_type;
DenseT Dense;
SparseT *Sparse;
unsigned Universe;
@@ -186,7 +187,7 @@ public:
/// This is not the same as BitVector::size() which returns the size of the
/// universe.
///
- unsigned size() const { return Dense.size(); }
+ size_type size() const { return Dense.size(); }
/// clear - Clears the set. This is a very fast constant time operation.
///
@@ -231,7 +232,7 @@ public:
/// count - Returns 1 if this set contains an element identified by Key,
/// 0 otherwise.
///
- unsigned count(const KeyT &Key) const {
+ size_type count(const KeyT &Key) const {
return find(Key) == end() ? 0 : 1;
}
diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index ecac5dd..c40e5e2 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -64,7 +64,7 @@ protected:
}
StringMapImpl(unsigned InitSize, unsigned ItemSize);
- void RehashTable();
+ unsigned RehashTable(unsigned BucketNo = 0);
/// LookupBucketFor - Look up the bucket that the specified string should end
/// up in. If it already exists as a key in the map, the Item pointer for the
@@ -139,10 +139,10 @@ public:
/// Create - Create a StringMapEntry for the specified key and default
/// construct the value.
template<typename AllocatorTy, typename InitType>
- static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd,
+ static StringMapEntry *Create(StringRef Key,
AllocatorTy &Allocator,
InitType InitVal) {
- unsigned KeyLength = static_cast<unsigned>(KeyEnd-KeyStart);
+ unsigned KeyLength = Key.size();
// Allocate a new item with space for the string at the end and a null
// terminator.
@@ -158,27 +158,25 @@ public:
// Copy the string information.
char *StrBuffer = const_cast<char*>(NewItem->getKeyData());
- memcpy(StrBuffer, KeyStart, KeyLength);
+ memcpy(StrBuffer, Key.data(), KeyLength);
StrBuffer[KeyLength] = 0; // Null terminate for convenience of clients.
return NewItem;
}
template<typename AllocatorTy>
- static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd,
- AllocatorTy &Allocator) {
- return Create(KeyStart, KeyEnd, Allocator, 0);
+ static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator) {
+ return Create(Key, Allocator, ValueTy());
}
/// Create - Create a StringMapEntry with normal malloc/free.
template<typename InitType>
- static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd,
- InitType InitVal) {
+ static StringMapEntry *Create(StringRef Key, InitType InitVal) {
MallocAllocator A;
- return Create(KeyStart, KeyEnd, A, std::move(InitVal));
+ return Create(Key, A, std::move(InitVal));
}
- static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd) {
- return Create(KeyStart, KeyEnd, ValueTy());
+ static StringMapEntry *Create(StringRef Key) {
+ return Create(Key, ValueTy());
}
/// GetStringMapEntryFromValue - Given a value that is known to be embedded
@@ -325,6 +323,28 @@ public:
return true;
}
+ /// insert - Inserts the specified key/value pair into the map if the key
+ /// isn't already in the map. The bool component of the returned pair is true
+ /// if and only if the insertion takes place, and the iterator component of
+ /// the pair points to the element with key equivalent to the key of the pair.
+ std::pair<iterator, bool> insert(std::pair<StringRef, ValueTy> KV) {
+ unsigned BucketNo = LookupBucketFor(KV.first);
+ StringMapEntryBase *&Bucket = TheTable[BucketNo];
+ if (Bucket && Bucket != getTombstoneVal())
+ return std::make_pair(iterator(TheTable + BucketNo, false),
+ false); // Already exists in map.
+
+ if (Bucket == getTombstoneVal())
+ --NumTombstones;
+ Bucket =
+ MapEntryTy::Create(KV.first, Allocator, std::move(KV.second));
+ ++NumItems;
+ assert(NumItems + NumTombstones <= NumBuckets);
+
+ BucketNo = RehashTable(BucketNo);
+ return std::make_pair(iterator(TheTable + BucketNo, false), true);
+ }
+
// clear - Empties out the StringMap
void clear() {
if (empty()) return;
@@ -348,25 +368,7 @@ public:
/// return.
template <typename InitTy>
MapEntryTy &GetOrCreateValue(StringRef Key, InitTy Val) {
- unsigned BucketNo = LookupBucketFor(Key);
- StringMapEntryBase *&Bucket = TheTable[BucketNo];
- if (Bucket && Bucket != getTombstoneVal())
- return *static_cast<MapEntryTy*>(Bucket);
-
- MapEntryTy *NewItem =
- MapEntryTy::Create(Key.begin(), Key.end(), Allocator, std::move(Val));
-
- if (Bucket == getTombstoneVal())
- --NumTombstones;
- ++NumItems;
- assert(NumItems + NumTombstones <= NumBuckets);
-
- // Fill in the bucket for the hash table. The FullHashValue was already
- // filled in by LookupBucketFor.
- Bucket = NewItem;
-
- RehashTable();
- return *NewItem;
+ return *insert(std::make_pair(Key, std::move(Val))).first;
}
MapEntryTy &GetOrCreateValue(StringRef Key) {
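
The new insert() mirrors std::map::insert, and GetOrCreateValue above collapses to a one-liner on top of it. Typical use (illustrative):

    #include "llvm/ADT/StringMap.h"
    using namespace llvm;

    void countWord(StringMap<unsigned> &Freq, StringRef Word) {
      // The bool is true iff a new entry was created; on failure the
      // iterator still points at the existing entry.
      auto Res = Freq.insert(std::make_pair(Word, 1u));
      if (!Res.second)
        ++Res.first->second;
    }
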
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 95f3380..2867a0e 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -76,7 +76,8 @@ public:
le32, // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten)
amdil, // amdil: amd IL
spir, // SPIR: standard portable IR for OpenCL 32-bit version
- spir64 // SPIR: standard portable IR for OpenCL 64-bit version
+ spir64, // SPIR: standard portable IR for OpenCL 64-bit version
+ kalimba // Kalimba: generic kalimba
};
enum VendorType {
UnknownVendor,
@@ -88,7 +89,9 @@ public:
BGQ,
Freescale,
IBM,
- NVIDIA
+ ImaginationTechnologies,
+ NVIDIA,
+ CSR
};
enum OSType {
UnknownOS,
@@ -350,6 +353,10 @@ public:
return getOS() == Triple::Win32 && getEnvironment() == Triple::MSVC;
}
+ bool isWindowsItaniumEnvironment() const {
+ return getOS() == Triple::Win32 && getEnvironment() == Triple::Itanium;
+ }
+
bool isWindowsCygwinEnvironment() const {
return getOS() == Triple::Cygwin ||
(getOS() == Triple::Win32 && getEnvironment() == Triple::Cygnus);
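
A small sketch of the new predicate in use; the triple string is a hypothetical example:

    #include "llvm/ADT/Triple.h"
    using namespace llvm;

    bool targetsWindowsItanium(StringRef TT) {
      // e.g. TT == "x86_64-pc-windows-itanium"
      return Triple(TT).isWindowsItaniumEnvironment();
    }
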
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index a54fd74..4be3ee6 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h
@@ -182,6 +182,10 @@ namespace llvm {
assert(isValid() && "Invalid twine!");
}
+ /// Since the intended use of twines is as temporary objects, assignment
+ /// during concatenation could cause undefined behavior or stack corruption.
+ Twine &operator=(const Twine &Other) LLVM_DELETED_FUNCTION;
+
/// isNull - Check for the null twine.
bool isNull() const {
return getLHSKind() == NullKind;
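
What the deleted assignment forbids, and the safe pattern, in a hypothetical snippet:

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void emit(StringRef A, StringRef B) {
      Twine T(A);
      // T = T + B;  // now ill-formed: the right-hand side is a temporary
      //             // whose storage dies at the semicolon, leaving T dangling.
      outs() << T + B; // safe: the concatenation lives for the full expression
    }
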
diff --git a/include/llvm/ADT/UniqueVector.h b/include/llvm/ADT/UniqueVector.h
index 2d02d1c..a9cb2f5 100644
--- a/include/llvm/ADT/UniqueVector.h
+++ b/include/llvm/ADT/UniqueVector.h
@@ -22,13 +22,18 @@ namespace llvm {
/// class should have an implementation of operator== and of operator<.
/// Entries can be fetched using operator[] with the entry ID.
template<class T> class UniqueVector {
+public:
+ typedef typename std::vector<T> VectorType;
+ typedef typename VectorType::iterator iterator;
+ typedef typename VectorType::const_iterator const_iterator;
+
private:
// Map - Used to handle the correspondence of entry to ID.
std::map<T, unsigned> Map;
// Vector - ID ordered vector of entries. Entries can be indexed by ID - 1.
//
- std::vector<T> Vector;
+ VectorType Vector;
public:
/// insert - Append entry to the vector if it doesn't already exist. Returns
@@ -68,6 +73,18 @@ public:
return Vector[ID - 1];
}
+ /// \brief Return an iterator to the start of the vector.
+ iterator begin() { return Vector.begin(); }
+
+ /// \brief Return an iterator to the start of the vector.
+ const_iterator begin() const { return Vector.begin(); }
+
+ /// \brief Return an iterator to the end of the vector.
+ iterator end() { return Vector.end(); }
+
+ /// \brief Return an iterator to the end of the vector.
+ const_iterator end() const { return Vector.end(); }
+
/// size - Returns the number of entries in the vector.
///
size_t size() const { return Vector.size(); }
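
The new begin()/end() pairs make UniqueVector usable in range-based for loops; a usage sketch:

    #include "llvm/ADT/UniqueVector.h"
    #include <string>
    using namespace llvm;

    unsigned totalLength(UniqueVector<std::string> &UV) {
      UV.insert("hello"); // IDs start at 1; re-inserting returns the same ID
      unsigned N = 0;
      for (const std::string &S : UV) // enabled by the iterators added here
        N += S.size();
      return N;
    }
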
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 8852866..79d52fc 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -274,6 +274,14 @@ public:
UnknownModRefBehavior = Anywhere | ModRef
};
+ /// Get the location associated with a pointer argument of a callsite.
+ /// The mask bits are set to indicate the allowed aliasing ModRef kinds.
+ /// Note that these mask bits do not necessarily account for the overall
+ /// behavior of the function, but rather only provide additional
+ /// per-argument information.
+ virtual Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+ ModRefResult &Mask);
+
/// getModRefBehavior - Return the behavior when calling the given call site.
virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index bd72d3e..7340801 100644
--- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -22,6 +22,7 @@
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ScaledNumber.h"
#include "llvm/Support/raw_ostream.h"
#include <deque>
#include <list>
@@ -32,676 +33,6 @@
//===----------------------------------------------------------------------===//
//
-// UnsignedFloat definition.
-//
-// TODO: Make this private to BlockFrequencyInfoImpl or delete.
-//
-//===----------------------------------------------------------------------===//
-namespace llvm {
-
-class UnsignedFloatBase {
-public:
- static const int32_t MaxExponent = 16383;
- static const int32_t MinExponent = -16382;
- static const int DefaultPrecision = 10;
-
- static void dump(uint64_t D, int16_t E, int Width);
- static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
- unsigned Precision);
- static std::string toString(uint64_t D, int16_t E, int Width,
- unsigned Precision);
- static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
- static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
- static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
-
- static std::pair<uint64_t, bool> splitSigned(int64_t N) {
- if (N >= 0)
- return std::make_pair(N, false);
- uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
- return std::make_pair(Unsigned, true);
- }
- static int64_t joinSigned(uint64_t U, bool IsNeg) {
- if (U > uint64_t(INT64_MAX))
- return IsNeg ? INT64_MIN : INT64_MAX;
- return IsNeg ? -int64_t(U) : int64_t(U);
- }
-
- static int32_t extractLg(const std::pair<int32_t, int> &Lg) {
- return Lg.first;
- }
- static int32_t extractLgFloor(const std::pair<int32_t, int> &Lg) {
- return Lg.first - (Lg.second > 0);
- }
- static int32_t extractLgCeiling(const std::pair<int32_t, int> &Lg) {
- return Lg.first + (Lg.second < 0);
- }
-
- static std::pair<uint64_t, int16_t> divide64(uint64_t L, uint64_t R);
- static std::pair<uint64_t, int16_t> multiply64(uint64_t L, uint64_t R);
-
- static int compare(uint64_t L, uint64_t R, int Shift) {
- assert(Shift >= 0);
- assert(Shift < 64);
-
- uint64_t L_adjusted = L >> Shift;
- if (L_adjusted < R)
- return -1;
- if (L_adjusted > R)
- return 1;
-
- return L > L_adjusted << Shift ? 1 : 0;
- }
-};
-
-/// \brief Simple representation of an unsigned floating point.
-///
-/// UnsignedFloat is a unsigned floating point number. It uses simple
-/// saturation arithmetic, and every operation is well-defined for every value.
-///
-/// The number is split into a signed exponent and unsigned digits. The number
-/// represented is \c getDigits()*2^getExponent(). In this way, the digits are
-/// much like the mantissa in the x87 long double, but there is no canonical
-/// form, so the same number can be represented by many bit representations
-/// (it's always in "denormal" mode).
-///
-/// UnsignedFloat is templated on the underlying integer type for digits, which
-/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
-///
-/// Unlike builtin floating point types, UnsignedFloat is portable.
-///
-/// Unlike APFloat, UnsignedFloat does not model architecture floating point
-/// behaviour (this should make it a little faster), and implements most
-/// operators (this makes it usable).
-///
-/// UnsignedFloat is totally ordered. However, there is no canonical form, so
-/// there are multiple representations of most scalars. E.g.:
-///
-/// UnsignedFloat(8u, 0) == UnsignedFloat(4u, 1)
-/// UnsignedFloat(4u, 1) == UnsignedFloat(2u, 2)
-/// UnsignedFloat(2u, 2) == UnsignedFloat(1u, 3)
-///
-/// UnsignedFloat implements most arithmetic operations. Precision is kept
-/// where possible. Uses simple saturation arithmetic, so that operations
-/// saturate to 0.0 or getLargest() rather than under or overflowing. It has
-/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
-/// Any other division by 0.0 is defined to be getLargest().
-///
-/// As a convenience for modifying the exponent, left and right shifting are
-/// both implemented, and both interpret negative shifts as positive shifts in
-/// the opposite direction.
-///
-/// Exponents are limited to the range accepted by x87 long double. This makes
-/// it trivial to add functionality to convert to APFloat (this is already
-/// relied on for the implementation of printing).
-///
-/// The current plan is to gut this and make the necessary parts of it (even
-/// more) private to BlockFrequencyInfo.
-template <class DigitsT> class UnsignedFloat : UnsignedFloatBase {
-public:
- static_assert(!std::numeric_limits<DigitsT>::is_signed,
- "only unsigned floats supported");
-
- typedef DigitsT DigitsType;
-
-private:
- typedef std::numeric_limits<DigitsType> DigitsLimits;
-
- static const int Width = sizeof(DigitsType) * 8;
- static_assert(Width <= 64, "invalid integer width for digits");
-
-private:
- DigitsType Digits;
- int16_t Exponent;
-
-public:
- UnsignedFloat() : Digits(0), Exponent(0) {}
-
- UnsignedFloat(DigitsType Digits, int16_t Exponent)
- : Digits(Digits), Exponent(Exponent) {}
-
-private:
- UnsignedFloat(const std::pair<uint64_t, int16_t> &X)
- : Digits(X.first), Exponent(X.second) {}
-
-public:
- static UnsignedFloat getZero() { return UnsignedFloat(0, 0); }
- static UnsignedFloat getOne() { return UnsignedFloat(1, 0); }
- static UnsignedFloat getLargest() {
- return UnsignedFloat(DigitsLimits::max(), MaxExponent);
- }
- static UnsignedFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
- static UnsignedFloat getInverseFloat(uint64_t N) {
- return getFloat(N).invert();
- }
- static UnsignedFloat getFraction(DigitsType N, DigitsType D) {
- return getQuotient(N, D);
- }
-
- int16_t getExponent() const { return Exponent; }
- DigitsType getDigits() const { return Digits; }
-
- /// \brief Convert to the given integer type.
- ///
- /// Convert to \c IntT using simple saturating arithmetic, truncating if
- /// necessary.
- template <class IntT> IntT toInt() const;
-
- bool isZero() const { return !Digits; }
- bool isLargest() const { return *this == getLargest(); }
- bool isOne() const {
- if (Exponent > 0 || Exponent <= -Width)
- return false;
- return Digits == DigitsType(1) << -Exponent;
- }
-
- /// \brief The log base 2, rounded.
- ///
- /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
- int32_t lg() const { return extractLg(lgImpl()); }
-
- /// \brief The log base 2, rounded towards INT32_MIN.
- ///
- /// Get the lg floor. lg 0 is defined to be INT32_MIN.
- int32_t lgFloor() const { return extractLgFloor(lgImpl()); }
-
- /// \brief The log base 2, rounded towards INT32_MAX.
- ///
- /// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
- int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }
-
- bool operator==(const UnsignedFloat &X) const { return compare(X) == 0; }
- bool operator<(const UnsignedFloat &X) const { return compare(X) < 0; }
- bool operator!=(const UnsignedFloat &X) const { return compare(X) != 0; }
- bool operator>(const UnsignedFloat &X) const { return compare(X) > 0; }
- bool operator<=(const UnsignedFloat &X) const { return compare(X) <= 0; }
- bool operator>=(const UnsignedFloat &X) const { return compare(X) >= 0; }
-
- bool operator!() const { return isZero(); }
-
- /// \brief Convert to a decimal representation in a string.
- ///
- /// Convert to a string. Uses scientific notation for very large/small
- /// numbers. Scientific notation is used roughly for numbers outside of the
- /// range 2^-64 through 2^64.
- ///
- /// \c Precision indicates the number of decimal digits of precision to use;
- /// 0 requests the maximum available.
- ///
- /// As a special case to make debugging easier, if the number is small enough
- /// to convert without scientific notation and has more than \c Precision
- /// digits before the decimal place, it's printed accurately to the first
- /// digit past zero. E.g., assuming 10 digits of precision:
- ///
- /// 98765432198.7654... => 98765432198.8
- /// 8765432198.7654... => 8765432198.8
- /// 765432198.7654... => 765432198.8
- /// 65432198.7654... => 65432198.77
- /// 5432198.7654... => 5432198.765
- std::string toString(unsigned Precision = DefaultPrecision) {
- return UnsignedFloatBase::toString(Digits, Exponent, Width, Precision);
- }
-
- /// \brief Print a decimal representation.
- ///
- /// Print a string. See toString for documentation.
- raw_ostream &print(raw_ostream &OS,
- unsigned Precision = DefaultPrecision) const {
- return UnsignedFloatBase::print(OS, Digits, Exponent, Width, Precision);
- }
- void dump() const { return UnsignedFloatBase::dump(Digits, Exponent, Width); }
-
- UnsignedFloat &operator+=(const UnsignedFloat &X);
- UnsignedFloat &operator-=(const UnsignedFloat &X);
- UnsignedFloat &operator*=(const UnsignedFloat &X);
- UnsignedFloat &operator/=(const UnsignedFloat &X);
- UnsignedFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; }
- UnsignedFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; }
-
-private:
- void shiftLeft(int32_t Shift);
- void shiftRight(int32_t Shift);
-
- /// \brief Adjust two floats to have matching exponents.
- ///
- /// Adjust \c this and \c X to have matching exponents. Returns the new \c X
- /// by value. Does nothing if \a isZero() for either.
- ///
- /// The value that compares smaller will lose precision, and possibly become
- /// \a isZero().
- UnsignedFloat matchExponents(UnsignedFloat X);
-
- /// \brief Increase exponent to match another float.
- ///
- /// Increases \c this to have an exponent matching \c X. May decrease the
- /// exponent of \c X in the process, and \c this may possibly become \a
- /// isZero().
- void increaseExponentToMatch(UnsignedFloat &X, int32_t ExponentDiff);
-
-public:
- /// \brief Scale a large number accurately.
- ///
- /// Scale N (multiply it by this). Uses full precision multiplication, even
- /// if Width is smaller than 64, so information is not lost.
- uint64_t scale(uint64_t N) const;
- uint64_t scaleByInverse(uint64_t N) const {
- // TODO: implement directly, rather than relying on inverse. Inverse is
- // expensive.
- return inverse().scale(N);
- }
- int64_t scale(int64_t N) const {
- std::pair<uint64_t, bool> Unsigned = splitSigned(N);
- return joinSigned(scale(Unsigned.first), Unsigned.second);
- }
- int64_t scaleByInverse(int64_t N) const {
- std::pair<uint64_t, bool> Unsigned = splitSigned(N);
- return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
- }
-
- int compare(const UnsignedFloat &X) const;
- int compareTo(uint64_t N) const {
- UnsignedFloat Float = getFloat(N);
- int Compare = compare(Float);
- if (Width == 64 || Compare != 0)
- return Compare;
-
- // Check for precision loss. We know *this == RoundTrip.
- uint64_t RoundTrip = Float.template toInt<uint64_t>();
- return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
- }
- int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
-
- UnsignedFloat &invert() { return *this = UnsignedFloat::getFloat(1) / *this; }
- UnsignedFloat inverse() const { return UnsignedFloat(*this).invert(); }
-
-private:
- static UnsignedFloat getProduct(DigitsType L, DigitsType R);
- static UnsignedFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
-
- std::pair<int32_t, int> lgImpl() const;
- static int countLeadingZerosWidth(DigitsType Digits) {
- if (Width == 64)
- return countLeadingZeros64(Digits);
- if (Width == 32)
- return countLeadingZeros32(Digits);
- return countLeadingZeros32(Digits) + Width - 32;
- }
-
- static UnsignedFloat adjustToWidth(uint64_t N, int32_t S) {
- assert(S >= MinExponent);
- assert(S <= MaxExponent);
- if (Width == 64 || N <= DigitsLimits::max())
- return UnsignedFloat(N, S);
-
- // Shift right.
- int Shift = 64 - Width - countLeadingZeros64(N);
- DigitsType Shifted = N >> Shift;
-
- // Round.
- assert(S + Shift <= MaxExponent);
- return getRounded(UnsignedFloat(Shifted, S + Shift),
- N & UINT64_C(1) << (Shift - 1));
- }
-
- static UnsignedFloat getRounded(UnsignedFloat P, bool Round) {
- if (!Round)
- return P;
- if (P.Digits == DigitsLimits::max())
- // Careful of overflow in the exponent.
- return UnsignedFloat(1, P.Exponent) <<= Width;
- return UnsignedFloat(P.Digits + 1, P.Exponent);
- }
-};
-
-#define UNSIGNED_FLOAT_BOP(op, base) \
- template <class DigitsT> \
- UnsignedFloat<DigitsT> operator op(const UnsignedFloat<DigitsT> &L, \
- const UnsignedFloat<DigitsT> &R) { \
- return UnsignedFloat<DigitsT>(L) base R; \
- }
-UNSIGNED_FLOAT_BOP(+, += )
-UNSIGNED_FLOAT_BOP(-, -= )
-UNSIGNED_FLOAT_BOP(*, *= )
-UNSIGNED_FLOAT_BOP(/, /= )
-UNSIGNED_FLOAT_BOP(<<, <<= )
-UNSIGNED_FLOAT_BOP(>>, >>= )
-#undef UNSIGNED_FLOAT_BOP
-
-template <class DigitsT>
-raw_ostream &operator<<(raw_ostream &OS, const UnsignedFloat<DigitsT> &X) {
- return X.print(OS, 10);
-}
-
-#define UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, T1, T2) \
- template <class DigitsT> \
- bool operator op(const UnsignedFloat<DigitsT> &L, T1 R) { \
- return L.compareTo(T2(R)) op 0; \
- } \
- template <class DigitsT> \
- bool operator op(T1 L, const UnsignedFloat<DigitsT> &R) { \
- return 0 op R.compareTo(T2(L)); \
- }
-#define UNSIGNED_FLOAT_COMPARE_TO(op) \
- UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \
- UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \
- UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t) \
- UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t)
-UNSIGNED_FLOAT_COMPARE_TO(< )
-UNSIGNED_FLOAT_COMPARE_TO(> )
-UNSIGNED_FLOAT_COMPARE_TO(== )
-UNSIGNED_FLOAT_COMPARE_TO(!= )
-UNSIGNED_FLOAT_COMPARE_TO(<= )
-UNSIGNED_FLOAT_COMPARE_TO(>= )
-#undef UNSIGNED_FLOAT_COMPARE_TO
-#undef UNSIGNED_FLOAT_COMPARE_TO_TYPE
-
-template <class DigitsT>
-uint64_t UnsignedFloat<DigitsT>::scale(uint64_t N) const {
- if (Width == 64 || N <= DigitsLimits::max())
- return (getFloat(N) * *this).template toInt<uint64_t>();
-
- // Defer to the 64-bit version.
- return UnsignedFloat<uint64_t>(Digits, Exponent).scale(N);
-}
-
-template <class DigitsT>
-UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getProduct(DigitsType L,
- DigitsType R) {
- // Check for zero.
- if (!L || !R)
- return getZero();
-
- // Check for numbers that we can compute with 64-bit math.
- if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX))
- return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
-
- // Do the full thing.
- return UnsignedFloat(multiply64(L, R));
-}
-template <class DigitsT>
-UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::getQuotient(DigitsType Dividend,
- DigitsType Divisor) {
- // Check for zero.
- if (!Dividend)
- return getZero();
- if (!Divisor)
- return getLargest();
-
- if (Width == 64)
- return UnsignedFloat(divide64(Dividend, Divisor));
-
- // We can compute this with 64-bit math.
- int Shift = countLeadingZeros64(Dividend);
- uint64_t Shifted = uint64_t(Dividend) << Shift;
- uint64_t Quotient = Shifted / Divisor;
-
- // If Quotient needs to be shifted, then adjustToWidth will round.
- if (Quotient > DigitsLimits::max())
- return adjustToWidth(Quotient, -Shift);
-
- // Round based on the value of the next bit.
- return getRounded(UnsignedFloat(Quotient, -Shift),
- Shifted % Divisor >= getHalf(Divisor));
-}
-
-template <class DigitsT>
-template <class IntT>
-IntT UnsignedFloat<DigitsT>::toInt() const {
- typedef std::numeric_limits<IntT> Limits;
- if (*this < 1)
- return 0;
- if (*this >= Limits::max())
- return Limits::max();
-
- IntT N = Digits;
- if (Exponent > 0) {
- assert(size_t(Exponent) < sizeof(IntT) * 8);
- return N << Exponent;
- }
- if (Exponent < 0) {
- assert(size_t(-Exponent) < sizeof(IntT) * 8);
- return N >> -Exponent;
- }
- return N;
-}
-
-template <class DigitsT>
-std::pair<int32_t, int> UnsignedFloat<DigitsT>::lgImpl() const {
- if (isZero())
- return std::make_pair(INT32_MIN, 0);
-
- // Get the floor of the lg of Digits.
- int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
-
- // Get the floor of the lg of this.
- int32_t Floor = Exponent + LocalFloor;
- if (Digits == UINT64_C(1) << LocalFloor)
- return std::make_pair(Floor, 0);
-
- // Round based on the next digit.
- assert(LocalFloor >= 1);
- bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
- return std::make_pair(Floor + Round, Round ? 1 : -1);
-}
-
-template <class DigitsT>
-UnsignedFloat<DigitsT> UnsignedFloat<DigitsT>::matchExponents(UnsignedFloat X) {
- if (isZero() || X.isZero() || Exponent == X.Exponent)
- return X;
-
- int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent);
- if (Diff > 0)
- increaseExponentToMatch(X, Diff);
- else
- X.increaseExponentToMatch(*this, -Diff);
- return X;
-}
-template <class DigitsT>
-void UnsignedFloat<DigitsT>::increaseExponentToMatch(UnsignedFloat &X,
- int32_t ExponentDiff) {
- assert(ExponentDiff > 0);
- if (ExponentDiff >= 2 * Width) {
- *this = getZero();
- return;
- }
-
- // Use up any leading zeros on X, and then shift this.
- int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
- assert(ShiftX < Width);
-
- int32_t ShiftThis = ExponentDiff - ShiftX;
- if (ShiftThis >= Width) {
- *this = getZero();
- return;
- }
-
- X.Digits <<= ShiftX;
- X.Exponent -= ShiftX;
- Digits >>= ShiftThis;
- Exponent += ShiftThis;
- return;
-}
-
-template <class DigitsT>
-UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
-operator+=(const UnsignedFloat &X) {
- if (isLargest() || X.isZero())
- return *this;
- if (isZero() || X.isLargest())
- return *this = X;
-
- // Normalize exponents.
- UnsignedFloat Scaled = matchExponents(X);
-
- // Check for zero again.
- if (isZero())
- return *this = Scaled;
- if (Scaled.isZero())
- return *this;
-
- // Compute sum.
- DigitsType Sum = Digits + Scaled.Digits;
- bool DidOverflow = Sum < Digits;
- Digits = Sum;
- if (!DidOverflow)
- return *this;
-
- if (Exponent == MaxExponent)
- return *this = getLargest();
-
- ++Exponent;
- Digits = UINT64_C(1) << (Width - 1) | Digits >> 1;
-
- return *this;
-}
-template <class DigitsT>
-UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
-operator-=(const UnsignedFloat &X) {
- if (X.isZero())
- return *this;
- if (*this <= X)
- return *this = getZero();
-
- // Normalize exponents.
- UnsignedFloat Scaled = matchExponents(X);
- assert(Digits >= Scaled.Digits);
-
- // Compute difference.
- if (!Scaled.isZero()) {
- Digits -= Scaled.Digits;
- return *this;
- }
-
- // Check if X just barely lost its last bit. E.g., for 32-bit:
- //
- // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
- if (*this == UnsignedFloat(1, X.lgFloor() + Width)) {
- Digits = DigitsType(0) - 1;
- --Exponent;
- }
- return *this;
-}
-template <class DigitsT>
-UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
-operator*=(const UnsignedFloat &X) {
- if (isZero())
- return *this;
- if (X.isZero())
- return *this = X;
-
- // Save the exponents.
- int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
-
- // Get the raw product.
- *this = getProduct(Digits, X.Digits);
-
- // Combine with exponents.
- return *this <<= Exponents;
-}
-template <class DigitsT>
-UnsignedFloat<DigitsT> &UnsignedFloat<DigitsT>::
-operator/=(const UnsignedFloat &X) {
- if (isZero())
- return *this;
- if (X.isZero())
- return *this = getLargest();
-
- // Save the exponents.
- int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);
-
- // Get the raw quotient.
- *this = getQuotient(Digits, X.Digits);
-
- // Combine with exponents.
- return *this <<= Exponents;
-}
-template <class DigitsT>
-void UnsignedFloat<DigitsT>::shiftLeft(int32_t Shift) {
- if (!Shift || isZero())
- return;
- assert(Shift != INT32_MIN);
- if (Shift < 0) {
- shiftRight(-Shift);
- return;
- }
-
- // Shift as much as we can in the exponent.
- int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
- Exponent += ExponentShift;
- if (ExponentShift == Shift)
- return;
-
- // Check this late, since it's rare.
- if (isLargest())
- return;
-
- // Shift the digits themselves.
- Shift -= ExponentShift;
- if (Shift > countLeadingZerosWidth(Digits)) {
- // Saturate.
- *this = getLargest();
- return;
- }
-
- Digits <<= Shift;
- return;
-}
-
-template <class DigitsT>
-void UnsignedFloat<DigitsT>::shiftRight(int32_t Shift) {
- if (!Shift || isZero())
- return;
- assert(Shift != INT32_MIN);
- if (Shift < 0) {
- shiftLeft(-Shift);
- return;
- }
-
- // Shift as much as we can in the exponent.
- int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
- Exponent -= ExponentShift;
- if (ExponentShift == Shift)
- return;
-
- // Shift the digits themselves.
- Shift -= ExponentShift;
- if (Shift >= Width) {
- // Saturate.
- *this = getZero();
- return;
- }
-
- Digits >>= Shift;
- return;
-}
-
-template <class DigitsT>
-int UnsignedFloat<DigitsT>::compare(const UnsignedFloat &X) const {
- // Check for zero.
- if (isZero())
- return X.isZero() ? 0 : -1;
- if (X.isZero())
- return 1;
-
- // Check for the scale. Use lgFloor to be sure that the exponent difference
- // is always lower than 64.
- int32_t lgL = lgFloor(), lgR = X.lgFloor();
- if (lgL != lgR)
- return lgL < lgR ? -1 : 1;
-
- // Compare digits.
- if (Exponent < X.Exponent)
- return UnsignedFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
-
- return -UnsignedFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
-}
-
-template <class T> struct isPodLike<UnsignedFloat<T>> {
- static const bool value = true;
-};
-}
-
-//===----------------------------------------------------------------------===//
-//
// BlockMass definition.
//
// TODO: Make this private to BlockFrequencyInfoImpl or delete.
@@ -770,11 +101,11 @@ public:
bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
- /// \brief Convert to floating point.
+ /// \brief Convert to scaled number.
///
- /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives
- /// slightly above 0.0.
- UnsignedFloat<uint64_t> toFloat() const;
+ /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
+ /// gives slightly above 0.0.
+ ScaledNumber<uint64_t> toScaled() const;
void dump() const;
raw_ostream &print(raw_ostream &OS) const;
@@ -837,7 +168,7 @@ template <class BT> struct BlockEdgesAdder;
/// BlockFrequencyInfoImpl. See there for details.
class BlockFrequencyInfoImplBase {
public:
- typedef UnsignedFloat<uint64_t> Float;
+ typedef ScaledNumber<uint64_t> Scaled64;
/// \brief Representative of a block.
///
@@ -866,7 +197,7 @@ public:
/// \brief Stats about a block itself.
struct FrequencyData {
- Float Floating;
+ Scaled64 Scaled;
uint64_t Integer;
};
@@ -884,7 +215,7 @@ public:
NodeList Nodes; ///< Header and the members of the loop.
BlockMass BackedgeMass; ///< Mass returned to loop header.
BlockMass Mass;
- Float Scale;
+ Scaled64 Scale;
LoopData(LoopData *Parent, const BlockNode &Header)
: Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
@@ -1131,7 +462,7 @@ public:
virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
void dump() const { print(dbgs()); }
- Float getFloatingBlockFreq(const BlockNode &Node) const;
+ Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
BlockFrequency getBlockFreq(const BlockNode &Node) const;
@@ -1310,7 +641,7 @@ void IrreducibleGraph::addEdges(const BlockNode &Node,
/// entries point to this block. Its successors are the headers, which split
/// the frequency evenly.
///
-/// This algorithm leverages BlockMass and UnsignedFloat to maintain precision,
+/// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
/// separates mass distribution from loop scaling, and dithers to eliminate
/// probability mass loss.
///
@@ -1568,7 +899,7 @@ public:
BlockFrequency getBlockFreq(const BlockT *BB) const {
return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
}
- Float getFloatingBlockFreq(const BlockT *BB) const {
+ Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
}
diff --git a/include/llvm/Analysis/JumpInstrTableInfo.h b/include/llvm/Analysis/JumpInstrTableInfo.h
new file mode 100644
index 0000000..54760aa
--- /dev/null
+++ b/include/llvm/Analysis/JumpInstrTableInfo.h
@@ -0,0 +1,60 @@
+//===-- JumpInstrTableInfo.h: Info for Jump-Instruction Tables --*- C++ -*-===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Information about jump-instruction tables that have been created by
+/// JumpInstrTables pass.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H
+#define LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Pass.h"
+
+#include <vector>
+
+namespace llvm {
+class Function;
+class FunctionType;
+
+/// This class stores information about jump-instruction tables created by the
+/// JumpInstrTables pass (in lib/CodeGen/JumpInstrTables.cpp). Each table is a
+/// map from a function type to a vector of pairs. The first element of each
+/// pair is the function that has the jumptable annotation. The second element
+/// is a function that was declared by JumpInstrTables and used to replace all
+/// address-taking sites for the original function.
+///
+/// The information in this pass is used in AsmPrinter
+/// (lib/CodeGen/AsmPrinter/AsmPrinter.cpp) to generate the required assembly
+/// for the jump-instruction tables.
+class JumpInstrTableInfo : public ImmutablePass {
+public:
+ static char ID;
+
+ JumpInstrTableInfo();
+ virtual ~JumpInstrTableInfo();
+ const char *getPassName() const override {
+ return "Jump-Instruction Table Info";
+ }
+
+ typedef std::pair<Function *, Function *> JumpPair;
+ typedef DenseMap<FunctionType *, std::vector<JumpPair> > JumpTables;
+
+ /// Inserts an entry in a table, adding the table if it doesn't exist.
+ void insertEntry(FunctionType *TableFunTy, Function *Target, Function *Jump);
+
+ /// Gets the tables.
+ const JumpTables &getTables() const { return Tables; }
+
+private:
+ JumpTables Tables;
+};
+}
+
+#endif /* LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H */
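
A sketch of how a consumer such as AsmPrinter might walk the stored tables; the function is hypothetical, but the types come straight from this header:

    #include "llvm/Analysis/JumpInstrTableInfo.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void dumpJumpTables(const JumpInstrTableInfo &JITI) {
      for (const auto &Table : JITI.getTables())   // FunctionType -> entries
        for (const JumpInstrTableInfo::JumpPair &P : Table.second)
          errs() << P.second->getName() << " jumps to " // the generated stub
                 << P.first->getName() << "\n";         // the original function
    }
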
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index 9494b7d..fd65ae5 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -142,6 +142,10 @@ namespace llvm {
// information and prints it with -analyze.
//
FunctionPass *createMemDepPrinter();
+
+ // createJumpInstrTableInfoPass - This creates a pass that stores information
+ // about the jump tables created by the JumpInstrTables pass.
+ ImmutablePass *createJumpInstrTableInfoPass();
}
#endif
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 82a788d..93a1a48 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -22,6 +22,16 @@
// itself is not, but in practice runtime seems to be in the order of magnitude
// of dominance tree calculation.
//
+// WARNING: LLVM is very concerned about compile time, so additional
+// analysis passes are kept out of the default optimization sequence
+// wherever possible. Specifically, if you do not strictly need RegionInfo
+// and dominance information would suffice, please base your work on the
+// dominator tree alone: most passes maintain it, so using it often has
+// near-zero cost. In contrast, RegionInfo is not available by default, is
+// not maintained by existing transformations, and there is no intention
+// to change that.
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_REGIONINFO_H
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 79fe1dc..f57f3eb 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -322,6 +322,7 @@ public:
enum ShuffleKind {
SK_Broadcast, ///< Broadcast element 0 to all other elements.
SK_Reverse, ///< Reverse the order of the vector.
+ SK_Alternate, ///< Choose alternate elements from vector.
SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
};
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index ce78967..83b5408 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -37,7 +37,10 @@ namespace llvm {
/// for all of the elements in the vector.
void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
const DataLayout *TD = nullptr, unsigned Depth = 0);
- void computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero);
+ /// Compute known bits from the range metadata.
+ /// \p KnownZero is set to the bits that are known to be zero.
+ void computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
+ APInt &KnownZero);
/// ComputeSignBit - Determine whether the sign bit is known to be zero or
/// one. Convenience wrapper around computeKnownBits.
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index 04c08ab..f7e30ef 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -71,7 +71,8 @@ namespace bitc {
// MODULE_CODE_PURGEVALS: [numvals]
MODULE_CODE_PURGEVALS = 10,
- MODULE_CODE_GCNAME = 11 // GCNAME: [strchr x N]
+ MODULE_CODE_GCNAME = 11, // GCNAME: [strchr x N]
+ MODULE_CODE_COMDAT = 12, // COMDAT: [selection_kind, name]
};
/// PARAMATTR blocks have code for defining a parameter attribute set.
@@ -372,7 +373,16 @@ namespace bitc {
ATTR_KIND_COLD = 36,
ATTR_KIND_OPTIMIZE_NONE = 37,
ATTR_KIND_IN_ALLOCA = 38,
- ATTR_KIND_NON_NULL = 39
+ ATTR_KIND_NON_NULL = 39,
+ ATTR_KIND_JUMP_TABLE = 40
+ };
+
+ enum ComdatSelectionKindCodes {
+ COMDAT_SELECTION_KIND_ANY = 1,
+ COMDAT_SELECTION_KIND_EXACT_MATCH = 2,
+ COMDAT_SELECTION_KIND_LARGEST = 3,
+ COMDAT_SELECTION_KIND_NO_DUPLICATES = 4,
+ COMDAT_SELECTION_KIND_SAME_SIZE = 5,
};
} // End bitc namespace
diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index 4c194a6..8cf5735 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h
@@ -41,14 +41,11 @@ namespace llvm {
LLVMContext &Context,
std::string *ErrMsg = nullptr);
- /// getBitcodeTargetTriple - Read the header of the specified bitcode
- /// buffer and extract just the triple information. If successful,
- /// this returns a string and *does not* take ownership
- /// of 'buffer'. On error, this returns "", and fills in *ErrMsg
- /// if ErrMsg is non-null.
+ /// Read the header of the specified bitcode buffer and extract just the
+ /// triple information. If successful, this returns a string and *does not*
+ /// take ownership of 'buffer'. On error, this returns "".
std::string getBitcodeTargetTriple(MemoryBuffer *Buffer,
- LLVMContext &Context,
- std::string *ErrMsg = nullptr);
+ LLVMContext &Context);
/// Read the specified bitcode file, returning the module.
/// This method *never* takes ownership of Buffer.
diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index c3aefd4..c5060fb 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file declares several CodeGen-specific LLVM IR analysis utilties.
+// This file declares several CodeGen-specific LLVM IR analysis utilities.
//
//===----------------------------------------------------------------------===//
@@ -86,7 +86,7 @@ ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred);
/// between it and the return.
///
/// This function only tests target-independent requirements.
-bool isInTailCallPosition(ImmutableCallSite CS, const TargetLowering &TLI);
+bool isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG);
/// Test if given that the input instruction is in the tail call position if the
/// return type or any attributes of the function will inhibit tail call
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index b53fb42..e1c9a14 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -52,7 +52,6 @@ class MCSubtargetInfo;
class MCSymbol;
class MDNode;
class DwarfDebug;
-class DwarfException;
class Mangler;
class TargetLoweringObjectFile;
class DataLayout;
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 2956ad8..449d934 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -202,6 +202,21 @@ FunctionSections("function-sections",
cl::desc("Emit functions into separate sections"),
cl::init(false));
+cl::opt<llvm::JumpTable::JumpTableType>
+JTableType("jump-table-type",
+ cl::desc("Choose the type of Jump-Instruction Table for jumptable."),
+ cl::init(JumpTable::Single),
+ cl::values(
+ clEnumValN(JumpTable::Single, "single",
+ "Create a single table for all jumptable functions"),
+ clEnumValN(JumpTable::Arity, "arity",
+ "Create one table per number of parameters."),
+ clEnumValN(JumpTable::Simplified, "simplified",
+ "Create one table per simplified function type."),
+ clEnumValN(JumpTable::Full, "full",
+ "Create one table per unique function type."),
+ clEnumValEnd));
+
// Common utility function tightly tied to the options listed here. Initializes
// a TargetOptions object with CodeGen flags and returns it.
static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
@@ -228,6 +243,7 @@ static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
Options.FunctionSections = FunctionSections;
Options.MCOptions = InitMCTargetOptionsFromFlags();
+ Options.JTType = JTableType;
return Options;
}
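
Any tool that includes CommandFlags.h picks the new flag up automatically; a minimal tool sketch (the check at the end is illustrative):

    #include "llvm/CodeGen/CommandFlags.h"
    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      // e.g. invoked as: mytool -jump-table-type=arity
      TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
      return Options.JTType == JumpTable::Arity ? 0 : 1;
    }
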
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index bfeede2..2bebae6 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -23,6 +23,7 @@ namespace llvm {
class AllocaInst;
class Constant;
class ConstantFP;
+class CallInst;
class DataLayout;
class FunctionLoweringInfo;
class Instruction;
@@ -48,6 +49,7 @@ class FastISel {
protected:
DenseMap<const Value *, unsigned> LocalValueMap;
FunctionLoweringInfo &FuncInfo;
+ MachineFunction *MF;
MachineRegisterInfo &MRI;
MachineFrameInfo &MFI;
MachineConstantPool &MCP;
@@ -373,6 +375,12 @@ protected:
/// - \c Add has a constant operand.
bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
+ /// Test whether the given value has exactly one use.
+ bool hasTrivialKill(const Value *V) const;
+
+ /// \brief Create a machine mem operand from the given instruction.
+ MachineMemOperand *createMachineMemOperandFor(const Instruction *I) const;
+
private:
bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
@@ -380,6 +388,7 @@ private:
bool SelectGetElementPtr(const User *I);
+ bool SelectStackmap(const CallInst *I);
bool SelectCall(const User *I);
bool SelectBitCast(const User *I);
@@ -409,8 +418,8 @@ private:
/// heavy instructions like calls.
void flushLocalValueMap();
- /// Test whether the given value has exactly one use.
- bool hasTrivialKill(const Value *V) const;
+ bool addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
+ const CallInst *CI, unsigned StartIdx);
};
}
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 49891b2..a8f2368 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -379,6 +379,37 @@ namespace ISD {
/// operand, a ValueType node.
SIGN_EXTEND_INREG,
+ /// ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an
+ /// in-register any-extension of the low lanes of an integer vector. The
+ /// result type must have fewer elements than the operand type, and those
+ /// elements must be larger integer types such that the total size of the
+ /// operand type and the result type match. Each of the low operand
+ /// elements is any-extended into the corresponding, wider result
+ /// elements with the high bits becoming undef.
+ ANY_EXTEND_VECTOR_INREG,
+
+ /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an
+ /// in-register sign-extension of the low lanes of an integer vector. The
+ /// result type must have fewer elements than the operand type, and those
+ /// elements must be larger integer types such that the total size of the
+ /// operand type and the result type match. Each of the low operand
+ /// elements is sign-extended into the corresponding, wider result
+ /// elements.
+ // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to
+ // scalars, but it also doesn't handle vectors well. Either it should be
+ // restricted to scalars or this node (and its handling) should be merged
+ // into it.
+ SIGN_EXTEND_VECTOR_INREG,
+
+ /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an
+ /// in-register zero-extension of the low lanes of an integer vector. The
+ /// result type must have fewer elements than the operand type, and those
+ /// elements must be larger integer types such that the total size of the
+ /// operand type and the result type match. Each of the low operand
+ /// elements is zero-extended into the corresponding, wider result
+ /// elements.
+ ZERO_EXTEND_VECTOR_INREG,
+
/// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned
/// integer.
FP_TO_SINT,
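
A scalar model of the lane semantics described above, using the sign-extending variant on a notional v8i16 -> v4i32 node (illustrative only):

    #include <cstdint>

    // Only the low 4 of the 8 input lanes participate; each widens from
    // 16 to 32 bits. ANY_EXTEND leaves the high bits undef and ZERO_EXTEND
    // clears them; the structure is otherwise identical.
    void signExtendVectorInReg(const int16_t In[8], int32_t Out[4]) {
      for (int i = 0; i != 4; ++i)
        Out[i] = static_cast<int32_t>(In[i]); // sign-extends each low lane
    }
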
@@ -619,6 +650,12 @@ namespace ISD {
/// This corresponds to the cmpxchg instruction.
ATOMIC_CMP_SWAP,
+ /// Val, Success, OUTCHAIN
+ /// = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap)
+ /// N.b. this is still a strong cmpxchg operation, so
+ /// Success == "Val == cmp".
+ ATOMIC_CMP_SWAP_WITH_SUCCESS,
+
/// Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt)
/// Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt)
/// For double-word atomic operations:
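
The new node's contract maps onto C++11's strong compare-exchange; a scalar model of the two results:

    #include <atomic>
    #include <utility>

    // Returns {loaded value, success}; Success == (loaded value == Cmp),
    // matching the comment on ATOMIC_CMP_SWAP_WITH_SUCCESS above.
    std::pair<int, bool> cmpSwapWithSuccess(std::atomic<int> &Ptr, int Cmp,
                                            int Swap) {
      int Expected = Cmp;
      bool Success = Ptr.compare_exchange_strong(Expected, Swap);
      return std::make_pair(Expected, Success);
    }
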
diff --git a/include/llvm/CodeGen/JumpInstrTables.h b/include/llvm/CodeGen/JumpInstrTables.h
new file mode 100644
index 0000000..6ca3d7d
--- /dev/null
+++ b/include/llvm/CodeGen/JumpInstrTables.h
@@ -0,0 +1,104 @@
+//===-- JumpInstrTables.h: Jump-Instruction Tables --------------*- C++ -*-===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief An implementation of tables consisting of jump instructions
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_JUMPINSTRTABLES_H
+#define LLVM_CODEGEN_JUMPINSTRTABLES_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+class Constant;
+class Function;
+class FunctionType;
+class JumpInstrTableInfo;
+class Module;
+
+/// A class to manage a set of jump tables indexed on function type. It looks at
+/// each function in the module to find all the functions that have the
+/// jumptable attribute set. For each such function, it creates a new
+/// jump-instruction-table function and stores the mapping in the ImmutablePass
+/// JumpInstrTableInfo.
+///
+/// These special functions get lowered in AsmPrinter to assembly of the form:
+/// \verbatim
+/// .globl f
+/// .type f,@function
+/// .align 8,0x90
+/// f:
+/// jmp f_orig@PLT
+/// \endverbatim
+///
+/// Support for an architecture depends on two functions in TargetInstrInfo:
+/// getUnconditionalBranch, and getTrap. AsmPrinter uses these to generate the
+/// appropriate instructions for the jump statement (an unconditional branch)
+/// and for padding to make the table have a size that is a power of two. This
+/// padding uses a trap instruction to ensure that calls to this area halt the
+/// program. The default implementations of these functions call
+/// llvm_unreachable.
+class JumpInstrTables : public ModulePass {
+public:
+ static char ID;
+
+ JumpInstrTables();
+ JumpInstrTables(JumpTable::JumpTableType JTT);
+ virtual ~JumpInstrTables();
+ bool runOnModule(Module &M) override;
+ const char *getPassName() const override { return "Jump-Instruction Tables"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Creates a jump-instruction table function for the Target and adds it to
+ /// the tables.
+ Function *insertEntry(Module &M, Function *Target);
+
+ /// Checks to see if there is already a table for the given FunctionType.
+ bool hasTable(FunctionType *FunTy);
+
+private:
+ /// The metadata used while a jump table is being built
+ struct TableMeta {
+ /// The number of this table
+ unsigned TableNum;
+
+ /// The current number of jump entries in the table.
+ unsigned Count;
+ };
+
+ typedef DenseMap<FunctionType *, struct TableMeta> JumpMap;
+
+ /// Maps the function into a subset of function types, depending on the
+ /// jump-instruction table style selected from JumpTableTypes in
+ /// JumpInstrTables.cpp. The choice of mapping determines the number of
+ /// jump-instruction tables generated by this pass. E.g., the simplest mapping
+ /// converts every function type into void f(); so, all functions end up in a
+ /// single table.
+ FunctionType *transformType(FunctionType *FunTy);
+
+ /// The current state of functions and jump entries in the table(s).
+ JumpMap Metadata;
+
+ /// The ImmutablePass that stores information about the generated tables.
+ JumpInstrTableInfo *JITI;
+
+ /// The total number of tables.
+ unsigned TableCount;
+
+ /// The type of tables to build.
+ JumpTable::JumpTableType JTType;
+};
+
+/// Creates a JumpInstrTables pass for the given type of jump table.
+ModulePass *createJumpInstrTablesPass(JumpTable::JumpTableType JTT);
+}
+
+#endif /* LLVM_CODEGEN_JUMPINSTRTABLES_H */
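
Scheduling the pass from a tool might look like this (a sketch assuming the legacy PassManager of this era):

    #include "llvm/CodeGen/JumpInstrTables.h"
    #include "llvm/PassManager.h"
    using namespace llvm;

    void addJumpTables(PassManager &PM) {
      // One table per argument count, per the JumpTable::Arity policy.
      PM.add(createJumpInstrTablesPass(JumpTable::Arity));
    }
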
diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h
index 31d6872..036aea3 100644
--- a/include/llvm/CodeGen/LexicalScopes.h
+++ b/include/llvm/CodeGen/LexicalScopes.h
@@ -197,6 +197,9 @@ public:
/// dump - Print data structures to dbgs().
void dump();
+ /// getOrCreateAbstractScope - Find or create an abstract lexical scope.
+ LexicalScope *getOrCreateAbstractScope(const MDNode *N);
+
private:
/// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
/// not available then create new lexical scope.
@@ -208,9 +211,6 @@ private:
/// getOrCreateInlinedScope - Find or create an inlined lexical scope.
LexicalScope *getOrCreateInlinedScope(MDNode *Scope, MDNode *InlinedAt);
- /// getOrCreateAbstractScope - Find or create an abstract lexical scope.
- LexicalScope *getOrCreateAbstractScope(const MDNode *N);
-
/// extractLexicalScopes - Extract instruction ranges for each lexical scopes
/// for the given machine function.
void extractLexicalScopes(SmallVectorImpl<InsnRange> &MIRanges,
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index ddd623c..176665b 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -155,6 +155,17 @@ namespace llvm {
bool shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead = nullptr);
+ /// \brief Walk the values in the given interval and compute which ones
+ /// are dead. Dead values are not deleted, however:
+ /// - Dead PHIDef values are marked as unused.
+ /// - New dead machine instructions are added to the dead vector.
+ /// - CanSeparate is set to true if the interval may have been separated
+ /// into multiple connected components.
+ void computeDeadValues(LiveInterval *li,
+ LiveRange &LR,
+ bool *CanSeparate,
+ SmallVectorImpl<MachineInstr*> *dead);
+
/// extendToIndices - Extend the live range of LI to reach all points in
/// Indices. The points in the Indices array must be jointly dominated by
/// existing defs in LI. PHI-defs are added as needed to maintain SSA form.
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index 90bdeee4..a08cc2e 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -620,7 +620,7 @@ public:
/// computeRegisterLiveness - Return whether (physical) register \c Reg
/// has been <def>ined and not <kill>ed as of just before \c MI.
- ///
+ ///
/// Search is localised to a neighborhood of
/// \c Neighborhood instructions before (searching for defs or kills) and
/// Neighborhood instructions after (searching just for defs) MI.
@@ -635,7 +635,7 @@ public:
void print(raw_ostream &OS, SlotIndexes* = nullptr) const;
// Printing method used by LoopInfo.
- void printAsOperand(raw_ostream &OS, bool PrintType = true);
+ void printAsOperand(raw_ostream &OS, bool PrintType = true) const;
/// getNumber - MachineBasicBlocks are uniquely numbered at the function
/// level, unless they're not in a MachineFunction yet, in which case this
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index bd0ea11..c51f8fe 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -484,6 +484,9 @@ public:
///
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable);
+ /// CreateFixedSpillStackObject - Create a spill slot at a fixed location
+ /// on the stack. Returns an index with a negative value.
+ int CreateFixedSpillStackObject(uint64_t Size, int64_t SPOffset);
/// isFixedObjectIndex - Returns true if the specified index corresponds to a
/// fixed stack object.
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index b0d3e02..3c82811 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -727,6 +727,9 @@ public:
bool isFullCopy() const {
return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg();
}
+ bool isExtractSubreg() const {
+ return getOpcode() == TargetOpcode::EXTRACT_SUBREG;
+ }
/// isCopyLike - Return true if the instruction behaves like a copy.
/// This does not include native copy instructions.
@@ -947,7 +950,7 @@ public:
}
/// isRegTiedToDefOperand - Return true if the use operand of the specified
- /// index is tied to an def operand. It also returns the def operand index by
+ /// index is tied to a def operand. It also returns the def operand index by
/// reference if DefOpIdx is not null.
bool isRegTiedToDefOperand(unsigned UseOpIdx,
unsigned *DefOpIdx = nullptr) const {
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index acd37e1..7d85432 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -518,9 +518,7 @@ public:
return Queue.begin() + idx;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump();
-#endif
};
/// Summarize the unscheduled region.
@@ -624,9 +622,9 @@ private:
SmallVector<unsigned, 16> ReservedCycles;
#ifndef NDEBUG
- // Remember the greatest operand latency as an upper bound on the number of
+ // Remember the greatest possible stall as an upper bound on the number of
// times we should retry the pending queue because of a hazard.
- unsigned MaxObservedLatency;
+ unsigned MaxObservedStall;
#endif
public:
@@ -739,6 +737,217 @@ public:
#endif
};
+/// Base class for GenericScheduler. This class maintains information about
+/// scheduling candidates based on TargetSchedModel making it easy to implement
+/// heuristics for either preRA or postRA scheduling.
+class GenericSchedulerBase : public MachineSchedStrategy {
+public:
+ /// Represent the type of SchedCandidate found within a single queue.
+ /// pickNodeBidirectional depends on these listed by decreasing priority.
+ enum CandReason {
+ NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
+ ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
+ TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
+
+#ifndef NDEBUG
+ static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
+#endif
+
+ /// Policy for scheduling the next instruction in the candidate's zone.
+ struct CandPolicy {
+ bool ReduceLatency;
+ unsigned ReduceResIdx;
+ unsigned DemandResIdx;
+
+ CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
+ };
+
+ /// Status of an instruction's critical resource consumption.
+ struct SchedResourceDelta {
+ // Count critical resources in the scheduled region required by SU.
+ unsigned CritResources;
+
+ // Count critical resources from another region consumed by SU.
+ unsigned DemandedResources;
+
+ SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
+
+ bool operator==(const SchedResourceDelta &RHS) const {
+ return CritResources == RHS.CritResources
+ && DemandedResources == RHS.DemandedResources;
+ }
+ bool operator!=(const SchedResourceDelta &RHS) const {
+ return !operator==(RHS);
+ }
+ };
+
+ /// Store the state used by GenericScheduler heuristics, required for the
+ /// lifetime of one invocation of pickNode().
+ struct SchedCandidate {
+ CandPolicy Policy;
+
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ // The reason for this candidate.
+ CandReason Reason;
+
+ // Set of reasons that apply to multiple candidates.
+ uint32_t RepeatReasonSet;
+
+ // Register pressure values for the best candidate.
+ RegPressureDelta RPDelta;
+
+ // Critical resource consumption of the best candidate.
+ SchedResourceDelta ResDelta;
+
+ SchedCandidate(const CandPolicy &policy)
+ : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {}
+
+ bool isValid() const { return SU; }
+
+ // Copy the status of another candidate without changing policy.
+ void setBest(SchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ SU = Best.SU;
+ Reason = Best.Reason;
+ RPDelta = Best.RPDelta;
+ ResDelta = Best.ResDelta;
+ }
+
+ bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
+ void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
+
+ void initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel);
+ };
+
+protected:
+ const MachineSchedContext *Context;
+ const TargetSchedModel *SchedModel;
+ const TargetRegisterInfo *TRI;
+
+ SchedRemainder Rem;
+protected:
+ GenericSchedulerBase(const MachineSchedContext *C):
+ Context(C), SchedModel(nullptr), TRI(nullptr) {}
+
+ void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone,
+ SchedBoundary *OtherZone);
+
+#ifndef NDEBUG
+ void traceCandidate(const SchedCandidate &Cand);
+#endif
+};
+
+/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
+/// the schedule.
+class GenericScheduler : public GenericSchedulerBase {
+ ScheduleDAGMILive *DAG;
+
+ // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top;
+ SchedBoundary Bot;
+
+ MachineSchedPolicy RegionPolicy;
+public:
+ GenericScheduler(const MachineSchedContext *C):
+ GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"),
+ Bot(SchedBoundary::BotQID, "BotQ") {}
+
+ void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) override;
+
+ bool shouldTrackPressure() const override {
+ return RegionPolicy.ShouldTrackPressure;
+ }
+
+ void initialize(ScheduleDAGMI *dag) override;
+
+ SUnit *pickNode(bool &IsTopNode) override;
+
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
+ void releaseTopNode(SUnit *SU) override {
+ Top.releaseTopNode(SU);
+ }
+
+ void releaseBottomNode(SUnit *SU) override {
+ Bot.releaseBottomNode(SU);
+ }
+
+ void registerRoots() override;
+
+protected:
+ void checkAcyclicLatency();
+
+ void tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker);
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+
+ void reschedulePhysRegCopies(SUnit *SU, bool isTop);
+};
+
+/// PostGenericScheduler - Interface to the scheduling algorithm used by
+/// ScheduleDAGMI.
+///
+/// Callbacks from ScheduleDAGMI:
+/// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ...
+class PostGenericScheduler : public GenericSchedulerBase {
+ ScheduleDAGMI *DAG;
+ SchedBoundary Top;
+ SmallVector<SUnit*, 8> BotRoots;
+public:
+ PostGenericScheduler(const MachineSchedContext *C):
+ GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {}
+
+ virtual ~PostGenericScheduler() {}
+
+ void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) override {
+ /* no configurable policy */
+ }
+
+ /// PostRA scheduling does not track pressure.
+ bool shouldTrackPressure() const override { return false; }
+
+ void initialize(ScheduleDAGMI *Dag) override;
+
+ void registerRoots() override;
+
+ SUnit *pickNode(bool &IsTopNode) override;
+
+ void scheduleTree(unsigned SubtreeID) override {
+ llvm_unreachable("PostRA scheduler does not support subtree analysis.");
+ }
+
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+
+ void releaseTopNode(SUnit *SU) override {
+ Top.releaseTopNode(SU);
+ }
+
+ // Only called for roots.
+ void releaseBottomNode(SUnit *SU) override {
+ BotRoots.push_back(SU);
+ }
+
+protected:
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
+
+ void pickNodeFromQueue(SchedCandidate &Cand);
+};
+
} // namespace llvm
#endif
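
A small sketch of the SchedCandidate protocol above; the surrounding heuristic
is hypothetical, and only the member calls shown come from this interface:

    // Hypothetical tie-breaking step inside a tryCandidate()-style heuristic:
    // once TryCand wins for reason Why, record the reason and copy its state
    // into the running best candidate.
    using llvm::GenericSchedulerBase;
    void takeCandidate(GenericSchedulerBase::SchedCandidate &Cand,
                       GenericSchedulerBase::SchedCandidate &TryCand,
                       GenericSchedulerBase::CandReason Why) {
      TryCand.Reason = Why;
      if (Cand.isRepeat(Why))
        return;              // this reason already applied to other candidates
      Cand.setRepeat(Why);   // RepeatReasonSet is a bitmask indexed by reason
      Cand.setBest(TryCand); // copies SU, Reason, and deltas; keeps Cand.Policy
    }
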
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 35210f1..17477fe 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -588,6 +588,25 @@ namespace llvm {
/// the intrinsic for later emission to the StackMap.
extern char &StackMapLivenessID;
+ /// createJumpInstrTablesPass - This pass creates jump-instruction tables.
+ ModulePass *createJumpInstrTablesPass();
} // End llvm namespace
+/// This initializer registers a TargetMachine constructor, so the pass being
+/// initialized can use target-dependent interfaces. Please do not move this
+/// macro to be together with INITIALIZE_PASS, which is a completely
+/// target-independent initializer; we don't want to make libScalarOpts depend
+/// on libCodeGen.
+#define INITIALIZE_TM_PASS(passName, arg, name, cfg, analysis) \
+ static void* initialize##passName##PassOnce(PassRegistry &Registry) { \
+ PassInfo *PI = new PassInfo(name, arg, & passName ::ID, \
+ PassInfo::NormalCtor_t(callDefaultCtor< passName >), cfg, analysis, \
+ PassInfo::TargetMachineCtor_t(callTargetMachineCtor< passName >)); \
+ Registry.registerPass(*PI, true); \
+ return PI; \
+ } \
+ void llvm::initialize##passName##Pass(PassRegistry &Registry) { \
+ CALL_ONCE_INITIALIZATION(initialize##passName##PassOnce) \
+ }
+
#endif
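
Usage mirrors INITIALIZE_PASS. A hypothetical target-dependent pass named
MyCodeGenPass would be registered from its .cpp file roughly like this:

    // In MyCodeGenPass.cpp (hypothetical pass with a static char ID and a
    // TargetMachine-taking constructor, as callTargetMachineCtor requires):
    INITIALIZE_TM_PASS(MyCodeGenPass, "my-codegen-pass",
                       "My Target-Dependent CodeGen Pass",
                       false /* cfg only */, false /* is analysis */)
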
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index c11a6ac..cc9e000 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -434,10 +434,8 @@ protected:
void bumpDownwardPressure(const MachineInstr *MI);
};
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
const TargetRegisterInfo *TRI);
-#endif
} // end namespace llvm
#endif
diff --git a/include/llvm/CodeGen/ScheduleDFS.h b/include/llvm/CodeGen/ScheduleDFS.h
index 73ce99f..b2108ad 100644
--- a/include/llvm/CodeGen/ScheduleDFS.h
+++ b/include/llvm/CodeGen/ScheduleDFS.h
@@ -57,11 +57,9 @@ struct ILPValue {
return RHS <= *this;
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void print(raw_ostream &OS) const;
void dump() const;
-#endif
};
/// \brief Compute the values of each DAG node for various metrics during DFS.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index d9c38c0..5effb82 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -151,8 +151,7 @@ public:
};
class SelectionDAG;
-void checkForCycles(const SDNode *N);
-void checkForCycles(const SelectionDAG *DAG);
+void checkForCycles(const SelectionDAG *DAG, bool force = false);
/// SelectionDAG class - This is used to represent a portion of an LLVM function
/// in a low-level Data Dependence DAG representation suitable for instruction
@@ -335,7 +334,7 @@ public:
assert((!N.getNode() || N.getValueType() == MVT::Other) &&
"DAG root value is not a chain!");
if (N.getNode())
- checkForCycles(N.getNode());
+ checkForCycles(N.getNode(), this);
Root = N;
if (N.getNode())
checkForCycles(this);
@@ -540,6 +539,12 @@ public:
/// undefined.
SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2,
const int *MaskElts);
+ SDValue getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, SDValue N2,
+ ArrayRef<int> MaskElts) {
+ assert(VT.getVectorNumElements() == MaskElts.size() &&
+ "Must have the same number of vector elements as mask elements!");
+ return getVectorShuffle(VT, dl, N1, N2, MaskElts.data());
+ }
/// getAnyExtOrTrunc - Convert Op, which must be of integer type, to the
/// integer type VT, by either any-extending or truncating it.
@@ -557,10 +562,28 @@ public:
/// value assuming it was the smaller SrcTy value.
SDValue getZeroExtendInReg(SDValue Op, SDLoc DL, EVT SrcTy);
+ /// getAnyExtendVectorInReg - Return an operation which will any-extend the
+ /// low lanes of the operand into the specified vector type. For example,
+ /// this can convert a v16i8 into a v4i32 by any-extending the low four
+ /// lanes of the operand from i8 to i32.
+ SDValue getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
+ /// getSignExtendVectorInReg - Return an operation which will sign extend the
+ /// low lanes of the operand into the specified vector type. For example,
+ /// this can convert a v16i8 into a v4i32 by sign extending the low four
+ /// lanes of the operand from i8 to i32.
+ SDValue getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
+ /// getZeroExtendVectorInReg - Return an operation which will zero extend the
+ /// low lanes of the operand into the specified vector type. For example,
+ /// this can convert a v16i8 into a v4i32 by zero extending the low four
+ /// lanes of the operand from i8 to i32.
+ SDValue getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT);
+
/// getBoolExtOrTrunc - Convert Op, which must be of integer type, to the
/// integer type VT, by using an extension appropriate for the target's
- /// BooleanContent or truncating it.
- SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT);
+ /// BooleanContent for type OpVT or truncating it.
+ SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT, EVT OpVT);
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
SDValue getNOT(SDLoc DL, SDValue Val, EVT VT);
@@ -607,14 +630,14 @@ public:
///
SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT);
SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N);
- SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2);
- SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3);
- SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3, SDValue N4);
- SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
- SDValue N1, SDValue N2, SDValue N3, SDValue N4,
- SDValue N5);
+ SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+ bool nuw = false, bool nsw = false, bool exact = false);
+ SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+ SDValue N3);
+ SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+ SDValue N3, SDValue N4);
+ SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2,
+ SDValue N3, SDValue N4, SDValue N5);
SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef<SDUse> Ops);
SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT,
ArrayRef<SDValue> Ops);
@@ -695,20 +718,22 @@ public:
SDValue getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
SDValue SV, unsigned Align);
- /// getAtomic - Gets a node for an atomic op, produces result and chain and
- /// takes 3 operands
- SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain,
- SDValue Ptr, SDValue Cmp, SDValue Swp,
- MachinePointerInfo PtrInfo, unsigned Alignment,
- AtomicOrdering SuccessOrdering,
- AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope);
- SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain,
- SDValue Ptr, SDValue Cmp, SDValue Swp,
- MachineMemOperand *MMO,
- AtomicOrdering SuccessOrdering,
- AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope);
+ /// getAtomicCmpSwap - Gets a node for an atomic cmpxchg op. There are two
+ /// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
+ /// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
+ /// a success flag (initially i1), and a chain.
+ SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs,
+ SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp,
+ MachinePointerInfo PtrInfo, unsigned Alignment,
+ AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering,
+ SynchronizationScope SynchScope);
+ SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs,
+ SDValue Chain, SDValue Ptr, SDValue Cmp, SDValue Swp,
+ MachineMemOperand *MMO,
+ AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering,
+ SynchronizationScope SynchScope);
/// getAtomic - Gets a node for an atomic op, produces result (if relevant)
/// and chain and takes 2 operands.
@@ -922,7 +947,9 @@ public:
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
- SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef<SDValue> Ops);
+ SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef<SDValue> Ops,
+ bool nuw = false, bool nsw = false,
+ bool exact = false);
/// getDbgValue - Creates a SDDbgValue node.
///
@@ -1179,6 +1206,10 @@ private:
void allnodes_clear();
+ BinarySDNode *GetBinarySDNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
+ SDValue N1, SDValue N2, bool nuw, bool nsw,
+ bool exact);
+
/// VTList - List of non-single value types.
FoldingSet<SDVTListNode> VTListMap;
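
Taken together, a short sketch of the new SelectionDAG entry points above;
DAG, DL, VT, N0, N1, and V are assumed to be in scope, as inside a DAG combine:

    // Build a division known to be exact; the flags are carried on the node.
    SDValue Div = DAG.getNode(ISD::SDIV, DL, VT, N0, N1,
                              /*nuw=*/false, /*nsw=*/false, /*exact=*/true);

    // Ask whether an equivalent node (same operands *and* flags) already
    // exists, so a combine can reuse it instead of creating a duplicate.
    if (SDNode *Existing =
            DAG.getNodeIfExists(ISD::SDIV, DAG.getVTList(VT), {N0, N1},
                                /*nuw=*/false, /*nsw=*/false, /*exact=*/true))
      (void)Existing;

    // Zero extend the low four i8 lanes of a v16i8 value into v4i32.
    SDValue Ext = DAG.getZeroExtendVectorInReg(V, DL, MVT::v4i32);
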
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 4f0ddb7..2231511 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -20,6 +20,7 @@
#define LLVM_CODEGEN_SELECTIONDAGNODES_H
#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/STLExtras.h"
@@ -49,7 +50,26 @@ template <typename T> struct DenseMapInfo;
template <typename T> struct simplify_type;
template <typename T> struct ilist_traits;
-void checkForCycles(const SDNode *N);
+/// isBinOpWithFlags - Returns true if the opcode is a binary operation
+/// with flags.
+static bool isBinOpWithFlags(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::MUL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SHL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr,
+ bool force = false);
/// SDVTList - This represents a list of ValueType's that has been intern'd by
/// a SelectionDAG. Instances of this simple value class are returned by
@@ -123,6 +143,9 @@ public:
bool operator<(const SDValue &O) const {
return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo);
}
+ LLVM_EXPLICIT operator bool() const {
+ return Node != nullptr;
+ }
SDValue getValue(unsigned R) const {
return SDValue(Node, R);
@@ -574,6 +597,7 @@ public:
typedef SDUse* op_iterator;
op_iterator op_begin() const { return OperandList; }
op_iterator op_end() const { return OperandList+NumOperands; }
+ ArrayRef<SDUse> ops() const { return makeArrayRef(op_begin(), op_end()); }
SDVTList getVTList() const {
SDVTList X = { ValueList, NumValues };
@@ -938,6 +962,36 @@ public:
}
};
+/// BinaryWithFlagsSDNode - This class is an extension of BinarySDNode
+/// used from those opcodes that have associated extra flags.
+class BinaryWithFlagsSDNode : public BinarySDNode {
+ enum { NUW = (1 << 0), NSW = (1 << 1), EXACT = (1 << 2) };
+
+public:
+ BinaryWithFlagsSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
+ SDValue X, SDValue Y)
+ : BinarySDNode(Opc, Order, dl, VTs, X, Y) {}
+ /// getRawSubclassData - Return the SubclassData value, which contains an
+ /// encoding of the flags.
+ /// This function should be used to add subclass data to the NodeID value.
+ unsigned getRawSubclassData() const { return SubclassData; }
+ void setHasNoUnsignedWrap(bool b) {
+ SubclassData = (SubclassData & ~NUW) | (b ? NUW : 0);
+ }
+ void setHasNoSignedWrap(bool b) {
+ SubclassData = (SubclassData & ~NSW) | (b ? NSW : 0);
+ }
+ void setIsExact(bool b) {
+ SubclassData = (SubclassData & ~EXACT) | (b ? EXACT : 0);
+ }
+ bool hasNoUnsignedWrap() const { return SubclassData & NUW; }
+ bool hasNoSignedWrap() const { return SubclassData & NSW; }
+ bool isExact() const { return SubclassData & EXACT; }
+ static bool classof(const SDNode *N) {
+ return isBinOpWithFlags(N->getOpcode());
+ }
+};
+
/// TernarySDNode - This class is used for three-operand SDNodes. This is solely
/// to allow co-allocation of node operands with the node itself.
class TernarySDNode : public SDNode {
@@ -1077,6 +1131,7 @@ public:
N->getOpcode() == ISD::STORE ||
N->getOpcode() == ISD::PREFETCH ||
N->getOpcode() == ISD::ATOMIC_CMP_SWAP ||
+ N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS ||
N->getOpcode() == ISD::ATOMIC_SWAP ||
N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
@@ -1185,12 +1240,13 @@ public:
bool isCompareAndSwap() const {
unsigned Op = getOpcode();
- return Op == ISD::ATOMIC_CMP_SWAP;
+ return Op == ISD::ATOMIC_CMP_SWAP || Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS;
}
// Methods to support isa and dyn_cast
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::ATOMIC_CMP_SWAP ||
+ N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS ||
N->getOpcode() == ISD::ATOMIC_SWAP ||
N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
@@ -1528,11 +1584,27 @@ public:
unsigned MinSplatBits = 0,
bool isBigEndian = false) const;
- /// getConstantSplatValue - Check if this is a constant splat, and if so,
- /// return the splat value only if it is a ConstantSDNode. Otherwise
- /// return nullptr. This is a simpler form of isConstantSplat.
- /// Get the constant splat only if you care about the splat value.
- ConstantSDNode *getConstantSplatValue() const;
+ /// \brief Returns the splatted value or a null value if this is not a splat.
+ ///
+ /// If passed a non-null UndefElements bitvector, it will resize it to match
+ /// the vector width and set the bits where elements are undef.
+ SDValue getSplatValue(BitVector *UndefElements = nullptr) const;
+
+ /// \brief Returns the splatted constant or null if this is not a constant
+ /// splat.
+ ///
+ /// If passed a non-null UndefElements bitvector, it will resize it to match
+ /// the vector width and set the bits where elements are undef.
+ ConstantSDNode *
+ getConstantSplatNode(BitVector *UndefElements = nullptr) const;
+
+ /// \brief Returns the splatted constant FP or null if this is not a constant
+ /// FP splat.
+ ///
+ /// If passed a non-null UndefElements bitvector, it will resize it to match
+ /// the vector width and set the bits where elements are undef.
+ ConstantFPSDNode *
+ getConstantFPSplatNode(BitVector *UndefElements = nullptr) const;
bool isConstant() const;
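
For illustration, how a consumer might query the two additions above; N is
assumed to be an SDNode* and BV a BuildVectorSDNode*:

    // Flags ride on BinaryWithFlagsSDNode for the opcodes isBinOpWithFlags
    // accepts, so a dyn_cast doubles as the opcode check.
    if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N))
      if (BO->hasNoSignedWrap() && BO->hasNoUnsignedWrap())
        ; // overflow cannot occur in either sense

    // Splat queries resize UndefElements to the vector width and set a bit
    // for every undef lane.
    BitVector UndefElements;
    if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElements))
      ; // every defined lane holds C->getAPIntValue()
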
diff --git a/include/llvm/CodeGen/StackMapLivenessAnalysis.h b/include/llvm/CodeGen/StackMapLivenessAnalysis.h
index 6ba7256..6f07546 100644
--- a/include/llvm/CodeGen/StackMapLivenessAnalysis.h
+++ b/include/llvm/CodeGen/StackMapLivenessAnalysis.h
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
//
// This pass calculates the liveness for each basic block in a function and
-// attaches the register live-out information to a stackmap or patchpoint
-// intrinsic if present.
+// attaches the register live-out information to a patchpoint intrinsic (if
+// present).
//
//===----------------------------------------------------------------------===//
@@ -23,14 +23,13 @@
namespace llvm {
/// \brief This pass calculates the liveness information for each basic block in
-/// a function and attaches the register live-out information to a stackmap or
-/// patchpoint intrinsic if present.
+/// a function and attaches the register live-out information to a patchpoint
+/// intrinsic if present.
///
-/// This is an optional pass that has to be explicitly enabled via the
-/// -enable-stackmap-liveness and/or -enable-patchpoint-liveness flag. The pass
-/// skips functions that don't have any stackmap or patchpoint intrinsics. The
+/// This pass can be disabled via the -enable-patchpoint-liveness=false flag.
+/// The pass skips functions that don't have any patchpoint intrinsics. The
/// information provided by this pass is optional and not required by the
-/// aformentioned intrinsics to function.
+/// aforementioned intrinsic to function.
class StackMapLiveness : public MachineFunctionPass {
MachineFunction *MF;
const TargetRegisterInfo *TRI;
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 9f1cbaa..230d1ed 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -68,11 +68,9 @@ public:
void InitializeELF(bool UseInitArray_);
const MCSection *getStaticCtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const override;
+ const MCSymbol *KeySym) const override;
const MCSection *getStaticDtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const override;
+ const MCSymbol *KeySym) const override;
};
@@ -144,11 +142,9 @@ public:
Mangler &Mang, const TargetMachine &TM) const override;
const MCSection *getStaticCtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const override;
+ const MCSymbol *KeySym) const override;
const MCSection *getStaticDtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const override;
+ const MCSymbol *KeySym) const override;
};
} // end namespace llvm
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index ea1e75a..e9f6702 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -3,20 +3,14 @@
#ifndef CONFIG_H
#define CONFIG_H
-/* Bug report URL. */
-#define BUG_REPORT_URL "${BUG_REPORT_URL}"
-
-/* Define if we have libxml2 */
-#cmakedefine CLANG_HAVE_LIBXML ${CLANG_HAVE_LIBXML}
-
-/* Relative directory for resource files */
-#define CLANG_RESOURCE_DIR "${CLANG_RESOURCE_DIR}"
+/* Exported configuration */
+#include "llvm/Config/llvm-config.h"
-/* Directories clang will search for headers */
-#define C_INCLUDE_DIRS "${C_INCLUDE_DIRS}"
+/* Patch version of the LLVM API */
+#cmakedefine LLVM_VERSION_PATCH ${LLVM_VERSION_PATCH}
-/* Default <path> to all compiler invocations for --sysroot=<path>. */
-#undef DEFAULT_SYSROOT
+/* Bug report URL. */
+#define BUG_REPORT_URL "${BUG_REPORT_URL}"
/* Define if you want backtraces on crash */
#cmakedefine ENABLE_BACKTRACES
@@ -30,9 +24,6 @@
/* Define if timestamp information (e.g., __DATE__) is allowed */
#cmakedefine ENABLE_TIMESTAMPS ${ENABLE_TIMESTAMPS}
-/* Directory where gcc is installed. */
-#undef GCC_INSTALL_PREFIX
-
/* Define to 1 if you have the `arc4random' function. */
#cmakedefine HAVE_ARC4RANDOM
@@ -45,9 +36,6 @@
/* Define to 1 if you have the `ceilf' function. */
#cmakedefine HAVE_CEILF ${HAVE_CEILF}
-/* Define if the neat program is available */
-#cmakedefine HAVE_CIRCO ${HAVE_CIRCO}
-
/* Define to 1 if you have the `closedir' function. */
#cmakedefine HAVE_CLOSEDIR ${HAVE_CLOSEDIR}
@@ -80,12 +68,6 @@
/* Define if dlopen() is available on this platform. */
#cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN}
-/* Define if the dot program is available */
-#cmakedefine HAVE_DOT ${HAVE_DOT}
-
-/* Define if the dotty program is available */
-#cmakedefine HAVE_DOTTY ${HAVE_DOTTY}
-
/* Define if you have the _dyld_func_lookup function. */
#undef HAVE_DYLD
@@ -98,9 +80,6 @@
/* Define to 1 if you have the <fcntl.h> header file. */
#cmakedefine HAVE_FCNTL_H ${HAVE_FCNTL_H}
-/* Define if the neat program is available */
-#cmakedefine HAVE_FDP ${HAVE_FDP}
-
/* Define to 1 if you have the <fenv.h> header file. */
#cmakedefine HAVE_FENV_H ${HAVE_FENV_H}
@@ -161,12 +140,6 @@
/* Define to 1 if you have the `gettimeofday' function. */
#cmakedefine HAVE_GETTIMEOFDAY ${HAVE_GETTIMEOFDAY}
-/* Define if the Graphviz program is available */
-#cmakedefine HAVE_GRAPHVIZ ${HAVE_GRAPHVIZ}
-
-/* Define if the gv program is available */
-#cmakedefine HAVE_GV ${HAVE_GV}
-
/* Define to 1 if the system has the type `int64_t'. */
#cmakedefine HAVE_INT64_T ${HAVE_INT64_T}
@@ -271,9 +244,6 @@
/* Define to 1 if you have the `nearbyintf' function. */
#cmakedefine HAVE_NEARBYINTF ${HAVE_NEARBYINTF}
-/* Define if the neat program is available */
-#cmakedefine HAVE_NEATO ${HAVE_NEATO}
-
/* Define to 1 if you have the `opendir' function. */
#cmakedefine HAVE_OPENDIR ${HAVE_OPENDIR}
@@ -417,9 +387,6 @@
/* Define to 1 if you have the <termios.h> header file. */
#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
-/* Define if the neat program is available */
-#cmakedefine HAVE_TWOPI ${HAVE_TWOPI}
-
/* Define to 1 if the system has the type `uint64_t'. */
#cmakedefine HAVE_UINT64_T ${HAVE_UINT64_T}
@@ -438,9 +405,6 @@
/* Define to 1 if you have the `writev' function. */
#cmakedefine HAVE_WRITEV ${HAVE_WRITEV}
-/* Define if the xdot.py program is available */
-#cmakedefine HAVE_XDOT ${HAVE_XDOT}
-
/* Define to 1 if you have the <zlib.h> header file. */
#cmakedefine HAVE_ZLIB_H ${HAVE_ZLIB_H}
@@ -501,114 +465,9 @@
/* Define if we link Polly to the tools */
#cmakedefine LINK_POLLY_INTO_TOOLS
-/* Installation directory for binary executables */
-#cmakedefine LLVM_BINDIR "${LLVM_BINDIR}"
-
-/* Time at which LLVM was configured */
-#cmakedefine LLVM_CONFIGTIME "${LLVM_CONFIGTIME}"
-
-/* Installation directory for data files */
-#cmakedefine LLVM_DATADIR "${LLVM_DATADIR}"
-
-/* Target triple LLVM will generate code for by default */
-#cmakedefine LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}"
-
-/* Installation directory for documentation */
-#cmakedefine LLVM_DOCSDIR "${LLVM_DOCSDIR}"
-
-/* Define if threads enabled */
-#cmakedefine01 LLVM_ENABLE_THREADS
-
/* Define if zlib compression is available */
#cmakedefine01 LLVM_ENABLE_ZLIB
-/* Installation directory for config files */
-#cmakedefine LLVM_ETCDIR "${LLVM_ETCDIR}"
-
-/* Has gcc/MSVC atomic intrinsics */
-#cmakedefine01 LLVM_HAS_ATOMICS
-
-/* Host triple LLVM will be executed on */
-#cmakedefine LLVM_HOST_TRIPLE "${LLVM_HOST_TRIPLE}"
-
-/* Installation directory for include files */
-#cmakedefine LLVM_INCLUDEDIR "${LLVM_INCLUDEDIR}"
-
-/* Installation directory for .info files */
-#cmakedefine LLVM_INFODIR "${LLVM_INFODIR}"
-
-/* Installation directory for man pages */
-#cmakedefine LLVM_MANDIR "${LLVM_MANDIR}"
-
-/* LLVM architecture name for the native architecture, if available */
-#cmakedefine LLVM_NATIVE_ARCH ${LLVM_NATIVE_ARCH}
-
-/* LLVM name for the native AsmParser init function, if available */
-#cmakedefine LLVM_NATIVE_ASMPARSER LLVMInitialize${LLVM_NATIVE_ARCH}AsmParser
-
-/* LLVM name for the native AsmPrinter init function, if available */
-#cmakedefine LLVM_NATIVE_ASMPRINTER LLVMInitialize${LLVM_NATIVE_ARCH}AsmPrinter
-
-/* LLVM name for the native Disassembler init function, if available */
-#cmakedefine LLVM_NATIVE_DISASSEMBLER LLVMInitialize${LLVM_NATIVE_ARCH}Disassembler
-
-/* LLVM name for the native Target init function, if available */
-#cmakedefine LLVM_NATIVE_TARGET LLVMInitialize${LLVM_NATIVE_ARCH}Target
-
-/* LLVM name for the native TargetInfo init function, if available */
-#cmakedefine LLVM_NATIVE_TARGETINFO LLVMInitialize${LLVM_NATIVE_ARCH}TargetInfo
-
-/* LLVM name for the native target MC init function, if available */
-#cmakedefine LLVM_NATIVE_TARGETMC LLVMInitialize${LLVM_NATIVE_ARCH}TargetMC
-
-/* Define if this is Unixish platform */
-#cmakedefine LLVM_ON_UNIX ${LLVM_ON_UNIX}
-
-/* Define if this is Win32ish platform */
-#cmakedefine LLVM_ON_WIN32 ${LLVM_ON_WIN32}
-
-/* Define to path to circo program if found or 'echo circo' otherwise */
-#cmakedefine LLVM_PATH_CIRCO "${LLVM_PATH_CIRCO}"
-
-/* Define to path to dot program if found or 'echo dot' otherwise */
-#cmakedefine LLVM_PATH_DOT "${LLVM_PATH_DOT}"
-
-/* Define to path to dotty program if found or 'echo dotty' otherwise */
-#cmakedefine LLVM_PATH_DOTTY "${LLVM_PATH_DOTTY}"
-
-/* Define to path to fdp program if found or 'echo fdp' otherwise */
-#cmakedefine LLVM_PATH_FDP "${LLVM_PATH_FDP}"
-
-/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */
-#cmakedefine LLVM_PATH_GRAPHVIZ "${LLVM_PATH_GRAPHVIZ}"
-
-/* Define to path to gv program if found or 'echo gv' otherwise */
-#cmakedefine LLVM_PATH_GV "${LLVM_PATH_GV}"
-
-/* Define to path to neato program if found or 'echo neato' otherwise */
-#cmakedefine LLVM_PATH_NEATO "${LLVM_PATH_NEATO}"
-
-/* Define to path to twopi program if found or 'echo twopi' otherwise */
-#cmakedefine LLVM_PATH_TWOPI "${LLVM_PATH_TWOPI}"
-
-/* Define to path to xdot.py program if found or 'echo xdot' otherwise */
-#cmakedefine LLVM_PATH_XDOT "${LLVM_PATH_XDOT}"
-
-/* Installation prefix directory */
-#cmakedefine LLVM_PREFIX "${LLVM_PREFIX}"
-
-/* Define if we have the Intel JIT API runtime support library */
-#cmakedefine LLVM_USE_INTEL_JITEVENTS 1
-
-/* Define if we have the oprofile JIT-support library */
-#cmakedefine LLVM_USE_OPROFILE 1
-
-/* Major version of the LLVM API */
-#cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR}
-
-/* Minor version of the LLVM API */
-#cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
-
/* Define if the OS needs help to load dependent libraries for dlopen(). */
#cmakedefine LTDL_DLOPEN_DEPLIBS ${LTDL_DLOPEN_DEPLIBS}
@@ -689,7 +548,7 @@
/* Define to 1 if you have the `_chsize_s' function. */
#cmakedefine HAVE__CHSIZE_S ${HAVE__CHSIZE_S}
-/* Added by Kevin -- Maximum path length */
+/* Maximum path length */
#cmakedefine MAXPATHLEN ${MAXPATHLEN}
#endif
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 6b8dbb7..b5f7297 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -3,20 +3,14 @@
#ifndef CONFIG_H
#define CONFIG_H
-/* Bug report URL. */
-#undef BUG_REPORT_URL
-
-/* Define if we have libxml2 */
-#undef CLANG_HAVE_LIBXML
-
-/* Relative directory for resource files */
-#undef CLANG_RESOURCE_DIR
+/* Exported configuration */
+#include "llvm/Config/llvm-config.h"
-/* Directories clang will search for headers */
-#undef C_INCLUDE_DIRS
+/* Patch version of the LLVM API */
+#undef LLVM_VERSION_PATCH
-/* Default <path> to all compiler invocations for --sysroot=<path>. */
-#undef DEFAULT_SYSROOT
+/* Bug report URL. */
+#undef BUG_REPORT_URL
/* Define if you want backtraces on crash */
#undef ENABLE_BACKTRACES
@@ -30,18 +24,12 @@
/* Define if timestamp information (e.g., __DATE__) is allowed */
#undef ENABLE_TIMESTAMPS
-/* Directory where gcc is installed. */
-#undef GCC_INSTALL_PREFIX
-
/* Define to 1 if you have the `backtrace' function. */
#undef HAVE_BACKTRACE
/* Define to 1 if you have the `ceilf' function. */
#undef HAVE_CEILF
-/* Define if the neat program is available */
-#undef HAVE_CIRCO
-
/* Define to 1 if you have the <CrashReporterClient.h> header file. */
#undef HAVE_CRASHREPORTERCLIENT_H
@@ -77,12 +65,6 @@
/* Define if dlopen() is available on this platform. */
#undef HAVE_DLOPEN
-/* Define if the dot program is available */
-#undef HAVE_DOT
-
-/* Define if the dotty program is available */
-#undef HAVE_DOTTY
-
/* Define to 1 if you have the <errno.h> header file. */
#undef HAVE_ERRNO_H
@@ -98,9 +80,6 @@
/* Define to 1 if you have the <fcntl.h> header file. */
#undef HAVE_FCNTL_H
-/* Define if the neat program is available */
-#undef HAVE_FDP
-
/* Define to 1 if you have the <fenv.h> header file. */
#undef HAVE_FENV_H
@@ -143,12 +122,6 @@
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
-/* Define if the Graphviz program is available */
-#undef HAVE_GRAPHVIZ
-
-/* Define if the gv program is available */
-#undef HAVE_GV
-
/* Define to 1 if the system has the type `int64_t'. */
#undef HAVE_INT64_T
@@ -259,9 +232,6 @@
/* Define to 1 if you have the `nearbyintf' function. */
#undef HAVE_NEARBYINTF
-/* Define if the neat program is available */
-#undef HAVE_NEATO
-
/* Define to 1 if you have the `posix_spawn' function. */
#undef HAVE_POSIX_SPAWN
@@ -402,9 +372,6 @@
/* Define to 1 if you have the <termios.h> header file. */
#undef HAVE_TERMIOS_H
-/* Define if the neat program is available */
-#undef HAVE_TWOPI
-
/* Define to 1 if the system has the type `uint64_t'. */
#undef HAVE_UINT64_T
@@ -423,9 +390,6 @@
/* Define to 1 if you have the `writev' function. */
#undef HAVE_WRITEV
-/* Define if the xdot program is available */
-#undef HAVE_XDOT
-
/* Define to 1 if you have the <zlib.h> header file. */
#undef HAVE_ZLIB_H
@@ -483,117 +447,9 @@
/* Linker version detected at compile time. */
#undef HOST_LINK_VERSION
-/* Installation directory for binary executables */
-#undef LLVM_BINDIR
-
-/* Time at which LLVM was configured */
-#undef LLVM_CONFIGTIME
-
-/* Installation directory for data files */
-#undef LLVM_DATADIR
-
-/* Target triple LLVM will generate code for by default */
-#undef LLVM_DEFAULT_TARGET_TRIPLE
-
-/* Installation directory for documentation */
-#undef LLVM_DOCSDIR
-
-/* Define if threads enabled */
-#undef LLVM_ENABLE_THREADS
-
/* Define if zlib is enabled */
#undef LLVM_ENABLE_ZLIB
-/* Installation directory for config files */
-#undef LLVM_ETCDIR
-
-/* Has gcc/MSVC atomic intrinsics */
-#undef LLVM_HAS_ATOMICS
-
-/* Host triple LLVM will be executed on */
-#undef LLVM_HOST_TRIPLE
-
-/* Installation directory for include files */
-#undef LLVM_INCLUDEDIR
-
-/* Installation directory for .info files */
-#undef LLVM_INFODIR
-
-/* Installation directory for man pages */
-#undef LLVM_MANDIR
-
-/* LLVM architecture name for the native architecture, if available */
-#undef LLVM_NATIVE_ARCH
-
-/* LLVM name for the native AsmParser init function, if available */
-#undef LLVM_NATIVE_ASMPARSER
-
-/* LLVM name for the native AsmPrinter init function, if available */
-#undef LLVM_NATIVE_ASMPRINTER
-
-/* LLVM name for the native Disassembler init function, if available */
-#undef LLVM_NATIVE_DISASSEMBLER
-
-/* LLVM name for the native Target init function, if available */
-#undef LLVM_NATIVE_TARGET
-
-/* LLVM name for the native TargetInfo init function, if available */
-#undef LLVM_NATIVE_TARGETINFO
-
-/* LLVM name for the native target MC init function, if available */
-#undef LLVM_NATIVE_TARGETMC
-
-/* Define if this is Unixish platform */
-#undef LLVM_ON_UNIX
-
-/* Define if this is Win32ish platform */
-#undef LLVM_ON_WIN32
-
-/* Define to path to circo program if found or 'echo circo' otherwise */
-#undef LLVM_PATH_CIRCO
-
-/* Define to path to dot program if found or 'echo dot' otherwise */
-#undef LLVM_PATH_DOT
-
-/* Define to path to dotty program if found or 'echo dotty' otherwise */
-#undef LLVM_PATH_DOTTY
-
-/* Define to path to fdp program if found or 'echo fdp' otherwise */
-#undef LLVM_PATH_FDP
-
-/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */
-#undef LLVM_PATH_GRAPHVIZ
-
-/* Define to path to gv program if found or 'echo gv' otherwise */
-#undef LLVM_PATH_GV
-
-/* Define to path to neato program if found or 'echo neato' otherwise */
-#undef LLVM_PATH_NEATO
-
-/* Define to path to twopi program if found or 'echo twopi' otherwise */
-#undef LLVM_PATH_TWOPI
-
-/* Define to path to xdot program if found or 'echo xdot' otherwise */
-#undef LLVM_PATH_XDOT
-
-/* Installation prefix directory */
-#undef LLVM_PREFIX
-
-/* Define if we have the Intel JIT API runtime support library */
-#undef LLVM_USE_INTEL_JITEVENTS
-
-/* Define if we have the oprofile JIT-support library */
-#undef LLVM_USE_OPROFILE
-
-/* Major version of the LLVM API */
-#undef LLVM_VERSION_MAJOR
-
-/* Minor version of the LLVM API */
-#undef LLVM_VERSION_MINOR
-
-/* Patch version of the LLVM API */
-#undef LLVM_VERSION_PATCH
-
/* The shared library extension */
#undef LTDL_SHLIB_EXT
diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 65116cb..5811164 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake
@@ -1,4 +1,4 @@
-/*===-- llvm/config/llvm-config.h - llvm configure variable -------*- C -*-===*/
+/*===------- llvm/Config/llvm-config.h - llvm configuration -------*- C -*-===*/
/* */
/* The LLVM Compiler Infrastructure */
/* */
@@ -7,14 +7,12 @@
/* */
/*===----------------------------------------------------------------------===*/
-/* This file enumerates all of the llvm variables from configure so that
- they can be in exported headers and won't override package specific
- directives. This is a C file so we can include it in the llvm-c headers. */
+/* This file enumerates variables from the LLVM configuration so that they
+ can be in exported headers and won't override package specific directives.
+ This is a C header that can be included in the llvm-c headers. */
-/* To avoid multiple inclusions of these variables when we include the exported
- headers and config.h, conditionally include these. */
-/* TODO: This is a bit of a hack. */
-#ifndef CONFIG_H
+#ifndef LLVM_CONFIG_H
+#define LLVM_CONFIG_H
/* Installation directory for binary executables */
#cmakedefine LLVM_BINDIR "${LLVM_BINDIR}"
@@ -79,33 +77,6 @@
/* Define if this is Win32ish platform */
#cmakedefine LLVM_ON_WIN32 ${LLVM_ON_WIN32}
-/* Define to path to circo program if found or 'echo circo' otherwise */
-#cmakedefine LLVM_PATH_CIRCO "${LLVM_PATH_CIRCO}"
-
-/* Define to path to dot program if found or 'echo dot' otherwise */
-#cmakedefine LLVM_PATH_DOT "${LLVM_PATH_DOT}"
-
-/* Define to path to dotty program if found or 'echo dotty' otherwise */
-#cmakedefine LLVM_PATH_DOTTY "${LLVM_PATH_DOTTY}"
-
-/* Define to path to fdp program if found or 'echo fdp' otherwise */
-#cmakedefine LLVM_PATH_FDP "${LLVM_PATH_FDP}"
-
-/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */
-#cmakedefine LLVM_PATH_GRAPHVIZ "${LLVM_PATH_GRAPHVIZ}"
-
-/* Define to path to gv program if found or 'echo gv' otherwise */
-#cmakedefine LLVM_PATH_GV "${LLVM_PATH_GV}"
-
-/* Define to path to neato program if found or 'echo neato' otherwise */
-#cmakedefine LLVM_PATH_NEATO "${LLVM_PATH_NEATO}"
-
-/* Define to path to twopi program if found or 'echo twopi' otherwise */
-#cmakedefine LLVM_PATH_TWOPI "${LLVM_PATH_TWOPI}"
-
-/* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */
-#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT_PY}"
-
/* Installation prefix directory */
#cmakedefine LLVM_PREFIX "${LLVM_PREFIX}"
diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in
index a4fae55..5656240 100644
--- a/include/llvm/Config/llvm-config.h.in
+++ b/include/llvm/Config/llvm-config.h.in
@@ -1,4 +1,4 @@
-/*===-- llvm/config/llvm-config.h - llvm configure variable -------*- C -*-===*/
+/*===------- llvm/Config/llvm-config.h - llvm configuration -------*- C -*-===*/
/* */
/* The LLVM Compiler Infrastructure */
/* */
@@ -7,14 +7,12 @@
/* */
/*===----------------------------------------------------------------------===*/
-/* This file enumerates all of the llvm variables from configure so that
- they can be in exported headers and won't override package specific
- directives. This is a C file so we can include it in the llvm-c headers. */
+/* This file enumerates variables from the LLVM configuration so that they
+ can be in exported headers and won't override package specific directives.
+ This is a C header that can be included in the llvm-c headers. */
-/* To avoid multiple inclusions of these variables when we include the exported
- headers and config.h, conditionally include these. */
-/* TODO: This is a bit of a hack. */
-#ifndef CONFIG_H
+#ifndef LLVM_CONFIG_H
+#define LLVM_CONFIG_H
/* Installation directory for binary executables */
#undef LLVM_BINDIR
@@ -79,33 +77,6 @@
/* Define if this is Win32ish platform */
#undef LLVM_ON_WIN32
-/* Define to path to circo program if found or 'echo circo' otherwise */
-#undef LLVM_PATH_CIRCO
-
-/* Define to path to dot program if found or 'echo dot' otherwise */
-#undef LLVM_PATH_DOT
-
-/* Define to path to dotty program if found or 'echo dotty' otherwise */
-#undef LLVM_PATH_DOTTY
-
-/* Define to path to fdp program if found or 'echo fdp' otherwise */
-#undef LLVM_PATH_FDP
-
-/* Define to path to Graphviz program if found or 'echo Graphviz' otherwise */
-#undef LLVM_PATH_GRAPHVIZ
-
-/* Define to path to gv program if found or 'echo gv' otherwise */
-#undef LLVM_PATH_GV
-
-/* Define to path to neato program if found or 'echo neato' otherwise */
-#undef LLVM_PATH_NEATO
-
-/* Define to path to twopi program if found or 'echo twopi' otherwise */
-#undef LLVM_PATH_TWOPI
-
-/* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */
-#undef LLVM_PATH_XDOT_PY
-
/* Installation prefix directory */
#undef LLVM_PREFIX
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index 7518c1e..e5dab61 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -54,7 +54,7 @@ namespace object {
}
/// \brief Helper class for synchronizing access to the global address map
-/// table.
+/// table. Access to this class should be serialized under a mutex.
class ExecutionEngineState {
public:
struct AddressMapConfig : public ValueMapConfig<const GlobalValue*> {
@@ -84,19 +84,19 @@ private:
public:
ExecutionEngineState(ExecutionEngine &EE);
- GlobalAddressMapTy &getGlobalAddressMap(const MutexGuard &) {
+ GlobalAddressMapTy &getGlobalAddressMap() {
return GlobalAddressMap;
}
std::map<void*, AssertingVH<const GlobalValue> > &
- getGlobalAddressReverseMap(const MutexGuard &) {
+ getGlobalAddressReverseMap() {
return GlobalAddressReverseMap;
}
/// \brief Erase an entry from the mapping table.
///
/// \returns The address that \p ToUnmap was mapped to.
- void *RemoveMapping(const MutexGuard &, const GlobalValue *ToUnmap);
+ void *RemoveMapping(const GlobalValue *ToUnmap);
};
/// \brief Abstract interface for implementation execution of LLVM modules,
@@ -586,26 +586,7 @@ private:
bool VerifyModules;
/// InitEngine - Does the common initialization of default options.
- void InitEngine() {
- WhichEngine = EngineKind::Either;
- ErrorStr = nullptr;
- OptLevel = CodeGenOpt::Default;
- MCJMM = nullptr;
- JMM = nullptr;
- Options = TargetOptions();
- AllocateGVsWithCode = false;
- RelocModel = Reloc::Default;
- CMModel = CodeModel::JITDefault;
- UseMCJIT = false;
-
- // IR module verification is enabled by default in debug builds, and disabled
- // by default in release builds.
-#ifndef NDEBUG
- VerifyModules = true;
-#else
- VerifyModules = false;
-#endif
- }
+ void InitEngine();
public:
/// EngineBuilder - Constructor for EngineBuilder. If create() is called and
diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
index 071a42b..6221d3b 100644
--- a/include/llvm/ExecutionEngine/ObjectBuffer.h
+++ b/include/llvm/ExecutionEngine/ObjectBuffer.h
@@ -39,7 +39,8 @@ public:
/// returns a pointer to an object that is owned by the caller. However,
/// the caller does not take ownership of the underlying memory.
MemoryBuffer *getMemBuffer() const {
- return MemoryBuffer::getMemBuffer(Buffer->getBuffer(), "", false);
+ return MemoryBuffer::getMemBuffer(Buffer->getBuffer(),
+ Buffer->getBufferIdentifier(), false);
}
const char *getBufferStart() const { return Buffer->getBufferStart(); }
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 30c0d49..f123ffb 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -29,6 +29,8 @@ class RuntimeDyldImpl;
class ObjectImage;
class RuntimeDyld {
+ friend class RuntimeDyldChecker;
+
RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
diff --git a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
new file mode 100644
index 0000000..38a4ea1
--- /dev/null
+++ b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
@@ -0,0 +1,98 @@
+//===---- RuntimeDyldChecker.h - RuntimeDyld tester framework -----*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIMEDYLDCHECKER_H
+#define LLVM_RUNTIMEDYLDCHECKER_H
+
+#include "RuntimeDyld.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace llvm {
+
+class MCDisassembler;
+class MCInstPrinter;
+
+/// \brief RuntimeDyld invariant checker for verifying that RuntimeDyld has
+/// correctly applied relocations.
+///
+/// The RuntimeDyldChecker class evaluates expressions against an attached
+/// RuntimeDyld instance to verify that relocations have been applied
+/// correctly.
+///
+/// The expression language supports basic pointer arithmetic and bit-masking,
+/// and has limited disassembler integration for accessing instruction
+/// operands and the next PC (program counter) address for each instruction.
+///
+/// The language syntax is:
+///
+/// check = expr '=' expr
+///
+/// expr = binary_expr
+/// | sliceable_expr
+///
+/// sliceable_expr = '*{' number '}' load_addr_expr [slice]
+/// | '(' expr ')' [slice]
+/// | ident_expr [slice]
+/// | number [slice]
+///
+/// slice = '[' high-bit-index ':' low-bit-index ']'
+///
+/// load_addr_expr = symbol
+/// | '(' symbol '+' number ')'
+/// | '(' symbol '-' number ')'
+///
+/// ident_expr = 'decode_operand' '(' symbol ',' operand-index ')'
+/// | 'next_pc' '(' symbol ')'
+/// | symbol
+///
+/// binary_expr = expr '+' expr
+/// | expr '-' expr
+/// | expr '&' expr
+/// | expr '|' expr
+/// | expr '<<' expr
+/// | expr '>>' expr
+///
+class RuntimeDyldChecker {
+ friend class RuntimeDyldCheckerExprEval;
+public:
+ RuntimeDyldChecker(RuntimeDyld &RTDyld,
+ MCDisassembler *Disassembler,
+ MCInstPrinter *InstPrinter,
+ llvm::raw_ostream &ErrStream)
+ : RTDyld(*RTDyld.Dyld), Disassembler(Disassembler),
+ InstPrinter(InstPrinter), ErrStream(ErrStream) {}
+
+ /// \brief Check a single expression against the attached RuntimeDyld
+ /// instance.
+ bool check(StringRef CheckExpr) const;
+
+ /// \brief Scan the given memory buffer for lines beginning with the string
+ /// in RulePrefix. The remainder of the line is passed to the check
+ /// method to be evaluated as an expression.
+ bool checkAllRulesInBuffer(StringRef RulePrefix, MemoryBuffer *MemBuf) const;
+
+private:
+
+ bool checkSymbolIsValidForLoad(StringRef Symbol) const;
+ uint64_t getSymbolAddress(StringRef Symbol) const;
+ uint64_t readMemoryAtSymbol(StringRef Symbol, int64_t Offset,
+ unsigned Size) const;
+ StringRef getSubsectionStartingAt(StringRef Name) const;
+
+ RuntimeDyldImpl &RTDyld;
+ MCDisassembler *Disassembler;
+ MCInstPrinter *InstPrinter;
+ llvm::raw_ostream &ErrStream;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_RUNTIMEDYLDCHECKER_H
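
Using the grammar above, two example rules as they might appear in a test file
scanned by checkAllRulesInBuffer; the rule prefix and the symbol names are
assumptions for the example:

    # rtdyld-check: decode_operand(foo_call, 0) = foo
    # rtdyld-check: *{8}(bar + 16) = (foo + 4)[63:0]

The first rule asks the disassembler for operand 0 of the instruction at
symbol foo_call and compares it with the address of foo; the second loads 8
bytes at bar+16 and compares them with a sliced address expression.
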
diff --git a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h
index f24bb4d..1368563 100644
--- a/include/llvm/ExecutionEngine/SectionMemoryManager.h
+++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h
@@ -21,7 +21,6 @@
#include "llvm/Support/Memory.h"
namespace llvm {
-
/// This is a simple memory manager which implements the methods called by
/// the RuntimeDyld class to allocate memory for section-based loading of
/// objects, usually those generated by the MCJIT execution engine.
@@ -93,8 +92,8 @@ private:
uint8_t *allocateSection(MemoryGroup &MemGroup, uintptr_t Size,
unsigned Alignment);
- error_code applyMemoryGroupPermissions(MemoryGroup &MemGroup,
- unsigned Permissions);
+ std::error_code applyMemoryGroupPermissions(MemoryGroup &MemGroup,
+ unsigned Permissions);
MemoryGroup CodeMem;
MemoryGroup RWDataMem;
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 86f9cc8..e34dc83 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -75,6 +75,7 @@ public:
Cold, ///< Marks function as being in a cold path.
InlineHint, ///< Source said inlining was desirable
InReg, ///< Force argument to be passed in register
+ JumpTable, ///< Build jump-instruction tables and replace refs.
MinSize, ///< Function must be optimized for size first
Naked, ///< Naked function
Nest, ///< Nested function static chain
diff --git a/include/llvm/IR/AutoUpgrade.h b/include/llvm/IR/AutoUpgrade.h
index 076ed4a..a4b3c41 100644
--- a/include/llvm/IR/AutoUpgrade.h
+++ b/include/llvm/IR/AutoUpgrade.h
@@ -14,6 +14,8 @@
#ifndef LLVM_IR_AUTOUPGRADE_H
#define LLVM_IR_AUTOUPGRADE_H
+#include <string>
+
namespace llvm {
class CallInst;
class Constant;
@@ -61,6 +63,9 @@ namespace llvm {
/// Check the debug info version number, if it is out-dated, drop the debug
/// info. Return true if module is modified.
bool UpgradeDebugInfo(Module &M);
+
+ /// Upgrade a metadata string constant in place.
+ void UpgradeMDStringConstant(std::string &String);
} // End llvm namespace
#endif
diff --git a/include/llvm/IR/Comdat.h b/include/llvm/IR/Comdat.h
new file mode 100644
index 0000000..3e77a77
--- /dev/null
+++ b/include/llvm/IR/Comdat.h
@@ -0,0 +1,66 @@
+//===-- llvm/IR/Comdat.h - Comdat definitions -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// @file
+/// This file contains the declaration of the Comdat class, which represents a
+/// single COMDAT in LLVM.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_COMDAT_H
+#define LLVM_IR_COMDAT_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class raw_ostream;
+template <typename ValueTy> class StringMapEntry;
+
+// This is a Name X SelectionKind pair. The reason for having this be an
+// independent object instead of just adding the name and the SelectionKind
+// to a GlobalObject is that it is invalid to have two Comdats with the same
+// name but different SelectionKind. This structure makes that unrepresentable.
+class Comdat {
+public:
+ enum SelectionKind {
+ Any, ///< The linker may choose any COMDAT.
+ ExactMatch, ///< The data referenced by the COMDAT must be the same.
+ Largest, ///< The linker will choose the largest COMDAT.
+ NoDuplicates, ///< No other Module may specify this COMDAT.
+ SameSize, ///< The data referenced by the COMDAT must be the same size.
+ };
+
+ Comdat(Comdat &&C);
+ SelectionKind getSelectionKind() const { return SK; }
+ void setSelectionKind(SelectionKind Val) { SK = Val; }
+ StringRef getName() const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+private:
+ friend class Module;
+ Comdat();
+ Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name);
+ Comdat(const Comdat &) LLVM_DELETED_FUNCTION;
+
+ // Points to the map in Module.
+ StringMapEntry<Comdat> *Name;
+ SelectionKind SK;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const Comdat &C) {
+ C.print(OS);
+ return OS;
+}
+
+} // end llvm namespace
+
+#endif
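
A minimal sketch of creating and attaching a Comdat from C++;
Module::getOrInsertComdat and GlobalObject::setComdat are assumed to be the
companion APIs to this class:

    // M is an llvm::Module; GV is a GlobalObject (e.g. a GlobalVariable) in M.
    llvm::Comdat *C = M.getOrInsertComdat("foo");
    C->setSelectionKind(llvm::Comdat::Any); // the linker may pick any copy
    GV->setComdat(C);

Because the map in Module keys comdats by name, two requests for "foo" yield
the same Comdat object, which is what makes conflicting SelectionKinds
unrepresentable.
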
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index f03e3dd..82ad9fc 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -64,6 +64,9 @@ public:
/// Return true if the value is negative zero or null value.
bool isZeroValue() const;
+ /// \brief Return true if the value is the smallest signed value.
+ bool isMinSignedValue() const;
+
/// canTrap - Return true if evaluation of this constant could trap. This is
/// true for things like constant expressions that could divide by zero.
bool canTrap() const;
@@ -71,6 +74,9 @@ public:
/// isThreadDependent - Return true if the value can vary between threads.
bool isThreadDependent() const;
+ /// Return true if the value is dependent on a dllimport variable.
+ bool isDLLImportDependent() const;
+
/// isConstantUsed - Return true if the constant has users other than constant
/// exprs and other dangling things.
bool isConstantUsed() const;
@@ -163,6 +169,14 @@ public:
/// that want to check to see if a global is unused, but don't want to deal
/// with potentially dead constants hanging off of the globals.
void removeDeadConstantUsers() const;
+
+ Constant *stripPointerCasts() {
+ return cast<Constant>(Value::stripPointerCasts());
+ }
+
+ const Constant *stripPointerCasts() const {
+ return const_cast<Constant*>(this)->stripPointerCasts();
+ }
};
} // End llvm namespace
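// A sketch of why the covariant overload helps (Init assumed to be a
// Constant* that may wrap a global in a bitcast expression):
//
//   if (auto *GV = dyn_cast<GlobalVariable>(Init->stripPointerCasts()))
//     ...; // returns Constant*, so no extra cast from Value* is needed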
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 8b05bbb..2673504 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -108,12 +108,23 @@ namespace llvm {
/// Objective-C.
/// @param SplitName The name of the file that we'll split debug info out
/// into.
+ /// @param Kind The kind of debug information to generate.
+ /// @param EmitDebugInfo A boolean flag which indicates whether debug
+ /// information should be written to the final
+ /// output or not. When this is false, debug
+ /// information annotations will be present in
+ /// the IL but will not be written to the final
+ /// assembly or object file. This supports tracking
+ /// source location information in the back end
+ /// without actually changing the output (e.g.,
+ /// when using optimization remarks).
DICompileUnit createCompileUnit(unsigned Lang, StringRef File,
StringRef Dir, StringRef Producer,
bool isOptimized, StringRef Flags,
unsigned RV,
StringRef SplitName = StringRef(),
- DebugEmissionKind Kind = FullDebug);
+ DebugEmissionKind Kind = FullDebug,
+ bool EmitDebugInfo = true);
/// createFile - Create a file descriptor to hold debugging information
/// for a file.
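// A sketch of the new createCompileUnit flag (DIB assumed; argument values
// illustrative): keep location annotations in the IR, emit no debug sections.
//
//   DIB.createCompileUnit(dwarf::DW_LANG_C99, "a.c", "/tmp", "producer",
//                         /*isOptimized=*/true, /*Flags=*/"", /*RV=*/0,
//                         /*SplitName=*/"", DIBuilder::FullDebug,
//                         /*EmitDebugInfo=*/false);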
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 3079dec..877029f 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -414,8 +414,8 @@ public:
return (LargestSize == 0) ? nullptr : Type::getIntNTy(C, LargestSize);
}
- /// getLargestLegalIntType - Return the size of largest legal integer type
- /// size, or 0 if none are set.
+ /// getLargestLegalIntTypeSize - Return the size of the largest legal
+ /// integer type, or 0 if none are set.
unsigned getLargestLegalIntTypeSize() const;
/// getIndexedOffset - return the offset from the beginning of the type for
diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h
index 65e0a06..088eb9f 100644
--- a/include/llvm/IR/DebugInfo.h
+++ b/include/llvm/IR/DebugInfo.h
@@ -690,12 +690,17 @@ public:
/// HasComplexAddr - Return true if the variable has a complex address.
bool hasComplexAddress() const { return getNumAddrElements() > 0; }
- unsigned getNumAddrElements() const;
-
- uint64_t getAddrElement(unsigned Idx) const {
- return getUInt64Field(Idx + 8);
+ /// \brief Return the size of this variable's complex address or
+ /// zero if there is none.
+ unsigned getNumAddrElements() const {
+ if (DbgNode->getNumOperands() < 9)
+ return 0;
+ return getDescriptorField(8)->getNumOperands();
}
+ /// \brief Return the Idx'th complex address element.
+ uint64_t getAddrElement(unsigned Idx) const;
+
/// isBlockByrefVariable - Return true if the variable was declared as
/// a "__block" variable (Apple Blocks).
bool isBlockByrefVariable(const DITypeIdentifierMap &Map) const {
@@ -929,6 +934,9 @@ private:
/// Specify if TypeIdentifierMap is initialized.
bool TypeMapInitialized;
};
+
+DenseMap<const Function *, DISubprogram> makeSubprogramMap(const Module &M);
+
} // end namespace llvm
#endif
diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h
index 6d769d4..3d969a8 100644
--- a/include/llvm/IR/DebugLoc.h
+++ b/include/llvm/IR/DebugLoc.h
@@ -95,7 +95,7 @@ namespace llvm {
// getFnDebugLoc - Walk up the scope chain of the given debug loc and find line
// number info for the function.
- DebugLoc getFnDebugLoc(const LLVMContext &Ctx);
+ DebugLoc getFnDebugLoc(const LLVMContext &Ctx) const;
/// getAsMDNode - This method converts the compressed DebugLoc node into a
/// DILocation compatible MDNode.
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index e78a42b..de38d07 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -138,7 +138,6 @@ public:
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
- /// Hand rolled RTTI.
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_InlineAsm;
}
@@ -166,7 +165,6 @@ public:
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
- /// Hand rolled RTTI.
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_StackSize;
}
@@ -195,7 +193,6 @@ public:
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
- /// Hand rolled RTTI.
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_DebugMetadataVersion;
}
@@ -221,7 +218,6 @@ public:
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
- /// Hand rolled RTTI.
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_SampleProfile;
}
@@ -261,7 +257,6 @@ public:
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
- /// Hand rolled RTTI.
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_OptimizationRemark;
}
@@ -323,7 +318,6 @@ public:
: DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemark, PassName,
Fn, DLoc, Msg) {}
- /// Hand rolled RTTI
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_OptimizationRemark;
}
@@ -350,7 +344,6 @@ public:
: DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkMissed,
PassName, Fn, DLoc, Msg) {}
- /// Hand rolled RTTI
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_OptimizationRemarkMissed;
}
@@ -378,7 +371,6 @@ public:
: DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkAnalysis,
PassName, Fn, DLoc, Msg) {}
- /// Hand rolled RTTI
static bool classof(const DiagnosticInfo *DI) {
return DI->getKind() == DK_OptimizationRemarkAnalysis;
}
diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h
index 3648202..e2d1ccc 100644
--- a/include/llvm/IR/Dominators.h
+++ b/include/llvm/IR/Dominators.h
@@ -97,10 +97,6 @@ public:
bool dominates(const BasicBlockEdge &BBE, const Use &U) const;
bool dominates(const BasicBlockEdge &BBE, const BasicBlock *BB) const;
- inline DomTreeNode *operator[](BasicBlock *BB) const {
- return getNode(BB);
- }
-
// Ensure base class overloads are visible.
using Base::isReachableFromEntry;
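// A migration sketch (DT and BB assumed): the removed operator[] was sugar
// for getNode(), which remains available.
//
//   DomTreeNode *N = DT.getNode(BB); // was: DT[BB]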
diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h
index dbe52bc..a1216a1 100644
--- a/include/llvm/IR/GVMaterializer.h
+++ b/include/llvm/IR/GVMaterializer.h
@@ -18,10 +18,9 @@
#ifndef LLVM_IR_GVMATERIALIZER_H
#define LLVM_IR_GVMATERIALIZER_H
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
-
class Function;
class GlobalValue;
class Module;
@@ -43,7 +42,7 @@ public:
/// Make sure the given GlobalValue is fully read.
///
- virtual error_code Materialize(GlobalValue *GV) = 0;
+ virtual std::error_code Materialize(GlobalValue *GV) = 0;
/// If the given GlobalValue is read in, and if the GVMaterializer supports
/// it, release the memory for the GV, and set it up to be materialized
@@ -54,7 +53,9 @@ public:
/// Make sure the entire Module has been completely read.
///
- virtual error_code MaterializeModule(Module *M) = 0;
+ virtual std::error_code MaterializeModule(Module *M) = 0;
+
+ virtual void releaseBuffer() = 0;
};
} // End llvm namespace
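// A sketch of the std::error_code-based interface (GVM and GV assumed):
//
//   if (std::error_code EC = GVM->Materialize(GV))
//     report_fatal_error("materialization failed: " + EC.message());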
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index d9f0b4a..075b570 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -34,7 +34,7 @@ class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
void setParent(Module *parent);
GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
- const Twine &Name, GlobalObject *Aliasee, Module *Parent);
+ const Twine &Name, Constant *Aliasee, Module *Parent);
public:
// allocate space for exactly one operand
@@ -46,7 +46,7 @@ public:
/// the end of the specified module's alias list.
static GlobalAlias *create(Type *Ty, unsigned AddressSpace,
LinkageTypes Linkage, const Twine &Name,
- GlobalObject *Aliasee, Module *Parent);
+ Constant *Aliasee, Module *Parent);
// Without the Aliasee.
static GlobalAlias *create(Type *Ty, unsigned AddressSpace,
@@ -56,14 +56,14 @@ public:
// The module is taken from the Aliasee.
static GlobalAlias *create(Type *Ty, unsigned AddressSpace,
LinkageTypes Linkage, const Twine &Name,
- GlobalObject *Aliasee);
+ GlobalValue *Aliasee);
// Type, Parent and AddressSpace taken from the Aliasee.
static GlobalAlias *create(LinkageTypes Linkage, const Twine &Name,
- GlobalObject *Aliasee);
+ GlobalValue *Aliasee);
// Linkage, Type, Parent and AddressSpace taken from the Aliasee.
- static GlobalAlias *create(const Twine &Name, GlobalObject *Aliasee);
+ static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee);
/// Provide fast operand accessors
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
@@ -78,14 +78,28 @@ public:
///
void eraseFromParent() override;
- /// set/getAliasee - These methods retrive and set alias target.
- void setAliasee(GlobalObject *GO);
- const GlobalObject *getAliasee() const {
+ /// These methods retrieve and set the alias target.
+ void setAliasee(Constant *Aliasee);
+ const Constant *getAliasee() const {
return const_cast<GlobalAlias *>(this)->getAliasee();
}
+ Constant *getAliasee() {
+ return getOperand(0);
+ }
- GlobalObject *getAliasee() {
- return cast_or_null<GlobalObject>(getOperand(0));
+ const GlobalObject *getBaseObject() const {
+ return const_cast<GlobalAlias *>(this)->getBaseObject();
+ }
+ GlobalObject *getBaseObject() {
+ return dyn_cast<GlobalObject>(getAliasee()->stripInBoundsOffsets());
+ }
+
+ const GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) const {
+ return const_cast<GlobalAlias *>(this)->getBaseObject(DL, Offset);
+ }
+ GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) {
+ return dyn_cast<GlobalObject>(
+ getAliasee()->stripAndAccumulateInBoundsConstantOffsets(DL, Offset));
}
static bool isValidLinkage(LinkageTypes L) {
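// A sketch of the widened interface (GA assumed): the aliasee may now be an
// arbitrary constant expression, so callers that need the underlying
// definition should prefer getBaseObject() over getAliasee().
//
//   Constant *Target = GA->getAliasee();        // may be a GEP/bitcast expr
//   if (GlobalObject *GO = GA->getBaseObject()) // strips in-bounds offsets
//     ...; // GO is the underlying function or variable, if any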
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 3bc8b85..2e042f4 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -20,7 +20,7 @@
#include "llvm/IR/GlobalValue.h"
namespace llvm {
-
+class Comdat;
class Module;
class GlobalObject : public GlobalValue {
@@ -29,21 +29,27 @@ class GlobalObject : public GlobalValue {
protected:
GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
LinkageTypes Linkage, const Twine &Name)
- : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name) {
+ : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name), ObjComdat(nullptr) {
setGlobalValueSubClassData(0);
}
std::string Section; // Section to emit this into, empty means default
+ Comdat *ObjComdat;
public:
unsigned getAlignment() const {
return (1u << getGlobalValueSubClassData()) >> 1;
}
void setAlignment(unsigned Align);
- bool hasSection() const { return !getSection().empty(); }
- const std::string &getSection() const { return Section; }
+ bool hasSection() const { return !StringRef(getSection()).empty(); }
+ const char *getSection() const { return Section.c_str(); }
void setSection(StringRef S);
+ bool hasComdat() const { return getComdat() != nullptr; }
+ const Comdat *getComdat() const { return ObjComdat; }
+ Comdat *getComdat() { return ObjComdat; }
+ void setComdat(Comdat *C) { ObjComdat = C; }
+
void copyAttributesFrom(const GlobalValue *Src) override;
// Methods for support type inquiry through isa, cast, and dyn_cast:
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index 10df372..68e410b 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -23,6 +23,7 @@
namespace llvm {
+class Comdat;
class PointerType;
class Module;
@@ -63,7 +64,8 @@ protected:
LinkageTypes Linkage, const Twine &Name)
: Constant(Ty, VTy, Ops, NumOps), Linkage(Linkage),
Visibility(DefaultVisibility), UnnamedAddr(0),
- DllStorageClass(DefaultStorageClass), Parent(nullptr) {
+ DllStorageClass(DefaultStorageClass),
+ ThreadLocal(NotThreadLocal), Parent(nullptr) {
setName(Name);
}
@@ -74,21 +76,32 @@ protected:
unsigned UnnamedAddr : 1; // This value's address is not significant
unsigned DllStorageClass : 2; // DLL storage class
+ unsigned ThreadLocal : 3; // Is this symbol "Thread Local"? If so, what is
+ // the desired model?
+
private:
// Give subclasses access to what otherwise would be wasted padding.
- // (22 + 2 + 1 + 2 + 5) == 32.
- unsigned SubClassData : 22;
+ // (19 + 3 + 2 + 1 + 2 + 5) == 32.
+ unsigned SubClassData : 19;
protected:
unsigned getGlobalValueSubClassData() const {
return SubClassData;
}
void setGlobalValueSubClassData(unsigned V) {
- assert(V < (1 << 22) && "It will not fit");
+ assert(V < (1 << 19) && "It will not fit");
SubClassData = V;
}
Module *Parent; // The containing module.
public:
+ enum ThreadLocalMode {
+ NotThreadLocal = 0,
+ GeneralDynamicTLSModel,
+ LocalDynamicTLSModel,
+ InitialExecTLSModel,
+ LocalExecTLSModel
+ };
+
~GlobalValue() {
removeDeadConstantUsers(); // remove any dead constants using this.
}
@@ -98,6 +111,12 @@ public:
bool hasUnnamedAddr() const { return UnnamedAddr; }
void setUnnamedAddr(bool Val) { UnnamedAddr = Val; }
+ bool hasComdat() const { return getComdat() != nullptr; }
+ Comdat *getComdat();
+ const Comdat *getComdat() const {
+ return const_cast<GlobalValue *>(this)->getComdat();
+ }
+
VisibilityTypes getVisibility() const { return VisibilityTypes(Visibility); }
bool hasDefaultVisibility() const { return Visibility == DefaultVisibility; }
bool hasHiddenVisibility() const { return Visibility == HiddenVisibility; }
@@ -110,6 +129,19 @@ public:
Visibility = V;
}
+ /// If the value is "Thread Local", its value isn't shared by the threads.
+ bool isThreadLocal() const { return getThreadLocalMode() != NotThreadLocal; }
+ void setThreadLocal(bool Val) {
+ setThreadLocalMode(Val ? GeneralDynamicTLSModel : NotThreadLocal);
+ }
+ void setThreadLocalMode(ThreadLocalMode Val) {
+ assert(Val == NotThreadLocal || getValueID() != Value::FunctionVal);
+ ThreadLocal = Val;
+ }
+ ThreadLocalMode getThreadLocalMode() const {
+ return static_cast<ThreadLocalMode>(ThreadLocal);
+ }
+
DLLStorageClassTypes getDLLStorageClass() const {
return DLLStorageClassTypes(DllStorageClass);
}
@@ -121,8 +153,14 @@ public:
}
void setDLLStorageClass(DLLStorageClassTypes C) { DllStorageClass = C; }
- bool hasSection() const { return !getSection().empty(); }
- const std::string &getSection() const;
+ bool hasSection() const { return !StringRef(getSection()).empty(); }
+ // It is unfortunate that we have to use "char *" in here since this is
+ // always non-null, but:
+ // * The C API expects a null terminated string, so we cannot use StringRef.
+ // * The C API expects us to own it, so we cannot use a std::string.
+ // * For GlobalAliases we can fail to find the section and we have to
+ // return "", so we cannot use a "const std::string &".
+ const char *getSection() const;
/// Global values are always pointers.
inline PointerType *getType() const {
@@ -142,6 +180,9 @@ public:
static bool isAvailableExternallyLinkage(LinkageTypes Linkage) {
return Linkage == AvailableExternallyLinkage;
}
+ static bool isLinkOnceODRLinkage(LinkageTypes Linkage) {
+ return Linkage == LinkOnceODRLinkage;
+ }
static bool isLinkOnceLinkage(LinkageTypes Linkage) {
return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage;
}
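// A sketch of the hoisted TLS interface (GV assumed to be a GlobalVariable*):
// the thread-local mode now lives on GlobalValue, so aliases can carry a TLS
// model as well.
//
//   GV->setThreadLocalMode(GlobalValue::InitialExecTLSModel);
//   if (GV->isThreadLocal())
//     ...;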
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index 8cd4332..4189ccb 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -41,9 +41,6 @@ class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
void setParent(Module *parent);
bool isConstantGlobal : 1; // Is this a global constant?
- unsigned threadLocalMode : 3; // Is this symbol "Thread Local",
- // if so, what is the desired
- // model?
bool isExternallyInitializedConstant : 1; // Is this a global whose value
// can change from its initial
// value before global
@@ -55,14 +52,6 @@ public:
return User::operator new(s, 1);
}
- enum ThreadLocalMode {
- NotThreadLocal = 0,
- GeneralDynamicTLSModel,
- LocalDynamicTLSModel,
- InitialExecTLSModel,
- LocalExecTLSModel
- };
-
/// GlobalVariable ctor - If a parent module is specified, the global is
/// automatically inserted into the end of the specified modules global list.
GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage,
@@ -155,16 +144,6 @@ public:
bool isConstant() const { return isConstantGlobal; }
void setConstant(bool Val) { isConstantGlobal = Val; }
- /// If the value is "Thread Local", its value isn't shared by the threads.
- bool isThreadLocal() const { return threadLocalMode != NotThreadLocal; }
- void setThreadLocal(bool Val) {
- threadLocalMode = Val ? GeneralDynamicTLSModel : NotThreadLocal;
- }
- void setThreadLocalMode(ThreadLocalMode Val) { threadLocalMode = Val; }
- ThreadLocalMode getThreadLocalMode() const {
- return static_cast<ThreadLocalMode>(threadLocalMode);
- }
-
bool isExternallyInitialized() const {
return isExternallyInitializedConstant;
}
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 580d333..00d3684 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -327,6 +327,11 @@ public:
return Type::getIntNTy(Context, N);
}
+ /// \brief Fetch the type representing a 16-bit floating point value.
+ Type *getHalfTy() {
+ return Type::getHalfTy(Context);
+ }
+
/// \brief Fetch the type representing a 32-bit floating point value.
Type *getFloatTy() {
return Type::getFloatTy(Context);
@@ -1464,6 +1469,30 @@ public:
Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32Ty, NumElts));
return CreateShuffleVector(V, Undef, Zeros, Name + ".splat");
}
+
+ /// \brief Return a value that has been extracted from a larger integer type.
+ Value *CreateExtractInteger(const DataLayout &DL, Value *From,
+ IntegerType *ExtractedTy, uint64_t Offset,
+ const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(From->getType());
+ assert(DL.getTypeStoreSize(ExtractedTy) + Offset <=
+ DL.getTypeStoreSize(IntTy) &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ Value *V = From;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) -
+ DL.getTypeStoreSize(ExtractedTy) - Offset);
+ if (ShAmt) {
+ V = CreateLShr(V, ShAmt, Name + ".shift");
+ }
+ assert(ExtractedTy->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (ExtractedTy != IntTy) {
+ V = CreateTrunc(V, ExtractedTy, Name + ".trunc");
+ }
+ return V;
+ }
};
// Create wrappers for C Binding types (see CBindingWrapping.h).
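// A sketch of CreateExtractInteger (Builder, DL, and an i32 value Wide
// assumed): extract the byte at offset 2, honoring target endianness.
//
//   Value *B = Builder.CreateExtractInteger(DL, Wide, Builder.getInt8Ty(),
//                                           /*Offset=*/2, "b2");
//   // big-endian targets shift from the opposite end; the helper handles both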
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 7d338a6..a590f5a 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -500,6 +500,16 @@ public:
(unsigned)V);
}
+ /// Return true if this cmpxchg may spuriously fail.
+ bool isWeak() const {
+ return getSubclassDataFromInstruction() & 0x100;
+ }
+
+ void setWeak(bool IsWeak) {
+ setInstructionSubclassData((getSubclassDataFromInstruction() & ~0x100) |
+ (IsWeak << 8));
+ }
+
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
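// A sketch of the new weak flag (CXI assumed to be an AtomicCmpXchgInst*): a
// weak cmpxchg may fail spuriously, so it is only sound inside a retry loop.
//
//   CXI->setWeak(true);
//   assert(CXI->isWeak());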
@@ -2311,12 +2321,14 @@ public:
(V ? 1 : 0));
}
- /// addClause - Add a catch or filter clause to the landing pad.
- void addClause(Value *ClauseVal);
+ /// Add a catch or filter clause to the landing pad.
+ void addClause(Constant *ClauseVal);
- /// getClause - Get the value of the clause at index Idx. Use isCatch/isFilter
- /// to determine what type of clause this is.
- Value *getClause(unsigned Idx) const { return OperandList[Idx + 1]; }
+ /// Get the value of the clause at index Idx. Use isCatch/isFilter to
+ /// determine what type of clause this is.
+ Constant *getClause(unsigned Idx) const {
+ return cast<Constant>(OperandList[Idx + 1]);
+ }
/// isCatch - Return 'true' if the clause and index Idx is a catch clause.
bool isCatch(unsigned Idx) const {
@@ -2649,6 +2661,9 @@ public:
assert(RHS.SI == SI && "Incompatible operators.");
return RHS.Index != Index;
}
+ Self &operator*() {
+ return *this;
+ }
};
typedef CaseIteratorT<const SwitchInst, const ConstantInt, const BasicBlock>
@@ -2729,6 +2744,17 @@ public:
ConstCaseIt case_end() const {
return ConstCaseIt(this, getNumCases());
}
+
+ /// cases - iteration adapter for range-for loops.
+ iterator_range<CaseIt> cases() {
+ return iterator_range<CaseIt>(case_begin(), case_end());
+ }
+
+ /// cases - iteration adapter for range-for loops.
+ iterator_range<ConstCaseIt> cases() const {
+ return iterator_range<ConstCaseIt>(case_begin(), case_end());
+ }
+
/// Returns an iterator that points to the default case.
/// Note: this iterator allows to resolve successor only. Attempt
/// to resolve case value causes an assertion.
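// A sketch of the new range adapter (SI assumed), enabled by the operator*
// added to CaseIteratorT above:
//
//   for (auto Case : SI->cases())
//     if (Case.getCaseValue()->isZero())
//       visit(Case.getCaseSuccessor()); // visit() is a stand-in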
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index 839bbbd..b0d746b 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -71,6 +71,9 @@ namespace Intrinsic {
/// Map a GCC builtin name to an intrinsic ID.
ID getIntrinsicForGCCBuiltin(const char *Prefix, const char *BuiltinName);
+
+ /// Map a MS builtin name to an intrinsic ID.
+ ID getIntrinsicForMSBuiltin(const char *Prefix, const char *BuiltinName);
/// IITDescriptor - This is a type descriptor which explains the type
/// requirements of an intrinsic. This is returned by
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index edd1621..ae2a90c 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -226,6 +226,10 @@ class GCCBuiltin<string name> {
string GCCBuiltinName = name;
}
+class MSBuiltin<string name> {
+ string MSBuiltinName = name;
+}
+
//===--------------- Variable Argument Handling Intrinsics ----------------===//
//
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 23757aa..e3c0fb3 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -31,6 +31,13 @@ def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
LLVMMatchType<0>], [IntrNoMem]>;
def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
LLVMMatchType<0>], [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
+// RBIT
+
+def int_aarch64_rbit : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
+ [IntrNoMem]>;
+
}
//===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index d19d7b8..a02d707 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -54,8 +54,12 @@ def int_arm_ldaexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>;
//===----------------------------------------------------------------------===//
// Data barrier instructions
-def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>;
-def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">,
+ Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">,
+ Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">,
+ Intrinsic<[], [llvm_i32_ty]>;
//===----------------------------------------------------------------------===//
// VFP
@@ -74,17 +78,21 @@ def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
// Move to coprocessor
def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+ MSBuiltin<"_MoveToCoprocessor">,
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+ MSBuiltin<"_MoveToCoprocessor2">,
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
// Move from coprocessor
def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+ MSBuiltin<"_MoveFromCoprocessor">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty], []>;
def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+ MSBuiltin<"_MoveFromCoprocessor2">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty], []>;
@@ -126,6 +134,11 @@ def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>;
//===----------------------------------------------------------------------===//
+// RBIT
+
+def int_arm_rbit : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
// UND (reserved undefined sequence)
def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>;
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index 26dc70a..6baf018 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td
@@ -796,26 +796,25 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
// Generated within nvvm. Use for ldu on sm_20 or later
-// @TODO: Revisit this, Changed LLVMAnyPointerType to LLVMPointerType
def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldu.global.i">;
def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldu.global.f">;
def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldu.global.p">;
// Generated within nvvm. Use for ldg on sm_35 or later
def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldg.global.i">;
def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldg.global.f">;
def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
- [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+ [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
"llvm.nvvm.ldg.global.p">;
// Use for generic pointers
@@ -889,6 +888,157 @@ def int_nvvm_compiler_error :
def int_nvvm_compiler_warn :
Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">;
+def int_nvvm_reflect :
+ Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.reflect">;
+
+// isspacep.{const, global, local, shared}
+def int_nvvm_isspacep_const
+ : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ "llvm.nvvm.isspacep.const">,
+ GCCBuiltin<"__nvvm_isspacep_const">;
+def int_nvvm_isspacep_global
+ : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ "llvm.nvvm.isspacep.global">,
+ GCCBuiltin<"__nvvm_isspacep_global">;
+def int_nvvm_isspacep_local
+ : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ "llvm.nvvm.isspacep.local">,
+ GCCBuiltin<"__nvvm_isspacep_local">;
+def int_nvvm_isspacep_shared
+ : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], [IntrNoMem],
+ "llvm.nvvm.isspacep.shared">,
+ GCCBuiltin<"__nvvm_isspacep_shared">;
+
+// Environment register read
+def int_nvvm_read_ptx_sreg_envreg0
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg0">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg0">;
+def int_nvvm_read_ptx_sreg_envreg1
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg1">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg1">;
+def int_nvvm_read_ptx_sreg_envreg2
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg2">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg2">;
+def int_nvvm_read_ptx_sreg_envreg3
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg3">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg3">;
+def int_nvvm_read_ptx_sreg_envreg4
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg4">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg4">;
+def int_nvvm_read_ptx_sreg_envreg5
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg5">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg5">;
+def int_nvvm_read_ptx_sreg_envreg6
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg6">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg6">;
+def int_nvvm_read_ptx_sreg_envreg7
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg7">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg7">;
+def int_nvvm_read_ptx_sreg_envreg8
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg8">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg8">;
+def int_nvvm_read_ptx_sreg_envreg9
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg9">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg9">;
+def int_nvvm_read_ptx_sreg_envreg10
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg10">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg10">;
+def int_nvvm_read_ptx_sreg_envreg11
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg11">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg11">;
+def int_nvvm_read_ptx_sreg_envreg12
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg12">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg12">;
+def int_nvvm_read_ptx_sreg_envreg13
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg13">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg13">;
+def int_nvvm_read_ptx_sreg_envreg14
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg14">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg14">;
+def int_nvvm_read_ptx_sreg_envreg15
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg15">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg15">;
+def int_nvvm_read_ptx_sreg_envreg16
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg16">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg16">;
+def int_nvvm_read_ptx_sreg_envreg17
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg17">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg17">;
+def int_nvvm_read_ptx_sreg_envreg18
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg18">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg18">;
+def int_nvvm_read_ptx_sreg_envreg19
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg19">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg19">;
+def int_nvvm_read_ptx_sreg_envreg20
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg20">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg20">;
+def int_nvvm_read_ptx_sreg_envreg21
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg21">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg21">;
+def int_nvvm_read_ptx_sreg_envreg22
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg22">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg22">;
+def int_nvvm_read_ptx_sreg_envreg23
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg23">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg23">;
+def int_nvvm_read_ptx_sreg_envreg24
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg24">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg24">;
+def int_nvvm_read_ptx_sreg_envreg25
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg25">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg25">;
+def int_nvvm_read_ptx_sreg_envreg26
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg26">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg26">;
+def int_nvvm_read_ptx_sreg_envreg27
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg27">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg27">;
+def int_nvvm_read_ptx_sreg_envreg28
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg28">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg28">;
+def int_nvvm_read_ptx_sreg_envreg29
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg29">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg29">;
+def int_nvvm_read_ptx_sreg_envreg30
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg30">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg30">;
+def int_nvvm_read_ptx_sreg_envreg31
+ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem],
+ "llvm.nvvm.read.ptx.sreg.envreg31">,
+ GCCBuiltin<"__nvvm_read_ptx_sreg_envreg31">;
+
// Texture Fetch
def int_nvvm_tex_1d_v4f32_i32
@@ -1800,6 +1950,25 @@ def int_nvvm_sust_p_3d_v4i32_trap
"llvm.nvvm.sust.p.3d.v4i32.trap">,
GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
+def int_nvvm_rotate_b32
+ : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem], "llvm.nvvm.rotate.b32">,
+ GCCBuiltin<"__nvvm_rotate_b32">;
+
+def int_nvvm_rotate_b64
+ :Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem], "llvm.nvvm.rotate.b64">,
+ GCCBuiltin<"__nvvm_rotate_b64">;
+
+def int_nvvm_rotate_right_b64
+ : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem], "llvm.nvvm.rotate.right.b64">,
+ GCCBuiltin<"__nvvm_rotate_right_b64">;
+
+def int_nvvm_swap_lo_hi_b64
+ : Intrinsic<[llvm_i64_ty], [llvm_i64_ty],
+ [IntrNoMem], "llvm.nvvm.swap.lo.hi.b64">,
+ GCCBuiltin<"__nvvm_swap_lo_hi_b64">;
// Old PTX back-end intrinsics retained here for backwards-compatibility
diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td
index ecb5668..ba69eaa 100644
--- a/include/llvm/IR/IntrinsicsR600.td
+++ b/include/llvm/IR/IntrinsicsR600.td
@@ -33,4 +33,40 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
"__builtin_r600_read_tgid">;
defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
"__builtin_r600_read_tidig">;
+
} // End TargetPrefix = "r600"
+
+let TargetPrefix = "AMDGPU" in {
+def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
+ // 1st parameter: Numerator
+ // 2nd parameter: Denominator
+ // 3rd parameter: Constant to select between the first and
+ // second operand (0 = first, 1 = second).
+ Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
+ [IntrNoMem]>;
+
+def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">,
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+def int_AMDGPU_div_fixup : GCCBuiltin<"__builtin_amdgpu_div_fixup">,
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+def int_AMDGPU_trig_preop : GCCBuiltin<"__builtin_amdgpu_trig_preop">,
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_AMDGPU_rcp : GCCBuiltin<"__builtin_amdgpu_rcp">,
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_AMDGPU_rsq : GCCBuiltin<"__builtin_amdgpu_rsq">,
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+} // End TargetPrefix = "AMDGPU"
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 36d93fe..5de9508 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -26,6 +26,12 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>;
}
+// Read Performance-Monitoring Counter.
+let TargetPrefix = "x86" in {
+ def int_x86_rdpmc : GCCBuiltin<"__builtin_ia32_rdpmc">,
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>;
+}
+
//===----------------------------------------------------------------------===//
// 3DNow!
@@ -667,6 +673,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_ssse3_pshuf_b_128 : GCCBuiltin<"__builtin_ia32_pshufb128">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
llvm_v16i8_ty], [IntrNoMem]>;
+ def int_x86_sse2_pshuf_d : GCCBuiltin<"__builtin_ia32_pshufd">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_sse2_pshufl_w : GCCBuiltin<"__builtin_ia32_pshuflw">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty],
+ [IntrNoMem]>;
+ def int_x86_sse2_pshufh_w : GCCBuiltin<"__builtin_ia32_pshufhw">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty],
+ [IntrNoMem]>;
def int_x86_sse_pshuf_w : GCCBuiltin<"__builtin_ia32_pshufw">,
Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
[IntrNoMem]>;
@@ -1304,15 +1319,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Vector load with broadcast
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx_vbroadcast_ss :
- GCCBuiltin<"__builtin_ia32_vbroadcastss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
- def int_x86_avx_vbroadcast_sd_256 :
- GCCBuiltin<"__builtin_ia32_vbroadcastsd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
- def int_x86_avx_vbroadcast_ss_256 :
- GCCBuiltin<"__builtin_ia32_vbroadcastss256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_pd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">,
Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
@@ -1948,6 +1954,8 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}
//===----------------------------------------------------------------------===//
@@ -3141,6 +3149,16 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
llvm_v8i64_ty, llvm_i8_ty],
[]>;
+ def int_x86_avx512_mask_lzcnt_d_512 :
+ GCCBuiltin<"__builtin_ia32_vplzcntd_512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+ llvm_v16i32_ty, llvm_i16_ty],
+ []>;
+ def int_x86_avx512_mask_lzcnt_q_512 :
+ GCCBuiltin<"__builtin_ia32_vplzcntq_512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+ llvm_v8i64_ty, llvm_i8_ty],
+ []>;
}
// Vector blend
diff --git a/include/llvm/IR/LegacyPassNameParser.h b/include/llvm/IR/LegacyPassNameParser.h
index b72fc4c..e2e4912 100644
--- a/include/llvm/IR/LegacyPassNameParser.h
+++ b/include/llvm/IR/LegacyPassNameParser.h
@@ -43,7 +43,7 @@ class PassNameParser : public PassRegistrationListener,
public cl::parser<const PassInfo*> {
cl::Option *Opt;
public:
- PassNameParser() : Opt(nullptr) {}
+ PassNameParser();
virtual ~PassNameParser();
void initialize(cl::Option &O) {
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 0c309e8..26f62db 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -16,6 +16,7 @@
#define LLVM_IR_MODULE_H
#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
@@ -23,13 +24,13 @@
#include "llvm/IR/Metadata.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
-
class FunctionType;
class GVMaterializer;
class LLVMContext;
+class RandomNumberGenerator;
class StructType;
template<typename T> struct DenseMapInfo;
template<typename KeyT, typename ValueT, typename KeyInfoT> class DenseMap;
@@ -123,6 +124,8 @@ public:
typedef iplist<GlobalAlias> AliasListType;
/// The type for the list of named metadata.
typedef ilist<NamedMDNode> NamedMDListType;
+ /// The type of the comdat "symbol" table.
+ typedef StringMap<Comdat> ComdatSymTabType;
/// The Global Variable iterator.
typedef GlobalListType::iterator global_iterator;
@@ -197,11 +200,14 @@ private:
NamedMDListType NamedMDList; ///< The named metadata in the module
std::string GlobalScopeAsm; ///< Inline Asm at global scope.
ValueSymbolTable *ValSymTab; ///< Symbol table for values
+ ComdatSymTabType ComdatSymTab; ///< Symbol table for COMDATs
std::unique_ptr<GVMaterializer>
Materializer; ///< Used to materialize GlobalValues
std::string ModuleID; ///< Human readable identifier for the module
std::string TargetTriple; ///< Platform target triple Module compiled on
void *NamedMDSymTab; ///< NamedMDNode names.
+ // Allow lazy initialization in const method.
+ mutable RandomNumberGenerator *RNG; ///< The random number generator for this module.
// We need to keep the string because the C API expects us to own the string
// representation.
@@ -250,6 +256,11 @@ public:
/// @returns a string containing the module-scope inline assembly blocks.
const std::string &getModuleInlineAsm() const { return GlobalScopeAsm; }
+ /// Get the RandomNumberGenerator for this module. The RNG can be
+ /// seeded via -rng-seed=<uint64> and is salted with the ModuleID.
+ /// The returned RNG should not be shared across threads.
+ RandomNumberGenerator &getRNG() const;
+
/// @}
/// @name Module Level Mutators
/// @{
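// A usage sketch (M assumed to be a Module&): the RNG is lazily created, is
// reproducible for a fixed -rng-seed and module ID, and must stay
// thread-local by contract.
//
//   RandomNumberGenerator &RNG = M.getRNG();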
@@ -397,6 +408,14 @@ public:
void eraseNamedMetadata(NamedMDNode *NMD);
/// @}
+/// @name Comdat Accessors
+/// @{
+
+ /// Return the Comdat in the module with the specified name. It is created
+ /// if it doesn't already exist.
+ Comdat *getOrInsertComdat(StringRef Name);
+
+/// @}
/// @name Module Flags Accessors
/// @{
@@ -454,12 +473,12 @@ public:
void Dematerialize(GlobalValue *GV);
/// Make sure all GlobalValues in this Module are fully read.
- error_code materializeAll();
+ std::error_code materializeAll();
/// Make sure all GlobalValues in this Module are fully read and clear the
/// Materializer. If the module is corrupt, this DOES NOT clear the old
/// Materializer.
- error_code materializeAllPermanently();
+ std::error_code materializeAllPermanently(bool ReleaseBuffer = false);
/// @}
/// @name Direct access to the globals list, functions list, and symbol table
@@ -497,6 +516,10 @@ public:
const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; }
/// Get the Module's symbol table of global variable and function identifiers.
ValueSymbolTable &getValueSymbolTable() { return *ValSymTab; }
+ /// Get the Module's symbol table for COMDATs (constant).
+ const ComdatSymTabType &getComdatSymbolTable() const { return ComdatSymTab; }
+ /// Get the Module's symbol table for COMDATs.
+ ComdatSymTabType &getComdatSymbolTable() { return ComdatSymTab; }
/// @}
/// @name Global Variable Iteration
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index bc7696b..848adae 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -39,6 +39,10 @@ class User : public Value {
friend struct HungoffOperandTraits;
virtual void anchor();
protected:
+ /// NumOperands - The number of values used by this User.
+ ///
+ unsigned NumOperands;
+
/// OperandList - This is a pointer to the array of Uses for this User.
/// For nodes of fixed arity (e.g. a binary operator) this array will live
/// prefixed to some derived class instance. For nodes of resizable variable
@@ -46,13 +50,9 @@ protected:
/// allocated and should be destroyed by the classes' virtual dtor.
Use *OperandList;
- /// NumOperands - The number of values used by this User.
- ///
- unsigned NumOperands;
-
void *operator new(size_t s, unsigned Us);
User(Type *ty, unsigned vty, Use *OpList, unsigned NumOps)
- : Value(ty, vty), OperandList(OpList), NumOperands(NumOps) {}
+ : Value(ty, vty), NumOperands(NumOps), OperandList(OpList) {}
Use *allocHungoffUses(unsigned) const;
void dropHungoffUses() {
Use::zap(OperandList, OperandList + NumOperands, true);
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 0158683..b5bbc96 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -67,6 +67,13 @@ typedef StringMapEntry<Value*> ValueName;
///
/// @brief LLVM Value Representation
class Value {
+ Type *VTy;
+ Use *UseList;
+
+ friend class ValueSymbolTable; // Allow ValueSymbolTable to directly mod Name.
+ friend class ValueHandleBase;
+ ValueName *Name;
+
const unsigned char SubclassID; // Subclass identifier (for isa/dyn_cast)
unsigned char HasValueHandle : 1; // Has a ValueHandle pointing to this?
protected:
@@ -77,6 +84,11 @@ protected:
unsigned char SubclassOptionalData : 7;
private:
+ /// SubclassData - This member is defined by this class, but is not used for
+ /// anything. Subclasses can use it to hold whatever state they find useful.
+ /// This field is initialized to zero by the ctor.
+ unsigned short SubclassData;
+
template <typename UseT> // UseT == 'Use' or 'const Use'
class use_iterator_impl
: public std::iterator<std::forward_iterator_tag, UseT *, ptrdiff_t> {
@@ -167,18 +179,6 @@ private:
unsigned getOperandNo() const { return UI->getOperandNo(); }
};
- /// SubclassData - This member is defined by this class, but is not used for
- /// anything. Subclasses can use it to hold whatever state they find useful.
- /// This field is initialized to zero by the ctor.
- unsigned short SubclassData;
-
- Type *VTy;
- Use *UseList;
-
- friend class ValueSymbolTable; // Allow ValueSymbolTable to directly mod Name.
- friend class ValueHandleBase;
- ValueName *Name;
-
void operator=(const Value &) LLVM_DELETED_FUNCTION;
Value(const Value &) LLVM_DELETED_FUNCTION;
@@ -430,7 +430,7 @@ public:
/// isDereferenceablePointer - Test if this value is always a pointer to
/// allocated and suitably aligned memory for a simple load or store.
- bool isDereferenceablePointer() const;
+ bool isDereferenceablePointer(const DataLayout *DL = nullptr) const;
/// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
/// return the value in the PHI node corresponding to PredBB. If not, return
diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h
index 1503aed..43a79c7 100644
--- a/include/llvm/IR/ValueMap.h
+++ b/include/llvm/IR/ValueMap.h
@@ -45,8 +45,10 @@ class ValueMapConstIterator;
/// This class defines the default behavior for configurable aspects of
/// ValueMap<>. User Configs should inherit from this class to be as compatible
/// as possible with future versions of ValueMap.
-template<typename KeyT>
+template<typename KeyT, typename MutexT = sys::Mutex>
struct ValueMapConfig {
+ typedef MutexT mutex_type;
+
/// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's
/// false, the ValueMap will leave the original mapping in place.
enum { FollowRAUW = true };
@@ -67,7 +69,7 @@ struct ValueMapConfig {
/// and onDelete) and not inside other ValueMap methods. NULL means that no
/// mutex is necessary.
template<typename ExtraDataT>
- static sys::Mutex *getMutex(const ExtraDataT &/*Data*/) { return nullptr; }
+ static mutex_type *getMutex(const ExtraDataT &/*Data*/) { return nullptr; }
};
/// See the file comment.
@@ -85,6 +87,7 @@ public:
typedef KeyT key_type;
typedef ValueT mapped_type;
typedef std::pair<KeyT, ValueT> value_type;
+ typedef unsigned size_type;
explicit ValueMap(unsigned NumInitBuckets = 64)
: Map(NumInitBuckets), Data() {}
@@ -101,16 +104,16 @@ public:
inline const_iterator end() const { return const_iterator(Map.end()); }
bool empty() const { return Map.empty(); }
- unsigned size() const { return Map.size(); }
+ size_type size() const { return Map.size(); }
/// Grow the map so that it has at least Size buckets. Does not shrink
void resize(size_t Size) { Map.resize(Size); }
void clear() { Map.clear(); }
- /// count - Return true if the specified key is in the map.
- bool count(const KeyT &Val) const {
- return Map.find_as(Val) != Map.end();
+ /// Return 1 if the specified key is in the map, 0 otherwise.
+ size_type count(const KeyT &Val) const {
+ return Map.find_as(Val) == Map.end() ? 0 : 1;
}
iterator find(const KeyT &Val) {
@@ -212,7 +215,7 @@ public:
void deleted() override {
// Make a copy that won't get changed even when *this is destroyed.
ValueMapCallbackVH Copy(*this);
- sys::Mutex *M = Config::getMutex(Copy.Map->Data);
+ typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data);
if (M)
M->acquire();
Config::onDelete(Copy.Map->Data, Copy.Unwrap()); // May destroy *this.
@@ -225,7 +228,7 @@ public:
"Invalid RAUW on key of ValueMap<>");
// Make a copy that won't get changed even when *this is destroyed.
ValueMapCallbackVH Copy(*this);
- sys::Mutex *M = Config::getMutex(Copy.Map->Data);
+ typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data);
if (M)
M->acquire();
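// A sketch of the new mutex_type hook (MyMutex assumed; per the callback code
// above, the substituted type only needs acquire()/release()):
//
//   struct LockedConfig : ValueMapConfig<Value *, MyMutex> {
//     template <typename ExtraDataT>
//     static MyMutex *getMutex(const ExtraDataT &D) { return D.Lock; }
//   };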
diff --git a/include/llvm/IRReader/IRReader.h b/include/llvm/IRReader/IRReader.h
index e2ae5f7..59ffc09 100644
--- a/include/llvm/IRReader/IRReader.h
+++ b/include/llvm/IRReader/IRReader.h
@@ -24,13 +24,6 @@ class MemoryBuffer;
class SMDiagnostic;
class LLVMContext;
-/// If the given MemoryBuffer holds a bitcode image, return a Module for it
-/// which does lazy deserialization of function bodies. Otherwise, attempt to
-/// parse it as LLVM Assembly and return a fully populated Module. This
-/// function *always* takes ownership of the given MemoryBuffer.
-Module *getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
- LLVMContext &Context);
-
/// If the given file holds a bitcode image, return a Module
/// for it which does lazy deserialization of function bodies. Otherwise,
/// attempt to parse it as LLVM Assembly and return a fully populated
@@ -40,8 +33,7 @@ Module *getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
/// If the given MemoryBuffer holds a bitcode image, return a Module
/// for it. Otherwise, attempt to parse it as LLVM Assembly and return
-/// a Module for it. This function *always* takes ownership of the given
-/// MemoryBuffer.
+/// a Module for it. This function *never* takes ownership of Buffer.
Module *ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, LLVMContext &Context);
/// If the given file holds a bitcode image, return a Module for it.
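// A sketch of the new ownership contract (Buf assumed to be a caller-owned
// std::unique_ptr<MemoryBuffer>): ParseIR no longer frees the buffer.
//
//   Module *M = ParseIR(Buf.get(), Err, Context);
//   // Buf remains valid here and is released by its owner, not by ParseIR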
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 8e53615..0c840f3 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -146,6 +146,8 @@ void initializeInstCountPass(PassRegistry&);
void initializeInstNamerPass(PassRegistry&);
void initializeInternalizePassPass(PassRegistry&);
void initializeIntervalPartitionPass(PassRegistry&);
+void initializeJumpInstrTableInfoPass(PassRegistry&);
+void initializeJumpInstrTablesPass(PassRegistry&);
void initializeJumpThreadingPass(PassRegistry&);
void initializeLCSSAPass(PassRegistry&);
void initializeLICMPass(PassRegistry&);
@@ -272,6 +274,7 @@ void initializeSLPVectorizerPass(PassRegistry&);
void initializeBBVectorizePass(PassRegistry&);
void initializeMachineFunctionPrinterPassPass(PassRegistry&);
void initializeStackMapLivenessPass(PassRegistry&);
+void initializeLoadCombinePass(PassRegistry&);
}
#endif
diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h
index f1b1480..c43846a 100644
--- a/include/llvm/LTO/LTOModule.h
+++ b/include/llvm/LTO/LTOModule.h
@@ -16,10 +16,10 @@
#include "llvm-c/lto.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/Object/IRObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <string>
#include <vector>
@@ -46,9 +46,8 @@ private:
const GlobalValue *symbol;
};
- std::unique_ptr<Module> _module;
+ std::unique_ptr<object::IRObjectFile> IRFile;
std::unique_ptr<TargetMachine> _target;
- MCObjectFileInfo ObjFileInfo;
StringSet _linkeropt_strings;
std::vector<const char *> _deplibs;
std::vector<const char *> _linkeropts;
@@ -58,25 +57,22 @@ private:
StringSet _defines;
StringMap<NameAndAttributes> _undefines;
std::vector<const char*> _asm_undefines;
- MCContext _context;
- // Use mangler to add GlobalPrefix to names to match linker names.
- Mangler _mangler;
-
- LTOModule(Module *m, TargetMachine *t);
+ LTOModule(std::unique_ptr<object::IRObjectFile> Obj, TargetMachine *TM);
public:
/// Returns 'true' if the file or memory contents is LLVM bitcode.
static bool isBitcodeFile(const void *mem, size_t length);
static bool isBitcodeFile(const char *path);
- /// Returns 'true' if the file or memory contents is LLVM bitcode for the
- /// specified triple.
- static bool isBitcodeFileForTarget(const void *mem,
- size_t length,
- const char *triplePrefix);
- static bool isBitcodeFileForTarget(const char *path,
- const char *triplePrefix);
+ /// Returns 'true' if the memory buffer is LLVM bitcode for the specified
+ /// triple.
+ static bool isBitcodeForTarget(MemoryBuffer *memBuffer,
+ StringRef triplePrefix);
+
+ /// Create a MemoryBuffer from a memory range with an optional name.
+ static MemoryBuffer *makeBuffer(const void *mem, size_t length,
+ StringRef name = "");
/// Create an LTOModule. N.B. These methods take ownership of the buffer. The
/// caller must have initialized the Targets, the TargetMCs, the AsmPrinters,
@@ -86,25 +82,34 @@ public:
/// InitializeAllTargetMCs();
/// InitializeAllAsmPrinters();
/// InitializeAllAsmParsers();
- static LTOModule *makeLTOModule(const char *path, TargetOptions options,
- std::string &errMsg);
- static LTOModule *makeLTOModule(int fd, const char *path, size_t size,
- TargetOptions options, std::string &errMsg);
- static LTOModule *makeLTOModule(int fd, const char *path, size_t map_size,
- off_t offset, TargetOptions options,
- std::string &errMsg);
- static LTOModule *makeLTOModule(const void *mem, size_t length,
- TargetOptions options, std::string &errMsg,
- StringRef path = "");
+ static LTOModule *createFromFile(const char *path, TargetOptions options,
+ std::string &errMsg);
+ static LTOModule *createFromOpenFile(int fd, const char *path, size_t size,
+ TargetOptions options,
+ std::string &errMsg);
+ static LTOModule *createFromOpenFileSlice(int fd, const char *path,
+ size_t map_size, off_t offset,
+ TargetOptions options,
+ std::string &errMsg);
+ static LTOModule *createFromBuffer(const void *mem, size_t length,
+ TargetOptions options, std::string &errMsg,
+ StringRef path = "");
+
+ const Module &getModule() const {
+ return const_cast<LTOModule*>(this)->getModule();
+ }
+ Module &getModule() {
+ return IRFile->getModule();
+ }
/// Return the Module's target triple.
- const char *getTargetTriple() {
- return _module->getTargetTriple().c_str();
+ const std::string &getTargetTriple() {
+ return getModule().getTargetTriple();
}
/// Set the Module's target triple.
- void setTargetTriple(const char *triple) {
- _module->setTargetTriple(triple);
+ void setTargetTriple(StringRef Triple) {
+ getModule().setTargetTriple(Triple);
}
/// Get the number of symbols
@@ -150,9 +155,6 @@ public:
return nullptr;
}
- /// Return the Module.
- Module *getLLVVMModule() { return _module.get(); }
-
const std::vector<const char*> &getAsmUndefinedRefs() {
return _asm_undefines;
}
@@ -167,20 +169,20 @@ private:
bool parseSymbols(std::string &errMsg);
/// Add a symbol which isn't defined just yet to a list to be resolved later.
- void addPotentialUndefinedSymbol(const GlobalValue *dcl, bool isFunc);
+ void addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym,
+ bool isFunc);
/// Add a defined symbol to the list.
- void addDefinedSymbol(const GlobalValue *def, bool isFunction);
-
- /// Add a function symbol as defined to the list.
- void addDefinedFunctionSymbol(const Function *f);
+ void addDefinedSymbol(const char *Name, const GlobalValue *def,
+ bool isFunction);
/// Add a data symbol as defined to the list.
- void addDefinedDataSymbol(const GlobalValue *v);
+ void addDefinedDataSymbol(const object::BasicSymbolRef &Sym);
+ void addDefinedDataSymbol(const char *Name, const GlobalValue *v);
- /// Add global symbols from module-level ASM to the defined or undefined
- /// lists.
- bool addAsmGlobalSymbols(std::string &errMsg);
+ /// Add a function symbol as defined to the list.
+ void addDefinedFunctionSymbol(const object::BasicSymbolRef &Sym);
+ void addDefinedFunctionSymbol(const char *Name, const Function *F);
/// Add a global symbol from module-level ASM to the defined list.
void addAsmGlobalSymbol(const char *, lto_symbol_attributes scope);
@@ -200,17 +202,10 @@ private:
/// Get string that the data pointer points to.
bool objcClassNameFromExpression(const Constant *c, std::string &name);
- /// Returns 'true' if the memory buffer is for the specified target triple.
- static bool isTargetMatch(MemoryBuffer *memBuffer, const char *triplePrefix);
-
/// Create an LTOModule (private version). N.B. This method takes ownership of
/// the buffer.
- static LTOModule *makeLTOModule(MemoryBuffer *buffer, TargetOptions options,
- std::string &errMsg);
-
- /// Create a MemoryBuffer from a memory range with an optional name.
- static MemoryBuffer *makeBuffer(const void *mem, size_t length,
- StringRef name = "");
+ static LTOModule *makeLTOModule(std::unique_ptr<MemoryBuffer> Buffer,
+ TargetOptions options, std::string &errMsg);
};
}
#endif // LTO_MODULE_H
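For illustration, a minimal sketch of driving the renamed factory API, assuming the target initialization described above; the helper name, include path, and error handling are invented for the example:

#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/TargetSelect.h"

// Sketch only: load a bitcode file via the new createFromFile() entry point
// and touch the Module through getModule(), which replaces the removed
// getLLVVMModule().
static bool loadForLTO(const char *Path, std::string &ErrMsg) {
  // Callers must initialize the targets first, per the comment above.
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();
  llvm::InitializeAllAsmParsers();

  llvm::TargetOptions Options;
  llvm::LTOModule *M = llvm::LTOModule::createFromFile(Path, Options, ErrMsg);
  if (!M)
    return false; // ErrMsg describes the failure.
  llvm::Module &Mod = M->getModule();
  (void)Mod; // ... inspect symbols, triple, etc. ...
  delete M;
  return true;
}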
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 2616ebd..b2309ff 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -85,6 +85,8 @@ namespace {
(void) llvm::createIndVarSimplifyPass();
(void) llvm::createInstructionCombiningPass();
(void) llvm::createInternalizePass();
+ (void) llvm::createJumpInstrTableInfoPass();
+ (void) llvm::createJumpInstrTablesPass();
(void) llvm::createLCSSAPass();
(void) llvm::createLICMPass();
(void) llvm::createLazyValueInfoPass();
diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 42b2cb3..6254bbb 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -15,6 +15,8 @@
namespace llvm {
+class Comdat;
+class GlobalValue;
class Module;
class StringRef;
class StructType;
diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h
new file mode 100644
index 0000000..2819b75
--- /dev/null
+++ b/include/llvm/MC/ConstantPools.h
@@ -0,0 +1,80 @@
+//===- ConstantPool.h - Keep track of assembler-generated ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ConstantPool and AssemblerConstantPools classes.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_MC_CONSTANTPOOL_H
+#define LLVM_MC_CONSTANTPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+namespace llvm {
+class MCContext;
+class MCExpr;
+class MCSection;
+class MCStreamer;
+class MCSymbol;
+// A class to keep track of assembler-generated constant pools that are used to
+// implement the ldr-pseudo.
+class ConstantPool {
+ typedef SmallVector<std::pair<MCSymbol *, const MCExpr *>, 4> EntryVecTy;
+ EntryVecTy Entries;
+
+public:
+ // Initialize a new empty constant pool
+ ConstantPool() {}
+
+ // Add a new entry to the constant pool in the next slot.
+ // \param Value is the new entry to put in the constant pool.
+ //
+ // \returns a MCExpr that references the newly inserted value
+ const MCExpr *addEntry(const MCExpr *Value, MCContext &Context);
+
+ // Emit the contents of the constant pool using the provided streamer.
+ void emitEntries(MCStreamer &Streamer);
+
+ // Return true if the constant pool is empty
+ bool empty();
+};
+
+class AssemblerConstantPools {
+ // Map type used to keep track of per-Section constant pools used by the
+ // ldr-pseudo opcode. The map associates a section to its constant pool. The
+ // constant pool is a vector of (label, value) pairs. When the ldr
+ // pseudo is parsed we insert a new (label, value) pair into the constant pool
+ // for the current section and add MCSymbolRefExpr to the new label as
+ // an opcode to the ldr. After we have parsed all the user input we
+ // output the (label, value) pairs in each constant pool at the end of the
+ // section.
+ //
+ // We use the MapVector for the map type to ensure stable iteration of
+ // the sections at the end of the parse. We need to iterate over the
+ // sections in a stable order to ensure that we print the
+ // constant pools in a deterministic order when printing an assembly
+ // file.
+ typedef MapVector<const MCSection *, ConstantPool> ConstantPoolMapTy;
+ ConstantPoolMapTy ConstantPools;
+
+public:
+ AssemblerConstantPools() {}
+ ~AssemblerConstantPools() {}
+
+ void emitAll(MCStreamer &Streamer);
+ void emitForCurrentSection(MCStreamer &Streamer);
+ const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr);
+
+private:
+ ConstantPool *getConstantPool(const MCSection *Section);
+ ConstantPool &getOrCreateConstantPool(const MCSection *Section);
+};
+} // end namespace llvm
+
+#endif
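A rough sketch of how a target streamer might drive these two classes when lowering an ldr-pseudo; the wrapper class below is hypothetical and only the two AssemblerConstantPools calls come from this header:

#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCStreamer.h"

// Hypothetical helper class; names are illustrative.
class LdrPseudoLowering {
  llvm::AssemblerConstantPools Pools;

public:
  // When an "ldr Rt, =value" pseudo is parsed: stash the value in the
  // current section's pool and get back a label reference for the real
  // ldr instruction to use as its operand.
  const llvm::MCExpr *lowerValue(llvm::MCStreamer &Streamer,
                                 const llvm::MCExpr *Value) {
    return Pools.addEntry(Streamer, Value);
  }

  // After all input is parsed: emit every per-section pool at the end of
  // its section, in deterministic insertion order.
  void finish(llvm::MCStreamer &Streamer) { Pools.emitAll(Streamer); }
};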
diff --git a/include/llvm/MC/MCAtom.h b/include/llvm/MC/MCAnalysis/MCAtom.h
index e9d0fba..33f3431 100644
--- a/include/llvm/MC/MCAtom.h
+++ b/include/llvm/MC/MCAnalysis/MCAtom.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAtom.h ----------------------------------------*- C++ -*-===//
+//===-- MCAtom.h ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_MC_MCATOM_H
-#define LLVM_MC_MCATOM_H
+#ifndef LLVM_MC_MCANALYSIS_MCATOM_H
+#define LLVM_MC_MCANALYSIS_MCATOM_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCInst.h"
diff --git a/include/llvm/MC/MCFunction.h b/include/llvm/MC/MCAnalysis/MCFunction.h
index bfa470b..44fa450 100644
--- a/include/llvm/MC/MCFunction.h
+++ b/include/llvm/MC/MCAnalysis/MCFunction.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCFunction.h ------------------------------------*- C++ -*-===//
+//===-- MCFunction.h --------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_MC_MCFUNCTION_H
-#define LLVM_MC_MCFUNCTION_H
+#ifndef LLVM_MC_MCANALYSIS_MCFUNCTION_H
+#define LLVM_MC_MCANALYSIS_MCFUNCTION_H
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
diff --git a/include/llvm/MC/MCModule.h b/include/llvm/MC/MCAnalysis/MCModule.h
index aa389cb..cf7e2c0 100644
--- a/include/llvm/MC/MCModule.h
+++ b/include/llvm/MC/MCAnalysis/MCModule.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCModule.h - MCModule class ---------------------*- C++ -*-===//
+//===-- MCModule.h - MCModule class -----------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_MC_MCMODULE_H
-#define LLVM_MC_MCMODULE_H
+#ifndef LLVM_MC_MCANALYSIS_MCMODULE_H
+#define LLVM_MC_MCANALYSIS_MCMODULE_H
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
diff --git a/include/llvm/MC/MCModuleYAML.h b/include/llvm/MC/MCAnalysis/MCModuleYAML.h
index c4ae829..4856277 100644
--- a/include/llvm/MC/MCModuleYAML.h
+++ b/include/llvm/MC/MCAnalysis/MCModuleYAML.h
@@ -13,11 +13,11 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_MC_MCMODULEYAML_H
-#define LLVM_MC_MCMODULEYAML_H
+#ifndef LLVM_MC_MCANALYSIS_MCMODULEYAML_H
+#define LLVM_MC_MCANALYSIS_MCMODULEYAML_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index f7d3be2..06e473d 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -23,546 +23,496 @@
#include <vector>
namespace llvm {
- class MCExpr;
- class MCSection;
- class MCStreamer;
- class MCSymbol;
- class MCContext;
-
- namespace ExceptionHandling {
- enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 };
+class MCExpr;
+class MCSection;
+class MCStreamer;
+class MCSymbol;
+class MCContext;
+
+namespace WinEH {
+enum class EncodingType {
+ ET_Invalid, /// Invalid
+ ET_Alpha, /// Windows Alpha
+ ET_Alpha64, /// Windows AXP64
+ ET_ARM, /// Windows NT (Windows on ARM)
+ ET_CE, /// Windows CE ARM, PowerPC, SH3, SH4
+ ET_Itanium, /// Windows x64, Windows Itanium (IA-64)
+ ET_MIPS = ET_Alpha,
+};
+}
+
+enum class ExceptionHandling {
+ None, /// No exception support
+ DwarfCFI, /// DWARF-like instruction based exceptions
+ SjLj, /// setjmp/longjmp based exceptions
+ ARM, /// ARM EHABI
+ WinEH, /// Windows Exception Handling
+};
+
+namespace LCOMM {
+enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment };
+}
+
+/// This class is intended to be used as a base class for asm
+/// properties and features specific to the target.
+class MCAsmInfo {
+protected:
+ //===------------------------------------------------------------------===//
+ // Properties to be set by the target writer, used to configure asm printer.
+ //
+
+ /// Pointer size in bytes. Default is 4.
+ unsigned PointerSize;
+
+ /// Size of the stack slot reserved for callee-saved registers, in bytes.
+ /// Default is same as pointer size.
+ unsigned CalleeSaveStackSlotSize;
+
+ /// True if target is little endian. Default is true.
+ bool IsLittleEndian;
+
+ /// True if target stack grow up. Default is false.
+ bool StackGrowsUp;
+
+ /// True if this target has the MachO .subsections_via_symbols directive.
+ /// Default is false.
+ bool HasSubsectionsViaSymbols;
+
+ /// True if this is a MachO target that supports the macho-specific .zerofill
+ /// directive for emitting BSS Symbols. Default is false.
+ bool HasMachoZeroFillDirective;
+
+ /// True if this is a MachO target that supports the macho-specific .tbss
+ /// directive for emitting thread local BSS Symbols. Default is false.
+ bool HasMachoTBSSDirective;
+
+ /// True if the compiler should emit a ".reference .constructors_used" or
+ /// ".reference .destructors_used" directive after the a static ctor/dtor
+ /// list. This directive is only emitted in Static relocation model. Default
+ /// is false.
+ bool HasStaticCtorDtorReferenceInStaticMode;
+
+ /// True if the linker has a bug and requires that the debug_line section be
+ /// of a minimum size. In practice such a linker requires a non-empty line
+ /// sequence if a file is present. Default to false.
+ bool LinkerRequiresNonEmptyDwarfLines;
+
+ /// This is the maximum possible length of an instruction, which is needed to
+ /// compute the size of an inline asm. Defaults to 4.
+ unsigned MaxInstLength;
+
+ /// Every possible instruction length is a multiple of this value. Factored
+ /// out in .debug_frame and .debug_line. Defaults to 1.
+ unsigned MinInstAlignment;
+
+ /// The '$' token, when not referencing an identifier or constant, refers to
+ /// the current PC. Defaults to false.
+ bool DollarIsPC;
+
+ /// This string, if specified, is used to separate instructions from each
+ /// other when on the same line. Defaults to ';'
+ const char *SeparatorString;
+
+ /// This indicates the comment character used by the assembler. Defaults to
+ /// "#"
+ const char *CommentString;
+
+ /// This is appended to emitted labels. Defaults to ":"
+ const char *LabelSuffix;
+
+ /// Print the EH begin symbol with an assignment. Defaults to false.
+ bool UseAssignmentForEHBegin;
+
+ /// This prefix is used for globals like constant pool entries that are
+ /// completely private to the .s file and should not have names in the .o
+ /// file. Defaults to "L"
+ const char *PrivateGlobalPrefix;
+
+ /// This prefix is used for symbols that should be passed through the
+ /// assembler but be removed by the linker. This is 'l' on Darwin, currently
+ /// used for some ObjC metadata. The default of "" means that for this system
+ /// a plain private symbol should be used. Defaults to "".
+ const char *LinkerPrivateGlobalPrefix;
+
+ /// If these are nonempty, they contain a directive to emit before and after
+ /// an inline assembly statement. Defaults to "#APP\n", "#NO_APP\n"
+ const char *InlineAsmStart;
+ const char *InlineAsmEnd;
+
+ /// These are assembly directives that tell the assembler to interpret the
+ /// following instructions differently. Defaults to ".code16", ".code32",
+ /// ".code64".
+ const char *Code16Directive;
+ const char *Code32Directive;
+ const char *Code64Directive;
+
+ /// Which dialect of an assembler variant to use. Defaults to 0
+ unsigned AssemblerDialect;
+
+ /// This is true if the assembler allows @ characters in symbol names.
+ /// Defaults to false.
+ bool AllowAtInName;
+
+ /// This is true if data region markers should be printed as
+ /// ".data_region/.end_data_region" directives. If false, use "$d/$a" labels
+ /// instead.
+ bool UseDataRegionDirectives;
+
+ //===--- Data Emission Directives -------------------------------------===//
+
+ /// This should be set to the directive used to get some number of zero bytes
+ /// emitted to the current section. Common cases are "\t.zero\t" and
+ /// "\t.space\t". If this is set to null, the Data*bitsDirective's will be
+ /// used to emit zero bytes. Defaults to "\t.zero\t"
+ const char *ZeroDirective;
+
+ /// This directive allows emission of an ascii string with the standard C
+ /// escape characters embedded into it. Defaults to "\t.ascii\t"
+ const char *AsciiDirective;
+
+ /// If not null, this allows for special handling of zero terminated strings
+ /// on this target. This is commonly supported as ".asciz". If a target
+ /// doesn't support this, it can be set to null. Defaults to "\t.asciz\t"
+ const char *AscizDirective;
+
+ /// These directives are used to output some unit of integer data to the
+ /// current section. If a data directive is set to null, smaller data
+ /// directives will be used to emit the large sizes. Defaults to "\t.byte\t",
+ /// "\t.short\t", "\t.long\t", "\t.quad\t"
+ const char *Data8bitsDirective;
+ const char *Data16bitsDirective;
+ const char *Data32bitsDirective;
+ const char *Data64bitsDirective;
+
+ /// If non-null, a directive that is used to emit a word which should be
+ /// relocated as a 64-bit GP-relative offset, e.g. .gpdword on Mips. Defaults
+ /// to NULL.
+ const char *GPRel64Directive;
+
+ /// If non-null, a directive that is used to emit a word which should be
+ /// relocated as a 32-bit GP-relative offset, e.g. .gpword on Mips or .gprel32
+ /// on Alpha. Defaults to NULL.
+ const char *GPRel32Directive;
+
+ /// This is true if this target uses "Sun Style" syntax for section switching
+ /// ("#alloc,#write" etc) instead of the normal ELF syntax (,"a,w") in
+ /// .section directives. Defaults to false.
+ bool SunStyleELFSectionSwitchSyntax;
+
+ /// This is true if this target uses the ELF '.section' directive before the
+ /// '.bss' one. It's used for PPC/Linux, which doesn't support the '.bss'
+ /// directive alone. Defaults to false.
+ bool UsesELFSectionDirectiveForBSS;
+
+ bool NeedsDwarfSectionOffsetDirective;
+
+ //===--- Alignment Information ----------------------------------------===//
+
+ /// If this is true (the default) then the asmprinter emits ".align N"
+ /// directives, where N is the number of bytes to align to. Otherwise, it
+ /// emits ".align log2(N)", e.g. 3 to align to an 8 byte boundary. Defaults
+ /// to true.
+ bool AlignmentIsInBytes;
+
+ /// If non-zero, this is used to fill the executable space created as the
+ /// result of an alignment directive. Defaults to 0.
+ unsigned TextAlignFillValue;
+
+ //===--- Global Variable Emission Directives --------------------------===//
+
+ /// This is the directive used to declare a global entity. Defaults to NULL.
+ const char *GlobalDirective;
+
+ /// True if the assembler supports the .set directive. Defaults to true.
+ bool HasSetDirective;
+
+ /// False if the assembler requires that we use
+ /// \code
+ /// Lc = a - b
+ /// .long Lc
+ /// \endcode
+ ///
+ /// instead of
+ ///
+ /// \code
+ /// .long a - b
+ /// \endcode
+ ///
+ /// Defaults to true.
+ bool HasAggressiveSymbolFolding;
+
+ /// True if .comm's and .lcomm's optional alignment is to be specified in bytes
+ /// instead of log2(n). Defaults to true.
+ bool COMMDirectiveAlignmentIsInBytes;
+
+ /// Describes if the .lcomm directive for the target supports an alignment
+ /// argument and how it is interpreted. Defaults to NoAlignment.
+ LCOMM::LCOMMType LCOMMDirectiveAlignmentType;
+
+ /// True if the target has .type and .size directives, this is true for most
+ /// ELF targets. Defaults to true.
+ bool HasDotTypeDotSizeDirective;
+
+ /// True if the target has a single parameter .file directive, this is true
+ /// for ELF targets. Defaults to true.
+ bool HasSingleParameterDotFile;
+
+ /// True if the target has a .ident directive, this is true for ELF targets.
+ /// Defaults to false.
+ bool HasIdentDirective;
+
+ /// True if this target supports the MachO .no_dead_strip directive. Defaults
+ /// to false.
+ bool HasNoDeadStrip;
+
+ /// This directive, if non-null, is used to declare a global as being a weak
+ /// undefined symbol. Defaults to NULL.
+ const char *WeakRefDirective;
+
+ /// True if we have a directive to declare a global as being a weak defined
+ /// symbol. Defaults to false.
+ bool HasWeakDefDirective;
+
+ /// True if we have a directive to declare a global as being a weak defined
+ /// symbol that can be hidden (unexported). Defaults to false.
+ bool HasWeakDefCanBeHiddenDirective;
+
+ /// True if we have a .linkonce directive. This is used on cygwin/mingw.
+ /// Defaults to false.
+ bool HasLinkOnceDirective;
+
+ /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having
+ /// hidden visibility. Defaults to MCSA_Hidden.
+ MCSymbolAttr HiddenVisibilityAttr;
+
+ /// This attribute, if not MCSA_Invalid, is used to declare an undefined
+ /// symbol as having hidden visibility. Defaults to MCSA_Hidden.
+ MCSymbolAttr HiddenDeclarationVisibilityAttr;
+
+ /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having
+ /// protected visibility. Defaults to MCSA_Protected
+ MCSymbolAttr ProtectedVisibilityAttr;
+
+ //===--- Dwarf Emission Directives -----------------------------------===//
+
+ /// True if target asm supports leb128 directives. Defaults to false.
+ bool HasLEB128;
+
+ /// True if target supports emission of debugging information. Defaults to
+ /// false.
+ bool SupportsDebugInformation;
+
+ /// Exception handling format for the target. Defaults to None.
+ ExceptionHandling ExceptionsType;
+
+ /// Windows exception handling data (.pdata) encoding. Defaults to Invalid.
+ WinEH::EncodingType WinEHEncodingType;
+
+ /// True if Dwarf2 output generally uses relocations for references to other
+ /// .debug_* sections.
+ bool DwarfUsesRelocationsAcrossSections;
+
+ /// True if DWARF FDE symbol reference relocations should be replaced by an
+ /// absolute difference.
+ bool DwarfFDESymbolsUseAbsDiff;
+
+ /// True if dwarf register numbers are printed instead of symbolic register
+ /// names in .cfi_* directives. Defaults to false.
+ bool DwarfRegNumForCFI;
+
+ /// True if target uses parens to indicate the symbol variant instead of @.
+ /// For example, foo(plt) instead of foo@plt. Defaults to false.
+ bool UseParensForSymbolVariant;
+
+ //===--- Prologue State ----------------------------------------------===//
+
+ std::vector<MCCFIInstruction> InitialFrameState;
+
+ //===--- Integrated Assembler State ----------------------------------===//
+
+ /// Should we use the integrated assembler?
+ /// The integrated assembler should be enabled by default (by the
+ /// constructors) when failing to parse a valid piece of assembly (inline
+ /// or otherwise) is considered a bug. It may then be overridden after
+ /// construction (see LLVMTargetMachine::initAsmInfo()).
+ bool UseIntegratedAssembler;
+
+ /// Compress DWARF debug sections. Defaults to false.
+ bool CompressDebugSections;
+
+public:
+ explicit MCAsmInfo();
+ virtual ~MCAsmInfo();
+
+ /// Get the pointer size in bytes.
+ unsigned getPointerSize() const { return PointerSize; }
+
+ /// Get the callee-saved register stack slot
+ /// size in bytes.
+ unsigned getCalleeSaveStackSlotSize() const {
+ return CalleeSaveStackSlotSize;
}
- namespace LCOMM {
- enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment };
+ /// True if the target is little endian.
+ bool isLittleEndian() const { return IsLittleEndian; }
+
+ /// True if target stack grow up.
+ bool isStackGrowthDirectionUp() const { return StackGrowsUp; }
+
+ bool hasSubsectionsViaSymbols() const { return HasSubsectionsViaSymbols; }
+
+ // Data directive accessors.
+
+ const char *getData8bitsDirective() const { return Data8bitsDirective; }
+ const char *getData16bitsDirective() const { return Data16bitsDirective; }
+ const char *getData32bitsDirective() const { return Data32bitsDirective; }
+ const char *getData64bitsDirective() const { return Data64bitsDirective; }
+ const char *getGPRel64Directive() const { return GPRel64Directive; }
+ const char *getGPRel32Directive() const { return GPRel32Directive; }
+
+ /// Targets can implement this method to specify a section to switch to if the
+ /// translation unit doesn't have any trampolines that require an executable
+ /// stack.
+ virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const {
+ return nullptr;
}
- /// MCAsmInfo - This class is intended to be used as a base class for asm
- /// properties and features specific to the target.
- class MCAsmInfo {
- protected:
- //===------------------------------------------------------------------===//
- // Properties to be set by the target writer, used to configure asm printer.
- //
-
- /// PointerSize - Pointer size in bytes.
- /// Default is 4.
- unsigned PointerSize;
-
- /// CalleeSaveStackSlotSize - Size of the stack slot reserved for
- /// callee-saved registers, in bytes.
- /// Default is same as pointer size.
- unsigned CalleeSaveStackSlotSize;
-
- /// IsLittleEndian - True if target is little endian.
- /// Default is true.
- bool IsLittleEndian;
-
- /// StackGrowsUp - True if target stack grow up.
- /// Default is false.
- bool StackGrowsUp;
-
- /// HasSubsectionsViaSymbols - True if this target has the MachO
- /// .subsections_via_symbols directive.
- bool HasSubsectionsViaSymbols; // Default is false.
-
- /// HasMachoZeroFillDirective - True if this is a MachO target that supports
- /// the macho-specific .zerofill directive for emitting BSS Symbols.
- bool HasMachoZeroFillDirective; // Default is false.
-
- /// HasMachoTBSSDirective - True if this is a MachO target that supports
- /// the macho-specific .tbss directive for emitting thread local BSS Symbols
- bool HasMachoTBSSDirective; // Default is false.
-
- /// HasStaticCtorDtorReferenceInStaticMode - True if the compiler should
- /// emit a ".reference .constructors_used" or ".reference .destructors_used"
- /// directive after the a static ctor/dtor list. This directive is only
- /// emitted in Static relocation model.
- bool HasStaticCtorDtorReferenceInStaticMode; // Default is false.
-
- /// LinkerRequiresNonEmptyDwarfLines - True if the linker has a bug and
- /// requires that the debug_line section be of a minimum size. In practice
- /// such a linker requires a non-empty line sequence if a file is present.
- bool LinkerRequiresNonEmptyDwarfLines; // Default to false.
-
- /// MaxInstLength - This is the maximum possible length of an instruction,
- /// which is needed to compute the size of an inline asm.
- unsigned MaxInstLength; // Defaults to 4.
-
- /// MinInstAlignment - Every possible instruction length is a multiple of
- /// this value. Factored out in .debug_frame and .debug_line.
- unsigned MinInstAlignment; // Defaults to 1.
-
- /// DollarIsPC - The '$' token, when not referencing an identifier or
- /// constant, refers to the current PC.
- bool DollarIsPC; // Defaults to false.
-
- /// SeparatorString - This string, if specified, is used to separate
- /// instructions from each other when on the same line.
- const char *SeparatorString; // Defaults to ';'
-
- /// CommentString - This indicates the comment character used by the
- /// assembler.
- const char *CommentString; // Defaults to "#"
-
- /// LabelSuffix - This is appended to emitted labels.
- const char *LabelSuffix; // Defaults to ":"
-
- /// LabelSuffix - This is appended to emitted labels.
- const char *DebugLabelSuffix; // Defaults to ":"
-
- /// This prefix is used for globals like constant pool entries that are
- /// completely private to the .s file and should not have names in the .o
- /// file.
- const char *PrivateGlobalPrefix; // Defaults to "L"
-
- /// This prefix is used for symbols that should be passed through the
- /// assembler but be removed by the linker. This is 'l' on Darwin,
- /// currently used for some ObjC metadata.
- /// The default of "" meast that for this system a plain private symbol
- /// should be used.
- const char *LinkerPrivateGlobalPrefix; // Defaults to "".
-
- /// InlineAsmStart/End - If these are nonempty, they contain a directive to
- /// emit before and after an inline assembly statement.
- const char *InlineAsmStart; // Defaults to "#APP\n"
- const char *InlineAsmEnd; // Defaults to "#NO_APP\n"
-
- /// Code16Directive, Code32Directive, Code64Directive - These are assembly
- /// directives that tells the assembler to interpret the following
- /// instructions differently.
- const char *Code16Directive; // Defaults to ".code16"
- const char *Code32Directive; // Defaults to ".code32"
- const char *Code64Directive; // Defaults to ".code64"
-
- /// AssemblerDialect - Which dialect of an assembler variant to use.
- unsigned AssemblerDialect; // Defaults to 0
-
- /// \brief This is true if the assembler allows @ characters in symbol
- /// names. Defaults to false.
- bool AllowAtInName;
-
- /// UseDataRegionDirectives - This is true if data region markers should
- /// be printed as ".data_region/.end_data_region" directives. If false,
- /// use "$d/$a" labels instead.
- bool UseDataRegionDirectives;
-
- //===--- Data Emission Directives -------------------------------------===//
-
- /// ZeroDirective - this should be set to the directive used to get some
- /// number of zero bytes emitted to the current section. Common cases are
- /// "\t.zero\t" and "\t.space\t". If this is set to null, the
- /// Data*bitsDirective's will be used to emit zero bytes.
- const char *ZeroDirective; // Defaults to "\t.zero\t"
-
- /// AsciiDirective - This directive allows emission of an ascii string with
- /// the standard C escape characters embedded into it.
- const char *AsciiDirective; // Defaults to "\t.ascii\t"
-
- /// AscizDirective - If not null, this allows for special handling of
- /// zero terminated strings on this target. This is commonly supported as
- /// ".asciz". If a target doesn't support this, it can be set to null.
- const char *AscizDirective; // Defaults to "\t.asciz\t"
-
- /// DataDirectives - These directives are used to output some unit of
- /// integer data to the current section. If a data directive is set to
- /// null, smaller data directives will be used to emit the large sizes.
- const char *Data8bitsDirective; // Defaults to "\t.byte\t"
- const char *Data16bitsDirective; // Defaults to "\t.short\t"
- const char *Data32bitsDirective; // Defaults to "\t.long\t"
- const char *Data64bitsDirective; // Defaults to "\t.quad\t"
-
- /// GPRel64Directive - if non-null, a directive that is used to emit a word
- /// which should be relocated as a 64-bit GP-relative offset, e.g. .gpdword
- /// on Mips.
- const char *GPRel64Directive; // Defaults to NULL.
-
- /// GPRel32Directive - if non-null, a directive that is used to emit a word
- /// which should be relocated as a 32-bit GP-relative offset, e.g. .gpword
- /// on Mips or .gprel32 on Alpha.
- const char *GPRel32Directive; // Defaults to NULL.
-
- /// SunStyleELFSectionSwitchSyntax - This is true if this target uses "Sun
- /// Style" syntax for section switching ("#alloc,#write" etc) instead of the
- /// normal ELF syntax (,"a,w") in .section directives.
- bool SunStyleELFSectionSwitchSyntax; // Defaults to false.
-
- /// UsesELFSectionDirectiveForBSS - This is true if this target uses ELF
- /// '.section' directive before the '.bss' one. It's used for PPC/Linux
- /// which doesn't support the '.bss' directive only.
- bool UsesELFSectionDirectiveForBSS; // Defaults to false.
-
- bool NeedsDwarfSectionOffsetDirective;
-
- //===--- Alignment Information ----------------------------------------===//
-
- /// AlignmentIsInBytes - If this is true (the default) then the asmprinter
- /// emits ".align N" directives, where N is the number of bytes to align to.
- /// Otherwise, it emits ".align log2(N)", e.g. 3 to align to an 8 byte
- /// boundary.
- bool AlignmentIsInBytes; // Defaults to true
-
- /// TextAlignFillValue - If non-zero, this is used to fill the executable
- /// space created as the result of a alignment directive.
- unsigned TextAlignFillValue; // Defaults to 0
+ virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+
+ virtual const MCExpr *getExprForFDESymbol(const MCSymbol *Sym,
+ unsigned Encoding,
+ MCStreamer &Streamer) const;
+
+ bool usesSunStyleELFSectionSwitchSyntax() const {
+ return SunStyleELFSectionSwitchSyntax;
+ }
- //===--- Global Variable Emission Directives --------------------------===//
+ bool usesELFSectionDirectiveForBSS() const {
+ return UsesELFSectionDirectiveForBSS;
+ }
+
+ bool needsDwarfSectionOffsetDirective() const {
+ return NeedsDwarfSectionOffsetDirective;
+ }
+
+ // Accessors.
+
+ bool hasMachoZeroFillDirective() const { return HasMachoZeroFillDirective; }
+ bool hasMachoTBSSDirective() const { return HasMachoTBSSDirective; }
+ bool hasStaticCtorDtorReferenceInStaticMode() const {
+ return HasStaticCtorDtorReferenceInStaticMode;
+ }
+ bool getLinkerRequiresNonEmptyDwarfLines() const {
+ return LinkerRequiresNonEmptyDwarfLines;
+ }
+ unsigned getMaxInstLength() const { return MaxInstLength; }
+ unsigned getMinInstAlignment() const { return MinInstAlignment; }
+ bool getDollarIsPC() const { return DollarIsPC; }
+ const char *getSeparatorString() const { return SeparatorString; }
+
+ /// This indicates the column (zero-based) at which asm comments should be
+ /// printed.
+ unsigned getCommentColumn() const { return 40; }
+
+ const char *getCommentString() const { return CommentString; }
+ const char *getLabelSuffix() const { return LabelSuffix; }
+
+ bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; }
+ const char *getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; }
+ bool hasLinkerPrivateGlobalPrefix() const {
+ return LinkerPrivateGlobalPrefix[0] != '\0';
+ }
+ const char *getLinkerPrivateGlobalPrefix() const {
+ if (hasLinkerPrivateGlobalPrefix())
+ return LinkerPrivateGlobalPrefix;
+ return getPrivateGlobalPrefix();
+ }
+ const char *getInlineAsmStart() const { return InlineAsmStart; }
+ const char *getInlineAsmEnd() const { return InlineAsmEnd; }
+ const char *getCode16Directive() const { return Code16Directive; }
+ const char *getCode32Directive() const { return Code32Directive; }
+ const char *getCode64Directive() const { return Code64Directive; }
+ unsigned getAssemblerDialect() const { return AssemblerDialect; }
+ bool doesAllowAtInName() const { return AllowAtInName; }
+ bool doesSupportDataRegionDirectives() const {
+ return UseDataRegionDirectives;
+ }
+ const char *getZeroDirective() const { return ZeroDirective; }
+ const char *getAsciiDirective() const { return AsciiDirective; }
+ const char *getAscizDirective() const { return AscizDirective; }
+ bool getAlignmentIsInBytes() const { return AlignmentIsInBytes; }
+ unsigned getTextAlignFillValue() const { return TextAlignFillValue; }
+ const char *getGlobalDirective() const { return GlobalDirective; }
+ bool hasSetDirective() const { return HasSetDirective; }
+ bool hasAggressiveSymbolFolding() const { return HasAggressiveSymbolFolding; }
+ bool getCOMMDirectiveAlignmentIsInBytes() const {
+ return COMMDirectiveAlignmentIsInBytes;
+ }
+ LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const {
+ return LCOMMDirectiveAlignmentType;
+ }
+ bool hasDotTypeDotSizeDirective() const { return HasDotTypeDotSizeDirective; }
+ bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
+ bool hasIdentDirective() const { return HasIdentDirective; }
+ bool hasNoDeadStrip() const { return HasNoDeadStrip; }
+ const char *getWeakRefDirective() const { return WeakRefDirective; }
+ bool hasWeakDefDirective() const { return HasWeakDefDirective; }
+ bool hasWeakDefCanBeHiddenDirective() const {
+ return HasWeakDefCanBeHiddenDirective;
+ }
+ bool hasLinkOnceDirective() const { return HasLinkOnceDirective; }
- /// GlobalDirective - This is the directive used to declare a global entity.
- ///
- const char *GlobalDirective; // Defaults to NULL.
-
- /// HasSetDirective - True if the assembler supports the .set directive.
- bool HasSetDirective; // Defaults to true.
-
- /// HasAggressiveSymbolFolding - False if the assembler requires that we use
- /// Lc = a - b
- /// .long Lc
- /// instead of
- /// .long a - b
- bool HasAggressiveSymbolFolding; // Defaults to true.
+ MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr; }
+ MCSymbolAttr getHiddenDeclarationVisibilityAttr() const {
+ return HiddenDeclarationVisibilityAttr;
+ }
+ MCSymbolAttr getProtectedVisibilityAttr() const {
+ return ProtectedVisibilityAttr;
+ }
+ bool hasLEB128() const { return HasLEB128; }
+ bool doesSupportDebugInformation() const { return SupportsDebugInformation; }
+ bool doesSupportExceptionHandling() const {
+ return ExceptionsType != ExceptionHandling::None;
+ }
+ ExceptionHandling getExceptionHandlingType() const { return ExceptionsType; }
+ WinEH::EncodingType getWinEHEncodingType() const { return WinEHEncodingType; }
+ bool isExceptionHandlingDwarf() const {
+ return (ExceptionsType == ExceptionHandling::DwarfCFI ||
+ ExceptionsType == ExceptionHandling::ARM ||
+ // Windows handler data still uses DWARF LSDA encoding.
+ ExceptionsType == ExceptionHandling::WinEH);
+ }
+ bool doesDwarfUseRelocationsAcrossSections() const {
+ return DwarfUsesRelocationsAcrossSections;
+ }
+ bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; }
+ bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; }
+ bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; }
- /// COMMDirectiveAlignmentIsInBytes - True is .comm's and .lcomms optional
- /// alignment is to be specified in bytes instead of log2(n).
- bool COMMDirectiveAlignmentIsInBytes; // Defaults to true;
+ void addInitialFrameState(const MCCFIInstruction &Inst) {
+ InitialFrameState.push_back(Inst);
+ }
- /// LCOMMDirectiveAlignment - Describes if the .lcomm directive for the
- /// target supports an alignment argument and how it is interpreted.
- LCOMM::LCOMMType LCOMMDirectiveAlignmentType; // Defaults to NoAlignment.
-
- /// HasDotTypeDotSizeDirective - True if the target has .type and .size
- /// directives, this is true for most ELF targets.
- bool HasDotTypeDotSizeDirective; // Defaults to true.
+ const std::vector<MCCFIInstruction> &getInitialFrameState() const {
+ return InitialFrameState;
+ }
- /// HasSingleParameterDotFile - True if the target has a single parameter
- /// .file directive, this is true for ELF targets.
- bool HasSingleParameterDotFile; // Defaults to true.
+ /// Return true if assembly (inline or otherwise) should be parsed.
+ bool useIntegratedAssembler() const { return UseIntegratedAssembler; }
- /// hasIdentDirective - True if the target has a .ident directive, this is
- /// true for ELF targets.
- bool HasIdentDirective; // Defaults to false.
+ /// Set whether assembly (inline or otherwise) should be parsed.
+ virtual void setUseIntegratedAssembler(bool Value) {
+ UseIntegratedAssembler = Value;
+ }
- /// HasNoDeadStrip - True if this target supports the MachO .no_dead_strip
- /// directive.
- bool HasNoDeadStrip; // Defaults to false.
+ bool compressDebugSections() const { return CompressDebugSections; }
- /// WeakRefDirective - This directive, if non-null, is used to declare a
- /// global as being a weak undefined symbol.
- const char *WeakRefDirective; // Defaults to NULL.
-
- /// True if we have a directive to declare a global as being a weak
- /// defined symbol.
- bool HasWeakDefDirective; // Defaults to false.
-
- /// True if we have a directive to declare a global as being a weak
- /// defined symbol that can be hidden (unexported).
- bool HasWeakDefCanBeHiddenDirective; // Defaults to false.
-
- /// True if we have a .linkonce directive. This is used on cygwin/mingw.
- bool HasLinkOnceDirective; // Defaults to false.
-
- /// HiddenVisibilityAttr - This attribute, if not MCSA_Invalid, is used to
- /// declare a symbol as having hidden visibility.
- MCSymbolAttr HiddenVisibilityAttr; // Defaults to MCSA_Hidden.
-
- /// HiddenDeclarationVisibilityAttr - This attribute, if not MCSA_Invalid,
- /// is used to declare an undefined symbol as having hidden visibility.
- MCSymbolAttr HiddenDeclarationVisibilityAttr; // Defaults to MCSA_Hidden.
-
-
- /// ProtectedVisibilityAttr - This attribute, if not MCSA_Invalid, is used
- /// to declare a symbol as having protected visibility.
- MCSymbolAttr ProtectedVisibilityAttr; // Defaults to MCSA_Protected
-
- //===--- Dwarf Emission Directives -----------------------------------===//
-
- /// HasLEB128 - True if target asm supports leb128 directives.
- bool HasLEB128; // Defaults to false.
-
- /// SupportsDebugInformation - True if target supports emission of debugging
- /// information.
- bool SupportsDebugInformation; // Defaults to false.
-
- /// SupportsExceptionHandling - True if target supports exception handling.
- ExceptionHandling::ExceptionsType ExceptionsType; // Defaults to None
-
- /// DwarfUsesRelocationsAcrossSections - True if Dwarf2 output generally
- /// uses relocations for references to other .debug_* sections.
- bool DwarfUsesRelocationsAcrossSections;
-
- /// DwarfFDESymbolsUseAbsDiff - true if DWARF FDE symbol reference
- /// relocations should be replaced by an absolute difference.
- bool DwarfFDESymbolsUseAbsDiff;
-
- /// DwarfRegNumForCFI - True if dwarf register numbers are printed
- /// instead of symbolic register names in .cfi_* directives.
- bool DwarfRegNumForCFI; // Defaults to false;
-
- /// UseParensForSymbolVariant - True if target uses parens to indicate the
- /// symbol variant instead of @. For example, foo(plt) instead of foo@plt.
- bool UseParensForSymbolVariant; // Defaults to false;
-
- //===--- Prologue State ----------------------------------------------===//
-
- std::vector<MCCFIInstruction> InitialFrameState;
-
- //===--- Integrated Assembler State ----------------------------------===//
- /// Should we use the integrated assembler?
- /// The integrated assembler should be enabled by default (by the
- /// constructors) when failing to parse a valid piece of assembly (inline
- /// or otherwise) is considered a bug. It may then be overridden after
- /// construction (see LLVMTargetMachine::initAsmInfo()).
- bool UseIntegratedAssembler;
-
- /// Compress DWARF debug sections. Defaults to false.
- bool CompressDebugSections;
-
- public:
- explicit MCAsmInfo();
- virtual ~MCAsmInfo();
-
- /// getPointerSize - Get the pointer size in bytes.
- unsigned getPointerSize() const {
- return PointerSize;
- }
-
- /// getCalleeSaveStackSlotSize - Get the callee-saved register stack slot
- /// size in bytes.
- unsigned getCalleeSaveStackSlotSize() const {
- return CalleeSaveStackSlotSize;
- }
-
- /// isLittleEndian - True if the target is little endian.
- bool isLittleEndian() const {
- return IsLittleEndian;
- }
-
- /// isStackGrowthDirectionUp - True if target stack grow up.
- bool isStackGrowthDirectionUp() const {
- return StackGrowsUp;
- }
-
- bool hasSubsectionsViaSymbols() const { return HasSubsectionsViaSymbols; }
-
- // Data directive accessors.
- //
- const char *getData8bitsDirective() const {
- return Data8bitsDirective;
- }
- const char *getData16bitsDirective() const {
- return Data16bitsDirective;
- }
- const char *getData32bitsDirective() const {
- return Data32bitsDirective;
- }
- const char *getData64bitsDirective() const {
- return Data64bitsDirective;
- }
- const char *getGPRel64Directive() const { return GPRel64Directive; }
- const char *getGPRel32Directive() const { return GPRel32Directive; }
-
- /// getNonexecutableStackSection - Targets can implement this method to
- /// specify a section to switch to if the translation unit doesn't have any
- /// trampolines that require an executable stack.
- virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const{
- return nullptr;
- }
-
- virtual const MCExpr *
- getExprForPersonalitySymbol(const MCSymbol *Sym,
- unsigned Encoding,
- MCStreamer &Streamer) const;
-
- virtual const MCExpr *
- getExprForFDESymbol(const MCSymbol *Sym,
- unsigned Encoding,
- MCStreamer &Streamer) const;
-
- bool usesSunStyleELFSectionSwitchSyntax() const {
- return SunStyleELFSectionSwitchSyntax;
- }
-
- bool usesELFSectionDirectiveForBSS() const {
- return UsesELFSectionDirectiveForBSS;
- }
-
- bool needsDwarfSectionOffsetDirective() const {
- return NeedsDwarfSectionOffsetDirective;
- }
-
- // Accessors.
- //
- bool hasMachoZeroFillDirective() const { return HasMachoZeroFillDirective; }
- bool hasMachoTBSSDirective() const { return HasMachoTBSSDirective; }
- bool hasStaticCtorDtorReferenceInStaticMode() const {
- return HasStaticCtorDtorReferenceInStaticMode;
- }
- bool getLinkerRequiresNonEmptyDwarfLines() const {
- return LinkerRequiresNonEmptyDwarfLines;
- }
- unsigned getMaxInstLength() const {
- return MaxInstLength;
- }
- unsigned getMinInstAlignment() const {
- return MinInstAlignment;
- }
- bool getDollarIsPC() const {
- return DollarIsPC;
- }
- const char *getSeparatorString() const {
- return SeparatorString;
- }
-
- /// This indicates the column (zero-based) at which asm comments should be
- /// printed.
- unsigned getCommentColumn() const {
- return 40;
- }
-
- const char *getCommentString() const {
- return CommentString;
- }
- const char *getLabelSuffix() const {
- return LabelSuffix;
- }
-
- const char *getDebugLabelSuffix() const {
- return DebugLabelSuffix;
- }
- const char *getPrivateGlobalPrefix() const {
- return PrivateGlobalPrefix;
- }
- bool hasLinkerPrivateGlobalPrefix() const {
- return LinkerPrivateGlobalPrefix[0] != '\0';
- }
- const char *getLinkerPrivateGlobalPrefix() const {
- if (hasLinkerPrivateGlobalPrefix())
- return LinkerPrivateGlobalPrefix;
- return getPrivateGlobalPrefix();
- }
- const char *getInlineAsmStart() const {
- return InlineAsmStart;
- }
- const char *getInlineAsmEnd() const {
- return InlineAsmEnd;
- }
- const char *getCode16Directive() const {
- return Code16Directive;
- }
- const char *getCode32Directive() const {
- return Code32Directive;
- }
- const char *getCode64Directive() const {
- return Code64Directive;
- }
- unsigned getAssemblerDialect() const {
- return AssemblerDialect;
- }
- bool doesAllowAtInName() const {
- return AllowAtInName;
- }
- bool doesSupportDataRegionDirectives() const {
- return UseDataRegionDirectives;
- }
- const char *getZeroDirective() const {
- return ZeroDirective;
- }
- const char *getAsciiDirective() const {
- return AsciiDirective;
- }
- const char *getAscizDirective() const {
- return AscizDirective;
- }
- bool getAlignmentIsInBytes() const {
- return AlignmentIsInBytes;
- }
- unsigned getTextAlignFillValue() const {
- return TextAlignFillValue;
- }
- const char *getGlobalDirective() const {
- return GlobalDirective;
- }
- bool hasSetDirective() const { return HasSetDirective; }
- bool hasAggressiveSymbolFolding() const {
- return HasAggressiveSymbolFolding;
- }
- bool getCOMMDirectiveAlignmentIsInBytes() const {
- return COMMDirectiveAlignmentIsInBytes;
- }
- LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const {
- return LCOMMDirectiveAlignmentType;
- }
- bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
- bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
- bool hasIdentDirective() const { return HasIdentDirective; }
- bool hasNoDeadStrip() const { return HasNoDeadStrip; }
- const char *getWeakRefDirective() const { return WeakRefDirective; }
- bool hasWeakDefDirective() const { return HasWeakDefDirective; }
- bool hasWeakDefCanBeHiddenDirective() const {
- return HasWeakDefCanBeHiddenDirective;
- }
- bool hasLinkOnceDirective() const { return HasLinkOnceDirective; }
-
- MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr;}
- MCSymbolAttr getHiddenDeclarationVisibilityAttr() const {
- return HiddenDeclarationVisibilityAttr;
- }
- MCSymbolAttr getProtectedVisibilityAttr() const {
- return ProtectedVisibilityAttr;
- }
- bool hasLEB128() const {
- return HasLEB128;
- }
- bool doesSupportDebugInformation() const {
- return SupportsDebugInformation;
- }
- bool doesSupportExceptionHandling() const {
- return ExceptionsType != ExceptionHandling::None;
- }
- ExceptionHandling::ExceptionsType getExceptionHandlingType() const {
- return ExceptionsType;
- }
- bool isExceptionHandlingDwarf() const {
- return
- (ExceptionsType == ExceptionHandling::DwarfCFI ||
- ExceptionsType == ExceptionHandling::ARM ||
- ExceptionsType == ExceptionHandling::Win64);
- }
- bool doesDwarfUseRelocationsAcrossSections() const {
- return DwarfUsesRelocationsAcrossSections;
- }
- bool doDwarfFDESymbolsUseAbsDiff() const {
- return DwarfFDESymbolsUseAbsDiff;
- }
- bool useDwarfRegNumForCFI() const {
- return DwarfRegNumForCFI;
- }
- bool useParensForSymbolVariant() const {
- return UseParensForSymbolVariant;
- }
-
- void addInitialFrameState(const MCCFIInstruction &Inst) {
- InitialFrameState.push_back(Inst);
- }
-
- const std::vector<MCCFIInstruction> &getInitialFrameState() const {
- return InitialFrameState;
- }
-
- /// Return true if assembly (inline or otherwise) should be parsed.
- bool useIntegratedAssembler() const { return UseIntegratedAssembler; }
-
- /// Set whether assembly (inline or otherwise) should be parsed.
- virtual void setUseIntegratedAssembler(bool Value) {
- UseIntegratedAssembler = Value;
- }
-
- bool compressDebugSections() const { return CompressDebugSections; }
-
- void setCompressDebugSections(bool CompressDebugSections) {
- this->CompressDebugSections = CompressDebugSections;
- }
- };
+ void setCompressDebugSections(bool CompressDebugSections) {
+ this->CompressDebugSections = CompressDebugSections;
+ }
+};
}
#endif
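To illustrate the reflowed class and the scoped ExceptionHandling enum, a hypothetical target subclass might configure itself as follows; every name and value here is invented for the example, and the assignments are the protected knobs documented above:

#include "llvm/MC/MCAsmInfo.h"

namespace llvm {
// Hypothetical target; overrides the defaults set by the MCAsmInfo
// constructor.
class ExampleTargetMCAsmInfo : public MCAsmInfo {
public:
  ExampleTargetMCAsmInfo() {
    PointerSize = 8;          // 64-bit pointers
    CommentString = "//";     // assembler comment marker
    SupportsDebugInformation = true;
    // The scoped enum replaces the old ExceptionHandling::ExceptionsType.
    ExceptionsType = ExceptionHandling::DwarfCFI;
  }
};
} // end namespace llvm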
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index be13b36..1cb34c2 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -802,7 +802,7 @@ public:
/// @}
- void dump();
+ void dump() const;
};
// FIXME: This really doesn't belong here. See comments below.
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 7557e76..eb0340f 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -11,15 +11,18 @@
#define LLVM_MC_MCCONTEXT_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
+#include <tuple>
#include <vector> // FIXME: Shouldn't be needed.
namespace llvm {
@@ -129,11 +132,10 @@ namespace llvm {
/// assembly source files.
unsigned GenDwarfFileNumber;
- /// The default initial text section that we generate dwarf debugging line
- /// info for when generating dwarf assembly source files.
- const MCSection *GenDwarfSection;
- /// Symbols created for the start and end of this section.
- MCSymbol *GenDwarfSectionStartSym, *GenDwarfSectionEndSym;
+ /// Symbols created for the start and end of each section, used for
+ /// generating the .debug_ranges and .debug_aranges sections.
+ MapVector<const MCSection *, std::pair<MCSymbol *, MCSymbol *> >
+ SectionStartEndSyms;
/// The information gathered from labels that will have dwarf label
/// entries when generating dwarf assembly source files.
@@ -159,10 +161,11 @@ namespace llvm {
unsigned DwarfCompileUnitID;
typedef std::pair<std::string, std::string> SectionGroupPair;
+ typedef std::tuple<std::string, std::string, int> SectionGroupTriple;
StringMap<const MCSectionMachO*> MachOUniquingMap;
std::map<SectionGroupPair, const MCSectionELF *> ELFUniquingMap;
- std::map<SectionGroupPair, const MCSectionCOFF *> COFFUniquingMap;
+ std::map<SectionGroupTriple, const MCSectionCOFF *> COFFUniquingMap;
/// Do automatic reset in destructor
bool AutoReset;
@@ -273,9 +276,7 @@ namespace llvm {
const MCSectionCOFF *getCOFFSection(StringRef Section,
unsigned Characteristics,
SectionKind Kind,
- StringRef COMDATSymName,
- int Selection,
- const MCSectionCOFF *Assoc = nullptr);
+ StringRef COMDATSymName, int Selection);
const MCSectionCOFF *getCOFFSection(StringRef Section,
unsigned Characteristics,
@@ -376,16 +377,18 @@ namespace llvm {
void setGenDwarfFileNumber(unsigned FileNumber) {
GenDwarfFileNumber = FileNumber;
}
- const MCSection *getGenDwarfSection() { return GenDwarfSection; }
- void setGenDwarfSection(const MCSection *Sec) { GenDwarfSection = Sec; }
- MCSymbol *getGenDwarfSectionStartSym() { return GenDwarfSectionStartSym; }
- void setGenDwarfSectionStartSym(MCSymbol *Sym) {
- GenDwarfSectionStartSym = Sym;
+ MapVector<const MCSection *, std::pair<MCSymbol *, MCSymbol *> > &
+ getGenDwarfSectionSyms() {
+ return SectionStartEndSyms;
}
- MCSymbol *getGenDwarfSectionEndSym() { return GenDwarfSectionEndSym; }
- void setGenDwarfSectionEndSym(MCSymbol *Sym) {
- GenDwarfSectionEndSym = Sym;
+ std::pair<MapVector<const MCSection *,
+ std::pair<MCSymbol *, MCSymbol *> >::iterator,
+ bool>
+ addGenDwarfSection(const MCSection *Sec) {
+ return SectionStartEndSyms.insert(
+ std::make_pair(Sec, std::make_pair(nullptr, nullptr)));
}
+ void finalizeDwarfSections(MCStreamer &MCOS);
const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const {
return MCGenDwarfLabelEntries;
}
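A sketch of using the new per-section map in place of the single GenDwarfSection; both helper functions are illustrative:

#include "llvm/MC/MCContext.h"

// Record a start symbol the first time a section is seen.
static void noteDwarfSection(llvm::MCContext &Ctx, const llvm::MCSection *Sec,
                             llvm::MCSymbol *StartSym) {
  auto Insertion = Ctx.addGenDwarfSection(Sec);
  if (Insertion.second) // newly inserted; no symbols recorded yet
    Insertion.first->second.first = StartSym;
}

// Walk every (section, {start, end}) pair, e.g. to build .debug_aranges.
static void forEachDwarfSection(llvm::MCContext &Ctx) {
  for (auto &Entry : Ctx.getGenDwarfSectionSyms()) {
    llvm::MCSymbol *Start = Entry.second.first;
    llvm::MCSymbol *End = Entry.second.second;
    (void)Start;
    (void)End; // ... emit an address-range entry ...
  }
}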
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 6df8a19..6cd9a9a 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -465,14 +465,13 @@ public:
struct MCDwarfFrameInfo {
MCDwarfFrameInfo()
- : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr),
- Function(nullptr), Instructions(), PersonalityEncoding(), LsdaEncoding(0),
- CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {}
+ : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr),
+ Instructions(), PersonalityEncoding(), LsdaEncoding(0),
+ CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {}
MCSymbol *Begin;
MCSymbol *End;
const MCSymbol *Personality;
const MCSymbol *Lsda;
- const MCSymbol *Function;
std::vector<MCCFIInstruction> Instructions;
unsigned PersonalityEncoding;
unsigned LsdaEncoding;
diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h
index be39128..66729fe 100644
--- a/include/llvm/MC/MCELFStreamer.h
+++ b/include/llvm/MC/MCELFStreamer.h
@@ -48,7 +48,6 @@ public:
void ChangeSection(const MCSection *Section,
const MCExpr *Subsection) override;
void EmitLabel(MCSymbol *Symbol) override;
- void EmitDebugLabel(MCSymbol *Symbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Func) override;
void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h
index 2f1f561..297c442 100644
--- a/include/llvm/MC/MCELFSymbolFlags.h
+++ b/include/llvm/MC/MCELFSymbolFlags.h
@@ -41,6 +41,7 @@ namespace llvm {
ELF_STT_File = (ELF::STT_FILE << ELF_STT_Shift),
ELF_STT_Common = (ELF::STT_COMMON << ELF_STT_Shift),
ELF_STT_Tls = (ELF::STT_TLS << ELF_STT_Shift),
+ ELF_STT_GnuIFunc = (ELF::STT_GNU_IFUNC << ELF_STT_Shift),
ELF_STT_Loproc = (ELF::STT_LOPROC << ELF_STT_Shift),
ELF_STT_Hiproc = (ELF::STT_HIPROC << ELF_STT_Shift),
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index ca5cecb..e96ecb4 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -21,6 +21,7 @@ class MCAssembler;
class MCContext;
class MCSection;
class MCSectionData;
+class MCStreamer;
class MCSymbol;
class MCValue;
class raw_ostream;
@@ -524,7 +525,7 @@ public:
virtual void PrintImpl(raw_ostream &OS) const = 0;
virtual bool EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const = 0;
- virtual void AddValueSymbols(MCAssembler *) const = 0;
+ virtual void visitUsedExpr(MCStreamer& Streamer) const = 0;
virtual const MCSection *FindAssociatedSection() const = 0;
virtual void fixELFSymbolsInTLSFixups(MCAssembler &) const = 0;
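Under the new interface a target expression forwards the visit to the expressions it uses rather than registering symbols itself. A stubbed sketch, assuming the MCStreamer::visitUsedExpr entry point this series introduces; the class itself is hypothetical:

#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"

// Hypothetical MCTargetExpr wrapping one sub-expression; visitUsedExpr is
// the point of the example, the rest are stubs.
class ExampleTargetMCExpr : public llvm::MCTargetExpr {
  const llvm::MCExpr *SubExpr;

public:
  explicit ExampleTargetMCExpr(const llvm::MCExpr *E) : SubExpr(E) {}

  void PrintImpl(llvm::raw_ostream &OS) const override { SubExpr->print(OS); }
  bool EvaluateAsRelocatableImpl(llvm::MCValue &,
                                 const llvm::MCAsmLayout *) const override {
    return false; // stub
  }
  // Replaces the old AddValueSymbols(MCAssembler *): the streamer walks
  // whatever expressions this node uses.
  void visitUsedExpr(llvm::MCStreamer &Streamer) const override {
    Streamer.visitUsedExpr(*SubExpr);
  }
  const llvm::MCSection *FindAssociatedSection() const override {
    return SubExpr->FindAssociatedSection();
  }
  void fixELFSymbolsInTLSFixups(llvm::MCAssembler &) const override {}
};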
diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h
index 3b0d933..50fd527 100644
--- a/include/llvm/MC/MCLinkerOptimizationHint.h
+++ b/include/llvm/MC/MCLinkerOptimizationHint.h
@@ -132,8 +132,19 @@ public:
/// the given @p Layout.
uint64_t getEmitSize(const MachObjectWriter &ObjWriter,
const MCAsmLayout &Layout) const {
- std::string Buffer;
- raw_string_ostream OutStream(Buffer);
+ class raw_counting_ostream : public raw_ostream {
+ uint64_t Count;
+
+ void write_impl(const char *, size_t size) override { Count += size; }
+
+ uint64_t current_pos() const override { return Count; }
+
+ public:
+ raw_counting_ostream() : Count(0) {}
+ ~raw_counting_ostream() { flush(); }
+ };
+
+ raw_counting_ostream OutStream;
Emit_impl(OutStream, ObjWriter, Layout);
return OutStream.tell();
}
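The local counting stream above avoids materializing the emitted bytes just to measure them. The same idiom in isolation, as a standalone copy for illustration:

#include "llvm/Support/raw_ostream.h"

// Discard the data, keep only the running size.
class CountingOStream : public llvm::raw_ostream {
  uint64_t Count;
  void write_impl(const char *, size_t Size) override { Count += Size; }
  uint64_t current_pos() const override { return Count; }

public:
  CountingOStream() : Count(0) {}
  ~CountingOStream() { flush(); }
};

// Usage: tell() reports how many bytes would have been written.
//   CountingOStream OS;
//   OS << "ldr r0, [pc, #8]\n";
//   uint64_t Size = OS.tell();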
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index e7d5bbd..12a7f0e 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -111,6 +111,8 @@ class MachObjectWriter : public MCObjectWriter {
/// @}
+ MachSymbolData *findSymbolData(const MCSymbol &Sym);
+
public:
MachObjectWriter(MCMachObjectTargetWriter *MOTW, raw_ostream &_OS,
bool _IsLittleEndian)
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 1a56040..4d1715e 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -14,13 +14,13 @@
#ifndef LLVM_MC_MCBJECTFILEINFO_H
#define LLVM_MC_MCBJECTFILEINFO_H
+#include "llvm/ADT/Triple.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
class MCContext;
class MCSection;
class StringRef;
- class Triple;
class MCObjectFileInfo {
protected:
@@ -33,12 +33,6 @@ protected:
/// weak_definition of constant 0 for an omitted EH frame.
bool SupportsWeakOmittedEHFrame;
- /// IsFunctionEHFrameSymbolPrivate - This flag is set to true if the
- /// "EH_frame" symbol for EH information should be an assembler temporary (aka
- /// private linkage, aka an L or .L label) or false if it should be a normal
- /// non-.globl label. This defaults to true.
- bool IsFunctionEHFrameSymbolPrivate;
-
/// SupportsCompactUnwindWithoutEHFrame - True if the target object file
/// supports emitting a compact unwind section without an associated EH frame
/// section.
@@ -201,9 +195,6 @@ public:
void InitMCObjectFileInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM,
MCContext &ctx);
- bool isFunctionEHFrameSymbolPrivate() const {
- return IsFunctionEHFrameSymbolPrivate;
- }
bool getSupportsWeakOmittedEHFrame() const {
return SupportsWeakOmittedEHFrame;
}
@@ -380,6 +371,7 @@ private:
Reloc::Model RelocM;
CodeModel::Model CMModel;
MCContext *Ctx;
+ Triple TT;
void InitMachOMCObjectFileInfo(Triple T);
void InitELFMCObjectFileInfo(Triple T);
@@ -388,6 +380,9 @@ private:
/// InitEHFrameSection - Initialize EHFrameSection on demand.
///
void InitEHFrameSection();
+
+public:
+ const Triple &getTargetTriple() const { return TT; }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index e41a8ba..8d37c85 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -78,16 +78,15 @@ protected:
/// fragment is not a data fragment.
MCDataFragment *getOrCreateDataFragment() const;
- const MCExpr *AddValueSymbols(const MCExpr *Value);
-
public:
+ void visitUsedSymbol(const MCSymbol &Sym) override;
+
MCAssembler &getAssembler() { return *Assembler; }
/// @name MCStreamer Interface
/// @{
void EmitLabel(MCSymbol *Symbol) override;
- void EmitDebugLabel(MCSymbol *Symbol) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
void EmitValueImpl(const MCExpr *Value, unsigned Size,
const SMLoc &Loc = SMLoc()) override;
@@ -126,6 +125,10 @@ public:
void EmitFill(uint64_t NumBytes, uint8_t FillValue) override;
void EmitZeros(uint64_t NumBytes) override;
void FinishImpl() override;
+
+ virtual bool mayHaveInstructions() const {
+ return getCurrentSectionData()->hasInstructions();
+ }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 59b5c09..0b550ba 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -28,7 +28,7 @@ class AsmLexer : public MCAsmLexer {
const MCAsmInfo &MAI;
const char *CurPtr;
- const MemoryBuffer *CurBuf;
+ StringRef CurBuf;
bool isAtStartOfLine;
void operator=(const AsmLexer&) LLVM_DELETED_FUNCTION;
@@ -42,7 +42,7 @@ public:
AsmLexer(const MCAsmInfo &MAI);
~AsmLexer();
- void setBuffer(const MemoryBuffer *buf, const char *ptr = nullptr);
+ void setBuffer(StringRef Buf, const char *ptr = nullptr);
StringRef LexUntilEndOfStatement() override;
StringRef LexUntilEndOfLine();
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index f751786..9836795 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -30,23 +30,24 @@ class SMRange;
class SourceMgr;
class Twine;
+class InlineAsmIdentifierInfo {
+public:
+ void *OpDecl;
+ bool IsVarDecl;
+ unsigned Length, Size, Type;
+
+ void clear() {
+ OpDecl = nullptr;
+ IsVarDecl = false;
+ Length = 1;
+ Size = 0;
+ Type = 0;
+ }
+};
+
/// MCAsmParserSemaCallback - Generic Sema callback for assembly parser.
class MCAsmParserSemaCallback {
public:
- typedef struct {
- void *OpDecl;
- bool IsVarDecl;
- unsigned Length, Size, Type;
-
- void clear() {
- OpDecl = nullptr;
- IsVarDecl = false;
- Length = 1;
- Size = 0;
- Type = 0;
- }
- } InlineAsmIdentifierInfo;
-
virtual ~MCAsmParserSemaCallback();
virtual void *LookupInlineAsmIdentifier(StringRef &LineBuf,
InlineAsmIdentifierInfo &Info,
@@ -56,9 +57,6 @@ public:
unsigned &Offset) = 0;
};
-typedef MCAsmParserSemaCallback::InlineAsmIdentifierInfo
- InlineAsmIdentifierInfo;
-
/// MCAsmParser - Generic assembler parser interface, for use by target specific
/// assembly parsers.
class MCAsmParser {
diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h
index a428f9e..d205e2a 100644
--- a/include/llvm/MC/MCSectionCOFF.h
+++ b/include/llvm/MC/MCSectionCOFF.h
@@ -42,24 +42,15 @@ class MCSymbol;
/// it is a COMDAT section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0
mutable int Selection;
- /// Assoc - This is name of the associated section, if it is a COMDAT
- /// section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0 with an
- /// associative Selection (IMAGE_COMDAT_SELECT_ASSOCIATIVE).
- mutable const MCSectionCOFF *Assoc;
-
private:
friend class MCContext;
MCSectionCOFF(StringRef Section, unsigned Characteristics,
- const MCSymbol *COMDATSymbol, int Selection,
- const MCSectionCOFF *Assoc, SectionKind K)
+ const MCSymbol *COMDATSymbol, int Selection, SectionKind K)
: MCSection(SV_COFF, K), SectionName(Section),
Characteristics(Characteristics), COMDATSymbol(COMDATSymbol),
- Selection(Selection), Assoc(Assoc) {
+ Selection(Selection) {
assert ((Characteristics & 0x00F00000) == 0 &&
"alignment must not be set upon section creation");
- assert ((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) ==
- (Assoc != nullptr) &&
- "associative COMDAT section must have an associated section");
}
~MCSectionCOFF();
@@ -76,11 +67,10 @@ class MCSymbol;
return SectionName.str() + "_end";
}
unsigned getCharacteristics() const { return Characteristics; }
+ const MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; }
int getSelection() const { return Selection; }
- const MCSectionCOFF *getAssocSection() const { return Assoc; }
- void setSelection(int Selection,
- const MCSectionCOFF *Assoc = nullptr) const;
+ void setSelection(int Selection) const;
void PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS,
const MCExpr *Subsection) const override;
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 2a8367a..216de75 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -86,6 +86,27 @@ public:
virtual void finish();
};
+class AArch64TargetStreamer : public MCTargetStreamer {
+public:
+ AArch64TargetStreamer(MCStreamer &S);
+ ~AArch64TargetStreamer();
+
+
+ void finish() override;
+
+ /// Callback used to implement the ldr= pseudo.
+ /// Add a new entry to the constant pool for the current section and return an
+ /// MCExpr that can be used to refer to the constant pool location.
+ const MCExpr *addConstantPoolEntry(const MCExpr *);
+
+  /// Callback used to implement the .ltorg directive.
+ /// Emit contents of constant pool for the current section.
+ void emitCurrentConstantPool();
+
+private:
+ std::unique_ptr<AssemblerConstantPools> ConstantPools;
+};
+
// FIXME: declared here because it is used from
// lib/CodeGen/AsmPrinter/ARMException.cpp.
class ARMTargetStreamer : public MCTargetStreamer {
@@ -164,8 +185,6 @@ class MCStreamer {
void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
void EnsureValidW64UnwindInfo();
- MCSymbol *LastSymbol;
-
// SymbolOrdering - Tracks an index to represent the order
// a symbol was emitted in. Zero means we did not emit that symbol.
DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
@@ -182,9 +201,7 @@ protected:
const MCExpr *ForceExpAbs(const MCExpr *Expr);
- void RecordProcStart(MCDwarfFrameInfo &Frame);
virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
- void RecordProcEnd(MCDwarfFrameInfo &Frame);
virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
MCWin64EHUnwindInfo *getCurrentW64UnwindInfo() {
@@ -197,6 +214,9 @@ protected:
public:
virtual ~MCStreamer();
+ void visitUsedExpr(const MCExpr &Expr);
+ virtual void visitUsedSymbol(const MCSymbol &Sym);
+
void setTargetStreamer(MCTargetStreamer *TS) {
TargetStreamer.reset(TS);
}
@@ -223,6 +243,10 @@ public:
return *W64UnwindInfos[i];
}
+ ArrayRef<MCWin64EHUnwindInfo *> getW64UnwindInfos() const {
+ return W64UnwindInfos;
+ }
+
void generateCompactUnwindEncodings(MCAsmBackend *MAB);
/// @name Assembly File Formatting.
@@ -294,7 +318,7 @@ public:
///
/// This is called by PopSection and SwitchSection, if the current
/// section changes.
- virtual void ChangeSection(const MCSection *, const MCExpr *) = 0;
+ virtual void ChangeSection(const MCSection *, const MCExpr *);
/// pushSection - Save the current and previous section on the
/// section stack.
@@ -374,12 +398,10 @@ public:
// add the section we're emitting it to later.
virtual void EmitLabel(MCSymbol *Symbol);
- virtual void EmitDebugLabel(MCSymbol *Symbol);
-
virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol);
/// EmitAssemblerFlag - Note in the output the specified @p Flag.
- virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0;
+ virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
/// EmitLinkerOptions - Emit the given list @p Options of strings as linker
/// options into the output.
@@ -394,7 +416,7 @@ public:
/// EmitThumbFunc - Note in the output that the specified @p Func is
/// a Thumb mode function (ARM target only).
- virtual void EmitThumbFunc(MCSymbol *Func) = 0;
+ virtual void EmitThumbFunc(MCSymbol *Func);
/// EmitAssignment - Emit an assignment of @p Value to @p Symbol.
///
@@ -416,7 +438,7 @@ public:
///
/// @param Alias - The alias that is being created.
/// @param Symbol - The symbol being aliased.
- virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) = 0;
+ virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
/// EmitSymbolAttribute - Add the given @p Attribute to @p Symbol.
virtual bool EmitSymbolAttribute(MCSymbol *Symbol,
@@ -426,25 +448,25 @@ public:
///
/// @param Symbol - The symbol to have its n_desc field set.
/// @param DescValue - The value to set into the n_desc field.
- virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) = 0;
+ virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
/// BeginCOFFSymbolDef - Start emitting COFF symbol definition
///
/// @param Symbol - The symbol to have its External & Type fields set.
- virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) = 0;
+ virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
/// EmitCOFFSymbolStorageClass - Emit the storage class of the symbol.
///
/// @param StorageClass - The storage class the symbol should have.
- virtual void EmitCOFFSymbolStorageClass(int StorageClass) = 0;
+ virtual void EmitCOFFSymbolStorageClass(int StorageClass);
/// EmitCOFFSymbolType - Emit the type of the symbol.
///
/// @param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h)
- virtual void EmitCOFFSymbolType(int Type) = 0;
+ virtual void EmitCOFFSymbolType(int Type);
/// EndCOFFSymbolDef - Marks the end of the symbol definition.
- virtual void EndCOFFSymbolDef() = 0;
+ virtual void EndCOFFSymbolDef();
/// EmitCOFFSectionIndex - Emits a COFF section index.
///
@@ -461,7 +483,7 @@ public:
/// This corresponds to an assembler statement such as:
/// .size symbol, expression
///
- virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) = 0;
+ virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
/// \brief Emit a Linker Optimization Hint (LOH) directive.
/// \param Args - Arguments of the LOH.
@@ -482,7 +504,7 @@ public:
/// @param Size - The size of the common symbol.
/// @param ByteAlignment - The alignment of the common symbol in bytes.
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) = 0;
+ unsigned ByteAlignment);
/// EmitZerofill - Emit the zerofill section and an optional symbol.
///
@@ -503,7 +525,7 @@ public:
/// @param ByteAlignment - The alignment of the thread local common symbol
/// if non-zero. This must be a power of 2 on some targets.
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment = 0) = 0;
+ uint64_t Size, unsigned ByteAlignment = 0);
/// @}
/// @name Generating Data
@@ -513,7 +535,7 @@ public:
///
/// This is used to implement assembler directives such as .byte, .ascii,
/// etc.
- virtual void EmitBytes(StringRef Data) = 0;
+ virtual void EmitBytes(StringRef Data);
/// EmitValue - Emit the expression @p Value into the output as a native
/// integer of the given @p Size bytes.
@@ -526,7 +548,7 @@ public:
/// match a native machine width.
/// @param Loc - The location of the expression for error reporting.
virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc = SMLoc()) = 0;
+ const SMLoc &Loc = SMLoc());
void EmitValue(const MCExpr *Value, unsigned Size,
const SMLoc &Loc = SMLoc());
@@ -541,9 +563,9 @@ public:
/// .long foo
void EmitAbsValue(const MCExpr *Value, unsigned Size);
- virtual void EmitULEB128Value(const MCExpr *Value) = 0;
+ virtual void EmitULEB128Value(const MCExpr *Value);
- virtual void EmitSLEB128Value(const MCExpr *Value) = 0;
+ virtual void EmitSLEB128Value(const MCExpr *Value);
/// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
/// client having to pass in an MCExpr for constant integers.
@@ -598,7 +620,7 @@ public:
/// emitted.
virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0) = 0;
+ unsigned MaxBytesToEmit = 0);
/// EmitCodeAlignment - Emit nops until the byte alignment @p ByteAlignment
/// is reached.
@@ -612,7 +634,7 @@ public:
/// the alignment cannot be reached in this many bytes, no bytes are
/// emitted.
virtual void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit = 0) = 0;
+ unsigned MaxBytesToEmit = 0);
/// EmitValueToOffset - Emit some number of copies of @p Value until the
/// byte offset @p Offset is reached.
@@ -624,13 +646,13 @@ public:
/// @param Value - The value to use when filling bytes.
/// @return false on success, true if the offset was invalid.
virtual bool EmitValueToOffset(const MCExpr *Offset,
- unsigned char Value = 0) = 0;
+ unsigned char Value = 0);
/// @}
/// EmitFileDirective - Switch to a new logical file. This is used to
/// implement the '.file "foo.c"' assembler directive.
- virtual void EmitFileDirective(StringRef Filename) = 0;
+ virtual void EmitFileDirective(StringRef Filename);
/// Emit the "identifiers" directive. This implements the
/// '.ident "version foo"' assembler directive.
@@ -677,38 +699,38 @@ public:
virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
virtual void EmitCFIWindowSave();
- virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
- virtual void EmitWin64EHEndProc();
- virtual void EmitWin64EHStartChained();
- virtual void EmitWin64EHEndChained();
- virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
- bool Except);
- virtual void EmitWin64EHHandlerData();
- virtual void EmitWin64EHPushReg(unsigned Register);
- virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
- virtual void EmitWin64EHAllocStack(unsigned Size);
- virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
- virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
- virtual void EmitWin64EHPushFrame(bool Code);
- virtual void EmitWin64EHEndProlog();
+ virtual void EmitWinCFIStartProc(const MCSymbol *Symbol);
+ virtual void EmitWinCFIEndProc();
+ virtual void EmitWinCFIStartChained();
+ virtual void EmitWinCFIEndChained();
+ virtual void EmitWinCFIPushReg(unsigned Register);
+ virtual void EmitWinCFISetFrame(unsigned Register, unsigned Offset);
+ virtual void EmitWinCFIAllocStack(unsigned Size);
+ virtual void EmitWinCFISaveReg(unsigned Register, unsigned Offset);
+ virtual void EmitWinCFISaveXMM(unsigned Register, unsigned Offset);
+ virtual void EmitWinCFIPushFrame(bool Code);
+ virtual void EmitWinCFIEndProlog();
+
+ virtual void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except);
+ virtual void EmitWinEHHandlerData();
/// EmitInstruction - Emit the given @p Instruction into the current
/// section.
- virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) = 0;
+ virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
/// \brief Set the bundle alignment mode from now on in the section.
/// The argument is the power of 2 to which the alignment is set. The
/// value 0 means turn the bundle alignment off.
- virtual void EmitBundleAlignMode(unsigned AlignPow2) = 0;
+ virtual void EmitBundleAlignMode(unsigned AlignPow2);
/// \brief The following instructions are a bundle-locked group.
///
/// \param AlignToEnd - If true, the bundle-locked group will be aligned to
/// the end of a bundle.
- virtual void EmitBundleLock(bool AlignToEnd) = 0;
+ virtual void EmitBundleLock(bool AlignToEnd);
/// \brief Ends a bundle-locked group.
- virtual void EmitBundleUnlock() = 0;
+ virtual void EmitBundleUnlock();
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
@@ -719,9 +741,11 @@ public:
virtual void Flush() {}
/// FinishImpl - Streamer specific finalization.
- virtual void FinishImpl() = 0;
+ virtual void FinishImpl();
/// Finish - Finish emission of machine code.
void Finish();
+
+ virtual bool mayHaveInstructions() const { return true; }
};
/// createNullStreamer - Create a dummy machine code streamer, which does
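The pattern running through this MCStreamer hunk is uniform: emission hooks that used to be pure virtual (`= 0`) now have default implementations (presumably no-ops; the bodies live outside this header), so a streamer that only cares about a few events — the null streamer this comment introduces being the extreme case — overrides just those. A standalone sketch of the refactor, under hypothetical names:

    #include <cstdio>

    // Before: every hook was "= 0", so subclasses had to stub them all out.
    // After: hooks default to doing nothing.
    class Streamer {
    public:
      virtual ~Streamer() = default;
      virtual void EmitBytes(const char * /*Data*/) {}  // was pure virtual
      virtual void EmitLabel(const char * /*Name*/) {}  // ditto
    };

    // A minimal consumer overrides only what it needs; everything else
    // silently falls through to the base no-ops.
    class LabelPrinter : public Streamer {
    public:
      void EmitLabel(const char *Name) override { std::printf("%s:\n", Name); }
    };

    int main() {
      LabelPrinter S;
      S.EmitLabel("main");  // printed
      S.EmitBytes("\x90");  // swallowed by the default implementation
    }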
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 18ef6c2..384cc1b 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -14,6 +14,8 @@
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCTargetOptions.h"
+#include <memory>
+
namespace llvm {
class AsmToken;
class MCInst;
@@ -23,6 +25,8 @@ class SMLoc;
class StringRef;
template <typename T> class SmallVectorImpl;
+typedef SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> OperandVector;
+
enum AsmRewriteKind {
AOK_Delete = 0, // Rewrite should be ignored.
AOK_Align, // Rewrite align as .align.
@@ -131,8 +135,7 @@ public:
/// ownership of them to the caller.
/// \return True on failure.
virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) = 0;
+ SMLoc NameLoc, OperandVector &Operands) = 0;
/// ParseDirective - Parse a target specific assembler directive
///
@@ -156,17 +159,16 @@ public:
///
/// On failure, the target parser is responsible for emitting a diagnostic
/// explaining the match failure.
- virtual bool
- MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) = 0;
+ virtual bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) = 0;
/// Allow a target to add special case operand matching for things that
/// tblgen doesn't/can't handle effectively. For example, literal
/// immediates on ARM. TableGen expects a token operand, but the parser
/// will recognize them as immediates.
- virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ virtual unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) {
return Match_InvalidOperand;
}
@@ -178,7 +180,7 @@ public:
}
virtual void convertToMapAndConstraints(unsigned Kind,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) = 0;
+ const OperandVector &Operands) = 0;
virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind,
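The new OperandVector typedef (hence the added <memory> include) changes who owns parsed operands: instead of raw MCParsedAsmOperand* pointers managed by hand, the container now owns its elements through std::unique_ptr, and validateTargetOperandClass takes a reference because it only inspects an operand it does not own. A self-contained sketch of the convention, with std::vector standing in for SmallVectorImpl:

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct Operand { int Kind; };  // placeholder for MCParsedAsmOperand

    // Owning container, as in the OperandVector typedef above.
    using OperandVec = std::vector<std::unique_ptr<Operand>>;

    // Non-owning inspection takes a reference, mirroring the new
    // validateTargetOperandClass(MCParsedAsmOperand &Op, ...) signature.
    bool isImmediate(const Operand &Op) { return Op.Kind == 1; }

    int main() {
      OperandVec Ops;
      Ops.push_back(std::make_unique<Operand>(Operand{1}));
      Ops.push_back(std::make_unique<Operand>(Operand{2}));
      for (const auto &Op : Ops)
        std::printf("%d\n", isImmediate(*Op));
      // No manual deletes: the vector frees every operand on scope exit.
    }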
diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index 80cc8be..eb4348e 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h
@@ -29,6 +29,7 @@ public:
bool ShowMCEncoding : 1;
bool ShowMCInst : 1;
bool AsmVerbose : 1;
+ int DwarfVersion;
MCTargetOptions();
};
@@ -41,7 +42,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) {
ARE_EQUAL(MCUseDwarfDirectory) &&
ARE_EQUAL(ShowMCEncoding) &&
ARE_EQUAL(ShowMCInst) &&
- ARE_EQUAL(AsmVerbose));
+ ARE_EQUAL(AsmVerbose) &&
+ ARE_EQUAL(DwarfVersion));
#undef ARE_EQUAL
}
diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h
index 17a117a..6d4eb0e 100644
--- a/include/llvm/MC/MCTargetOptionsCommandFlags.h
+++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h
@@ -33,11 +33,20 @@ cl::opt<bool> RelaxAll("mc-relax-all",
cl::desc("When used with filetype=obj, "
"relax all fixups in the emitted object file"));
+cl::opt<int> DwarfVersion("dwarf-version", cl::desc("Dwarf version"),
+ cl::init(0));
+
+cl::opt<bool> ShowMCInst("asm-show-inst",
+ cl::desc("Emit internal instruction representation to "
+ "assembly file"));
+
static inline MCTargetOptions InitMCTargetOptionsFromFlags() {
MCTargetOptions Options;
Options.SanitizeAddress =
(AsmInstrumentation == MCTargetOptions::AsmInstrumentationAddress);
Options.MCRelaxAll = RelaxAll;
+ Options.DwarfVersion = DwarfVersion;
+ Options.ShowMCInst = ShowMCInst;
return Options;
}
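The two new options follow the llvm::cl pattern already used in this header: a file-scope cl::opt registers the flag with the command-line parser, and InitMCTargetOptionsFromFlags copies the parsed value into MCTargetOptions. A hedged sketch of the consuming side (assuming the standard cl::ParseCommandLineOptions entry point; real tools route this through their own drivers):

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Sketch only: mirrors the -dwarf-version flag declared above.
    static cl::opt<int> DwarfVersion("dwarf-version", cl::desc("Dwarf version"),
                                     cl::init(0));

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);  // e.g. -dwarf-version=4
      int Version = DwarfVersion;  // the option converts to its value type
      return Version;
    }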
diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h
index 34e39bb..7d2d0e4 100644
--- a/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/include/llvm/MC/MCWinCOFFStreamer.h
@@ -35,7 +35,6 @@ public:
void InitSections() override;
void EmitLabel(MCSymbol *Symbol) override;
- void EmitDebugLabel(MCSymbol *Symbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitThumbFunc(MCSymbol *Func) override;
bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
@@ -57,7 +56,7 @@ public:
unsigned ByteAlignment) override;
void EmitFileDirective(StringRef Filename) override;
void EmitIdent(StringRef IdentString) override;
- void EmitWin64EHHandlerData() override;
+ void EmitWinEHHandlerData() override;
void FinishImpl() override;
/// \}
diff --git a/include/llvm/Object/StringTableBuilder.h b/include/llvm/MC/StringTableBuilder.h
index c61e216..065e9e0 100644
--- a/include/llvm/Object/StringTableBuilder.h
+++ b/include/llvm/MC/StringTableBuilder.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_OBJECT_STRINGTABLE_BUILDER_H
-#define LLVM_OBJECT_STRINGTABLE_BUILDER_H
+#ifndef LLVM_MC_STRINGTABLE_BUILDER_H
+#define LLVM_MC_STRINGTABLE_BUILDER_H
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
diff --git a/include/llvm/Object/YAML.h b/include/llvm/MC/YAML.h
index 1792e8b..383cdc6 100644
--- a/include/llvm/Object/YAML.h
+++ b/include/llvm/MC/YAML.h
@@ -1,26 +1,10 @@
-//===- YAML.h - YAMLIO utilities for object files ---------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares utility classes for handling the YAML representation of
-// object files.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OBJECT_YAML_H
-#define LLVM_OBJECT_YAML_H
+#ifndef LLVM_MC_YAML_H
+#define LLVM_MC_YAML_H
#include "llvm/Support/YAMLTraits.h"
namespace llvm {
-namespace object {
namespace yaml {
-
/// \brief Specialized YAMLIO scalar type for representing a binary blob.
///
/// A typical use case would be to represent the content of a section in a
@@ -100,18 +84,11 @@ inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) {
return LHS.DataIsHexString == RHS.DataIsHexString && LHS.Data == RHS.Data;
}
-}
-}
-
-namespace yaml {
-template <> struct ScalarTraits<object::yaml::BinaryRef> {
- static void output(const object::yaml::BinaryRef &, void *,
- llvm::raw_ostream &);
- static StringRef input(StringRef, void *, object::yaml::BinaryRef &);
+template <> struct ScalarTraits<BinaryRef> {
+ static void output(const BinaryRef &, void *, llvm::raw_ostream &);
+ static StringRef input(StringRef, void *, BinaryRef &);
static bool mustQuote(StringRef S) { return needsQuotes(S); }
};
}
-
}
-
#endif
diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index 652b659..af6c995 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h
@@ -72,7 +72,7 @@ public:
Child getNext() const;
- error_code getName(StringRef &Result) const;
+ ErrorOr<StringRef> getName() const;
StringRef getRawName() const { return getHeader()->getName(); }
sys::TimeValue getLastModified() const {
return getHeader()->getLastModified();
@@ -89,11 +89,11 @@ public:
return StringRef(Data.data() + StartOfFile, getSize());
}
- error_code getMemoryBuffer(std::unique_ptr<MemoryBuffer> &Result,
- bool FullPath = false) const;
+ ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getMemoryBuffer(bool FullPath = false) const;
- error_code getAsBinary(std::unique_ptr<Binary> &Result,
- LLVMContext *Context = nullptr) const;
+ ErrorOr<std::unique_ptr<Binary>>
+ getAsBinary(LLVMContext *Context = nullptr) const;
};
class child_iterator {
@@ -137,8 +137,8 @@ public:
: Parent(p)
, SymbolIndex(symi)
, StringIndex(stri) {}
- error_code getName(StringRef &Result) const;
- error_code getMember(child_iterator &Result) const;
+ StringRef getName() const;
+ ErrorOr<child_iterator> getMember() const;
Symbol getNext() const;
};
@@ -164,8 +164,8 @@ public:
}
};
- Archive(MemoryBuffer *source, error_code &ec);
- static ErrorOr<Archive *> create(MemoryBuffer *Source);
+ Archive(std::unique_ptr<MemoryBuffer> Source, std::error_code &EC);
+ static ErrorOr<Archive *> create(std::unique_ptr<MemoryBuffer> Source);
enum Kind {
K_GNU,
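Every accessor here trades the out-parameter style (`error_code getName(StringRef &Result)`) for `ErrorOr<StringRef> getName()`, which returns either the value or a std::error_code in one object. The call-site idiom, roughly as it appears later in this same patch (see getSymbolName in ELFObjectFile.h), looks like this sketch (getName and its callers are hypothetical):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/ErrorOr.h"
    using namespace llvm;

    // Hypothetical accessor in the new style: value on success, code on failure.
    ErrorOr<StringRef> getName(bool Ok) {
      if (!Ok)
        return std::errc::invalid_argument;  // converts to std::error_code
      return StringRef("member.o");
    }

    int main() {
      ErrorOr<StringRef> Name = getName(true);
      if (!Name)                         // operator bool: did we get a value?
        return Name.getError().value();  // propagate, as callers of Archive do
      return Name->size() == 8 ? 0 : 1;  // operator-> reaches the StringRef
    }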
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index 8ac84e7..9be2fbe 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -32,12 +32,11 @@ private:
Binary(const Binary &other) LLVM_DELETED_FUNCTION;
unsigned int TypeID;
- bool BufferOwned;
protected:
- MemoryBuffer *Data;
+ std::unique_ptr<MemoryBuffer> Data;
- Binary(unsigned int Type, MemoryBuffer *Source, bool BufferOwned = true);
+ Binary(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
enum {
ID_Archive,
@@ -79,6 +78,7 @@ public:
virtual ~Binary();
StringRef getData() const;
+ MemoryBuffer *releaseBuffer() { return Data.release(); }
StringRef getFileName() const;
// Cast methods.
@@ -128,7 +128,7 @@ public:
/// @param Source The data to create the Binary from. Ownership is transferred
/// to the Binary if successful. If an error is returned,
/// Source is destroyed by createBinary before returning.
-ErrorOr<Binary *> createBinary(MemoryBuffer *Source,
+ErrorOr<Binary *> createBinary(std::unique_ptr<MemoryBuffer> &Source,
LLVMContext *Context = nullptr);
ErrorOr<Binary *> createBinary(StringRef Path);
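Note the shape of createBinary: it takes `std::unique_ptr<MemoryBuffer> &Source` by non-const reference, so the callee decides when to take the buffer — per the comment above, it moves ownership into the Binary on success and still destroys Source on failure. A standalone sketch of the by-reference transfer idiom, with hypothetical Buffer/Binary types:

    #include <cstdio>
    #include <memory>

    struct Buffer { const char *Data; };
    struct Binary { std::unique_ptr<Buffer> Buf; };

    // Sketch: ownership leaves the caller's pointer inside the callee, echoing
    // createBinary(std::unique_ptr<MemoryBuffer> &Source, ...).
    Binary *createBinary(std::unique_ptr<Buffer> &Source, bool Ok) {
      if (!Ok) {
        Source.reset();  // failure path still releases the buffer
        return nullptr;
      }
      return new Binary{std::move(Source)};  // success: the Binary owns it now
    }

    int main() {
      auto Buf = std::make_unique<Buffer>(Buffer{"bytes"});
      Binary *B = createBinary(Buf, true);
      std::printf("caller still owns buffer? %s\n", Buf ? "yes" : "no");  // no
      delete B;
    }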
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index bd9c677..e2da070 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -353,65 +353,74 @@ private:
uint32_t NumberOfImportDirectory;
const export_directory_table_entry *ExportDirectory;
- error_code getString(uint32_t offset, StringRef &Res) const;
+ std::error_code getString(uint32_t offset, StringRef &Res) const;
const coff_symbol *toSymb(DataRefImpl Symb) const;
const coff_section *toSec(DataRefImpl Sec) const;
const coff_relocation *toRel(DataRefImpl Rel) const;
- error_code initSymbolTablePtr();
- error_code initImportTablePtr();
- error_code initExportTablePtr();
+ std::error_code initSymbolTablePtr();
+ std::error_code initImportTablePtr();
+ std::error_code initExportTablePtr();
protected:
void moveSymbolNext(DataRefImpl &Symb) const override;
- error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override;
- error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override;
- error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
+ std::error_code getSymbolName(DataRefImpl Symb,
+ StringRef &Res) const override;
+ std::error_code getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Res) const override;
+ std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
- error_code getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Res) const override;
- error_code getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const override;
+ std::error_code getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Res) const override;
+ std::error_code getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const override;
void moveSectionNext(DataRefImpl &Sec) const override;
- error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override;
- error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionRequiredForExecution(DataRefImpl Sec,
- bool &Res) const override;
- error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
- bool &Result) const override;
+ std::error_code getSectionName(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAddress(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+ std::error_code getSectionContents(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
+ bool &Result) const override;
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- error_code getRelocationAddress(DataRefImpl Rel,
- uint64_t &Res) const override;
- error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override;
+ std::error_code getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
- error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override;
- error_code
+ std::error_code getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code
getRelocationTypeName(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const override;
- error_code
+ std::error_code
getRelocationValueString(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const override;
- error_code getLibraryNext(DataRefImpl LibData,
- LibraryRef &Result) const override;
- error_code getLibraryPath(DataRefImpl LibData,
- StringRef &Result) const override;
+ std::error_code getLibraryNext(DataRefImpl LibData,
+ LibraryRef &Result) const override;
+ std::error_code getLibraryPath(DataRefImpl LibData,
+ StringRef &Result) const override;
public:
- COFFObjectFile(MemoryBuffer *Object, error_code &EC, bool BufferOwned = true);
+ COFFObjectFile(std::unique_ptr<MemoryBuffer> Object, std::error_code &EC);
basic_symbol_iterator symbol_begin_impl() const override;
basic_symbol_iterator symbol_end_impl() const override;
library_iterator needed_library_begin() const override;
@@ -433,30 +442,33 @@ public:
export_directory_iterator export_directory_begin() const;
export_directory_iterator export_directory_end() const;
- error_code getHeader(const coff_file_header *&Res) const;
- error_code getCOFFHeader(const coff_file_header *&Res) const;
- error_code getPE32Header(const pe32_header *&Res) const;
- error_code getPE32PlusHeader(const pe32plus_header *&Res) const;
- error_code getDataDirectory(uint32_t index, const data_directory *&Res) const;
- error_code getSection(int32_t index, const coff_section *&Res) const;
- error_code getSymbol(uint32_t index, const coff_symbol *&Res) const;
+ std::error_code getHeader(const coff_file_header *&Res) const;
+ std::error_code getCOFFHeader(const coff_file_header *&Res) const;
+ std::error_code getPE32Header(const pe32_header *&Res) const;
+ std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const;
+ std::error_code getDataDirectory(uint32_t index,
+ const data_directory *&Res) const;
+ std::error_code getSection(int32_t index, const coff_section *&Res) const;
+ std::error_code getSymbol(uint32_t index, const coff_symbol *&Res) const;
template <typename T>
- error_code getAuxSymbol(uint32_t index, const T *&Res) const {
+ std::error_code getAuxSymbol(uint32_t index, const T *&Res) const {
const coff_symbol *s;
- error_code ec = getSymbol(index, s);
+ std::error_code ec = getSymbol(index, s);
Res = reinterpret_cast<const T *>(s);
return ec;
}
- error_code getSymbolName(const coff_symbol *symbol, StringRef &Res) const;
+ std::error_code getSymbolName(const coff_symbol *symbol,
+ StringRef &Res) const;
ArrayRef<uint8_t> getSymbolAuxData(const coff_symbol *symbol) const;
- error_code getSectionName(const coff_section *Sec, StringRef &Res) const;
- error_code getSectionContents(const coff_section *Sec,
- ArrayRef<uint8_t> &Res) const;
+ std::error_code getSectionName(const coff_section *Sec, StringRef &Res) const;
+ std::error_code getSectionContents(const coff_section *Sec,
+ ArrayRef<uint8_t> &Res) const;
- error_code getVaPtr(uint64_t VA, uintptr_t &Res) const;
- error_code getRvaPtr(uint32_t Rva, uintptr_t &Res) const;
- error_code getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const;
+ std::error_code getVaPtr(uint64_t VA, uintptr_t &Res) const;
+ std::error_code getRvaPtr(uint32_t Rva, uintptr_t &Res) const;
+ std::error_code getHintName(uint32_t Rva, uint16_t &Hint,
+ StringRef &Name) const;
static inline bool classof(const Binary *v) { return v->isCOFF(); }
};
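The error_code → std::error_code rename repeated through this file (and the ELF headers below) retires LLVM's pre-C++11 error class in favor of the standard <system_error> type; values like object_error::success keep working because they convert to std::error_code. A minimal standalone sketch of the protocol these signatures now rely on:

    #include <cstdio>
    #include <system_error>

    // std::error_code protocol: default-constructed means "no error",
    // and conversion to bool is true exactly on failure.
    std::error_code mightFail(bool Fail) {
      if (Fail)
        return std::make_error_code(std::errc::io_error);
      return std::error_code();  // success
    }

    int main() {
      if (std::error_code EC = mightFail(true))
        std::printf("failed: %s\n", EC.message().c_str());
    }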
@@ -471,12 +483,12 @@ public:
bool operator==(const ImportDirectoryEntryRef &Other) const;
void moveNext();
- error_code getName(StringRef &Result) const;
+ std::error_code getName(StringRef &Result) const;
- error_code
+ std::error_code
getImportTableEntry(const import_directory_table_entry *&Result) const;
- error_code
+ std::error_code
getImportLookupEntry(const import_lookup_table_entry32 *&Result) const;
private:
@@ -496,11 +508,11 @@ public:
bool operator==(const ExportDirectoryEntryRef &Other) const;
void moveNext();
- error_code getDllName(StringRef &Result) const;
- error_code getOrdinalBase(uint32_t &Result) const;
- error_code getOrdinal(uint32_t &Result) const;
- error_code getExportRVA(uint32_t &Result) const;
- error_code getSymbolName(StringRef &Result) const;
+ std::error_code getDllName(StringRef &Result) const;
+ std::error_code getOrdinalBase(uint32_t &Result) const;
+ std::error_code getOrdinal(uint32_t &Result) const;
+ std::error_code getExportRVA(uint32_t &Result) const;
+ std::error_code getSymbolName(StringRef &Result) const;
private:
const export_directory_table_entry *ExportTable;
diff --git a/include/llvm/Object/COFFYAML.h b/include/llvm/Object/COFFYAML.h
index 3f48e07..4aba08f 100644
--- a/include/llvm/Object/COFFYAML.h
+++ b/include/llvm/Object/COFFYAML.h
@@ -15,7 +15,7 @@
#define LLVM_OBJECT_COFFYAML_H
#include "llvm/ADT/Optional.h"
-#include "llvm/Object/YAML.h"
+#include "llvm/MC/YAML.h"
#include "llvm/Support/COFF.h"
namespace llvm {
@@ -49,7 +49,7 @@ namespace COFFYAML {
struct Section {
COFF::section Header;
unsigned Alignment;
- object::yaml::BinaryRef SectionData;
+ yaml::BinaryRef SectionData;
std::vector<Relocation> Relocations;
StringRef Name;
Section();
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index ee97d4e..fbc48e6 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -40,11 +40,12 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type);
// Subclasses of ELFFile may need this for template instantiation
inline std::pair<unsigned char, unsigned char>
-getElfArchType(MemoryBuffer *Object) {
- if (Object->getBufferSize() < ELF::EI_NIDENT)
- return std::make_pair((uint8_t)ELF::ELFCLASSNONE,(uint8_t)ELF::ELFDATANONE);
- return std::make_pair((uint8_t) Object->getBufferStart()[ELF::EI_CLASS],
- (uint8_t) Object->getBufferStart()[ELF::EI_DATA]);
+getElfArchType(StringRef Object) {
+ if (Object.size() < ELF::EI_NIDENT)
+ return std::make_pair((uint8_t)ELF::ELFCLASSNONE,
+ (uint8_t)ELF::ELFDATANONE);
+ return std::make_pair((uint8_t)Object[ELF::EI_CLASS],
+ (uint8_t)Object[ELF::EI_DATA]);
}
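getElfArchType now takes a plain StringRef view of the file contents instead of a MemoryBuffer*, so any byte buffer can be classified without wrapping it first. The two bytes it reads are fixed by the ELF specification: e_ident[EI_CLASS] (offset 4: 32- vs 64-bit) and e_ident[EI_DATA] (offset 5: endianness). A self-contained sketch of the same probe over std::string:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <utility>

    // e_ident layout facts from the ELF spec:
    //   byte 4 = EI_CLASS (1 = 32-bit, 2 = 64-bit)
    //   byte 5 = EI_DATA  (1 = little-endian, 2 = big-endian)
    enum { EI_CLASS = 4, EI_DATA = 5, EI_NIDENT = 16 };

    std::pair<uint8_t, uint8_t> getElfArchType(const std::string &Object) {
      if (Object.size() < EI_NIDENT)  // too short to hold e_ident at all
        return {0, 0};                // "none/none", as in the LLVM version
      return {static_cast<uint8_t>(Object[EI_CLASS]),
              static_cast<uint8_t>(Object[EI_DATA])};
    }

    int main() {
      // First bytes of a 64-bit little-endian ELF header: magic, class, data.
      std::string Ident("\x7f" "ELF\x02\x01\x01", 7);
      Ident.resize(EI_NIDENT, '\0');
      std::pair<uint8_t, uint8_t> CD = getElfArchType(Ident);
      std::printf("class=%d data=%d\n", CD.first, CD.second);  // class=2 data=1
    }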
template <class ELFT>
@@ -133,6 +134,7 @@ public:
typedef Elf_Vernaux_Impl<ELFT> Elf_Vernaux;
typedef Elf_Versym_Impl<ELFT> Elf_Versym;
typedef ELFEntityIterator<const Elf_Dyn> Elf_Dyn_Iter;
+ typedef iterator_range<Elf_Dyn_Iter> Elf_Dyn_Range;
typedef ELFEntityIterator<const Elf_Rela> Elf_Rela_Iter;
typedef ELFEntityIterator<const Elf_Rel> Elf_Rel_Iter;
typedef ELFEntityIterator<const Elf_Shdr> Elf_Shdr_Iter;
@@ -229,10 +231,10 @@ private:
typedef SmallVector<const Elf_Shdr *, 2> Sections_t;
typedef DenseMap<unsigned, unsigned> IndexMap_t;
- MemoryBuffer *Buf;
+ StringRef Buf;
const uint8_t *base() const {
- return reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
+ return reinterpret_cast<const uint8_t *>(Buf.data());
}
const Elf_Ehdr *Header;
@@ -316,7 +318,7 @@ public:
std::pair<const Elf_Shdr *, const Elf_Sym *>
getRelocationSymbol(const Elf_Shdr *RelSec, const RelT *Rel) const;
- ELFFile(MemoryBuffer *Object, error_code &ec);
+ ELFFile(StringRef Object, std::error_code &ec);
bool isMipsELF64() const {
return Header->e_machine == ELF::EM_MIPS &&
@@ -342,6 +344,9 @@ public:
/// \param NULLEnd use one past the first DT_NULL entry as the end instead of
/// the section size.
Elf_Dyn_Iter end_dynamic_table(bool NULLEnd = false) const;
+ Elf_Dyn_Range dynamic_table(bool NULLEnd = false) const {
+ return make_range(begin_dynamic_table(), end_dynamic_table(NULLEnd));
+ }
Elf_Sym_Iter begin_dynamic_symbols() const {
if (DynSymRegion.Addr)
@@ -532,7 +537,7 @@ ELFFile<ELFT>::getSymbol(uint32_t Index) const {
template <class ELFT>
ErrorOr<ArrayRef<uint8_t> >
ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
- if (Sec->sh_offset + Sec->sh_size > Buf->getBufferSize())
+ if (Sec->sh_offset + Sec->sh_size > Buf.size())
return object_error::parse_failed;
const uint8_t *Start = base() + Sec->sh_offset;
return ArrayRef<uint8_t>(Start, Sec->sh_size);
@@ -598,7 +603,7 @@ void ELFFile<ELFT>::VerifyStrTab(const Elf_Shdr *sh) const {
template <class ELFT>
uint64_t ELFFile<ELFT>::getNumSections() const {
assert(Header && "Header not initialized!");
- if (Header->e_shnum == ELF::SHN_UNDEF) {
+ if (Header->e_shnum == ELF::SHN_UNDEF && Header->e_shoff > 0) {
assert(SectionHeaderTable && "SectionHeaderTable not initialized!");
return SectionHeaderTable->sh_size;
}
@@ -617,18 +622,13 @@ typename ELFFile<ELFT>::uintX_t ELFFile<ELFT>::getStringTableIndex() const {
}
template <class ELFT>
-ELFFile<ELFT>::ELFFile(MemoryBuffer *Object, error_code &ec)
- : Buf(Object),
- SectionHeaderTable(nullptr),
- dot_shstrtab_sec(nullptr),
- dot_strtab_sec(nullptr),
- dot_symtab_sec(nullptr),
- SymbolTableSectionHeaderIndex(nullptr),
- dot_gnu_version_sec(nullptr),
- dot_gnu_version_r_sec(nullptr),
- dot_gnu_version_d_sec(nullptr),
+ELFFile<ELFT>::ELFFile(StringRef Object, std::error_code &ec)
+ : Buf(Object), SectionHeaderTable(nullptr), dot_shstrtab_sec(nullptr),
+ dot_strtab_sec(nullptr), dot_symtab_sec(nullptr),
+ SymbolTableSectionHeaderIndex(nullptr), dot_gnu_version_sec(nullptr),
+ dot_gnu_version_r_sec(nullptr), dot_gnu_version_d_sec(nullptr),
dt_soname(nullptr) {
- const uint64_t FileSize = Buf->getBufferSize();
+ const uint64_t FileSize = Buf.size();
if (sizeof(Elf_Ehdr) > FileSize)
// FIXME: Proper error handling.
@@ -744,7 +744,7 @@ ELFFile<ELFT>::ELFFile(MemoryBuffer *Object, error_code &ec)
}
}
- ec = error_code::success();
+ ec = std::error_code();
}
// Get the symbol table index in the symtab section given a symbol
@@ -823,17 +823,13 @@ ELFFile<ELFT>::end_dynamic_table(bool NULLEnd) const {
template <class ELFT>
StringRef ELFFile<ELFT>::getLoadName() const {
if (!dt_soname) {
+ dt_soname = "";
// Find the DT_SONAME entry
- Elf_Dyn_Iter it = begin_dynamic_table();
- Elf_Dyn_Iter ie = end_dynamic_table();
- while (it != ie && it->getTag() != ELF::DT_SONAME)
- ++it;
-
- if (it != ie) {
- dt_soname = getDynamicString(it->getVal());
- } else {
- dt_soname = "";
- }
+ for (const auto &Entry : dynamic_table())
+ if (Entry.getTag() == ELF::DT_SONAME) {
+ dt_soname = getDynamicString(Entry.getVal());
+ break;
+ }
}
return dt_soname;
}
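dynamic_table() packages the begin/end pair into an iterator_range so callers can range-for over it — exactly what the rewritten DT_SONAME loop above does. The helper itself is tiny; a standalone sketch of the make_range/iterator_range shape:

    #include <cstdio>

    // Minimal iterator_range, in the spirit of llvm::iterator_range/make_range.
    template <typename Iter> class iterator_range {
      Iter Begin, End;
    public:
      iterator_range(Iter B, Iter E) : Begin(B), End(E) {}
      Iter begin() const { return Begin; }
      Iter end() const { return End; }
    };

    template <typename Iter>
    iterator_range<Iter> make_range(Iter B, Iter E) {
      return iterator_range<Iter>(B, E);
    }

    int main() {
      int Table[] = {10, 20, 30};
      // Range-for over a (begin, end) pair, as in:
      //   for (const auto &Entry : dynamic_table()) ...
      for (int V : make_range(Table, Table + 3))
        std::printf("%d\n", V);
    }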
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 302caba..cfb6b08 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -57,50 +57,63 @@ protected:
ELFFile<ELFT> EF;
void moveSymbolNext(DataRefImpl &Symb) const override;
- error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override;
- error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override;
- error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const override;
- error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
+ std::error_code getSymbolName(DataRefImpl Symb,
+ StringRef &Res) const override;
+ std::error_code getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Res) const override;
+ std::error_code getSymbolAlignment(DataRefImpl Symb,
+ uint32_t &Res) const override;
+ std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
- error_code getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Res) const override;
- error_code getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const override;
+ std::error_code getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Res) const override;
+ std::error_code getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const override;
- error_code getLibraryNext(DataRefImpl Data,
- LibraryRef &Result) const override;
- error_code getLibraryPath(DataRefImpl Data, StringRef &Res) const override;
+ std::error_code getLibraryNext(DataRefImpl Data,
+ LibraryRef &Result) const override;
+ std::error_code getLibraryPath(DataRefImpl Data,
+ StringRef &Res) const override;
void moveSectionNext(DataRefImpl &Sec) const override;
- error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override;
- error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionRequiredForExecution(DataRefImpl Sec,
- bool &Res) const override;
- error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override;
- error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
- bool &Result) const override;
+ std::error_code getSectionName(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAddress(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+ std::error_code getSectionContents(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
+ bool &Result) const override;
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
section_iterator getRelocatedSection(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- error_code getRelocationAddress(DataRefImpl Rel,
- uint64_t &Res) const override;
- error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override;
+ std::error_code getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
- error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override;
- error_code getRelocationTypeName(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const override;
- error_code getRelocationValueString(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const override;
+ std::error_code getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code
+ getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const override;
+ std::error_code
+ getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const override;
uint64_t getROffset(DataRefImpl Rel) const;
StringRef getRelocationTypeName(uint32_t Type) const;
@@ -164,7 +177,7 @@ protected:
bool isDyldELFObject;
public:
- ELFObjectFile(MemoryBuffer *Object, error_code &EC, bool BufferOwned = true);
+ ELFObjectFile(std::unique_ptr<MemoryBuffer> Object, std::error_code &EC);
const Elf_Sym *getSymbol(DataRefImpl Symb) const;
@@ -180,10 +193,9 @@ public:
library_iterator needed_library_begin() const override;
library_iterator needed_library_end() const override;
- error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const;
- error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
- bool &IsDefault) const;
-
+ std::error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const;
+ std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
+ bool &IsDefault) const;
uint8_t getBytesInAddress() const override;
StringRef getFileFormatName() const override;
@@ -212,8 +224,8 @@ void ELFObjectFile<ELFT>::moveSymbolNext(DataRefImpl &Symb) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb,
- StringRef &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb,
+ StringRef &Result) const {
ErrorOr<StringRef> Name = EF.getSymbolName(toELFSymIter(Symb));
if (!Name)
return Name.getError();
@@ -222,9 +234,9 @@ error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
- StringRef &Version,
- bool &IsDefault) const {
+std::error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
+ StringRef &Version,
+ bool &IsDefault) const {
DataRefImpl Symb = SymRef.getRawDataRefImpl();
const Elf_Sym *symb = getSymbol(Symb);
ErrorOr<StringRef> Ver =
@@ -236,8 +248,8 @@ error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
- uint64_t &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Result) const {
const Elf_Sym *ESym = getSymbol(Symb);
switch (EF.getSymbolTableIndex(ESym)) {
case ELF::SHN_COMMON:
@@ -265,8 +277,8 @@ error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolAlignment(DataRefImpl Symb,
- uint32_t &Res) const {
+std::error_code ELFObjectFile<ELFT>::getSymbolAlignment(DataRefImpl Symb,
+ uint32_t &Res) const {
Elf_Sym_Iter Sym = toELFSymIter(Symb);
if (Sym->st_shndx == ELF::SHN_COMMON)
Res = Sym->st_value;
@@ -276,15 +288,16 @@ error_code ELFObjectFile<ELFT>::getSymbolAlignment(DataRefImpl Symb,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Symb,
- uint64_t &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Symb,
+ uint64_t &Result) const {
Result = toELFSymIter(Symb)->st_size;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Result) const {
const Elf_Sym *ESym = getSymbol(Symb);
switch (ESym->getType()) {
@@ -343,8 +356,9 @@ uint32_t ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Symb) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const {
+std::error_code
+ELFObjectFile<ELFT>::getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const {
const Elf_Sym *ESym = getSymbol(Symb);
const Elf_Shdr *ESec = EF.getSection(ESym);
if (!ESec)
@@ -363,8 +377,8 @@ void ELFObjectFile<ELFT>::moveSectionNext(DataRefImpl &Sec) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec,
- StringRef &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec,
+ StringRef &Result) const {
ErrorOr<StringRef> Name = EF.getSectionName(&*toELFShdrIter(Sec));
if (!Name)
return Name.getError();
@@ -373,44 +387,46 @@ error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec,
- uint64_t &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec,
+ uint64_t &Result) const {
Result = toELFShdrIter(Sec)->sh_addr;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec,
- uint64_t &Result) const {
+std::error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec,
+ uint64_t &Result) const {
Result = toELFShdrIter(Sec)->sh_size;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec,
- StringRef &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec,
+ StringRef &Result) const {
Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
Result = StringRef((const char *)base() + EShdr->sh_offset, EShdr->sh_size);
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec,
- uint64_t &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Result) const {
Result = toELFShdrIter(Sec)->sh_addralign;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec,
+ bool &Result) const {
Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_EXECINSTR;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
+ bool &Result) const {
Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
EShdr->sh_type == ELF::SHT_PROGBITS;
@@ -418,8 +434,8 @@ error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
+ bool &Result) const {
Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
EShdr->sh_type == ELF::SHT_NOBITS;
@@ -427,7 +443,7 @@ error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
}
template <class ELFT>
-error_code
+std::error_code
ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec,
bool &Result) const {
Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC;
@@ -435,31 +451,31 @@ ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec,
+ bool &Result) const {
Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec,
+ bool &Result) const {
Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Result) const {
Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
Result = !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
return object_error::success;
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
- DataRefImpl Symb,
- bool &Result) const {
+std::error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
+ DataRefImpl Symb,
+ bool &Result) const {
Elf_Sym_Iter ESym = toELFSymIter(Symb);
uintX_t Index = ESym->st_shndx;
@@ -553,8 +569,9 @@ ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel,
- uint64_t &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Result) const {
uint64_t ROffset = getROffset(Rel);
const Elf_Ehdr *Header = EF.getHeader();
@@ -570,8 +587,9 @@ error_code ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel,
- uint64_t &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Result) const {
assert(EF.getHeader()->e_type == ELF::ET_REL &&
"Only relocatable object files have relocation offsets");
Result = getROffset(Rel);
@@ -592,8 +610,8 @@ uint64_t ELFObjectFile<ELFT>::getROffset(DataRefImpl Rel) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationType(DataRefImpl Rel,
- uint64_t &Result) const {
+std::error_code ELFObjectFile<ELFT>::getRelocationType(DataRefImpl Rel,
+ uint64_t &Result) const {
const Elf_Shdr *sec = getRelSection(Rel);
switch (sec->sh_type) {
default:
@@ -616,7 +634,7 @@ StringRef ELFObjectFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationTypeName(
+std::error_code ELFObjectFile<ELFT>::getRelocationTypeName(
DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
const Elf_Shdr *sec = getRelSection(Rel);
uint32_t type;
@@ -638,8 +656,9 @@ error_code ELFObjectFile<ELFT>::getRelocationTypeName(
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationAddend(DataRefImpl Rel,
- int64_t &Result) const {
+std::error_code
+ELFObjectFile<ELFT>::getRelocationAddend(DataRefImpl Rel,
+ int64_t &Result) const {
const Elf_Shdr *sec = getRelSection(Rel);
switch (sec->sh_type) {
default:
@@ -656,7 +675,7 @@ error_code ELFObjectFile<ELFT>::getRelocationAddend(DataRefImpl Rel,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationValueString(
+std::error_code ELFObjectFile<ELFT>::getRelocationValueString(
DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
const Elf_Shdr *sec = getRelSection(Rel);
uint8_t type;
@@ -754,13 +773,13 @@ ELFObjectFile<ELFT>::getRela(DataRefImpl Rela) const {
}
template <class ELFT>
-ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec,
- bool BufferOwned)
+ELFObjectFile<ELFT>::ELFObjectFile(std::unique_ptr<MemoryBuffer> Object,
+ std::error_code &EC)
: ObjectFile(getELFType(static_cast<endianness>(ELFT::TargetEndianness) ==
support::little,
ELFT::Is64Bits),
- Object, BufferOwned),
- EF(Object, ec) {}
+ std::move(Object)),
+ EF(Data->getBuffer(), EC) {}
template <class ELFT>
basic_symbol_iterator ELFObjectFile<ELFT>::symbol_begin_impl() const {
@@ -817,8 +836,8 @@ library_iterator ELFObjectFile<ELFT>::needed_library_begin() const {
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
- LibraryRef &Result) const {
+std::error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
+ LibraryRef &Result) const {
Elf_Dyn_Iter DI = toELFDynIter(Data);
Elf_Dyn_Iter DE = EF.end_dynamic_table();
@@ -832,8 +851,8 @@ error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
}
template <class ELFT>
-error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data,
- StringRef &Res) const {
+std::error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data,
+ StringRef &Res) const {
Res = EF.getDynamicString(toELFDynIter(Data)->getVal());
return object_error::success;
}
@@ -898,6 +917,7 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
template <class ELFT>
unsigned ELFObjectFile<ELFT>::getArch() const {
+ bool IsLittleEndian = ELFT::TargetEndianness == support::little;
switch (EF.getHeader()->e_machine) {
case ELF::EM_386:
return Triple::x86;
@@ -910,11 +930,16 @@ unsigned ELFObjectFile<ELFT>::getArch() const {
case ELF::EM_HEXAGON:
return Triple::hexagon;
case ELF::EM_MIPS:
- return (ELFT::TargetEndianness == support::little) ? Triple::mipsel
- : Triple::mips;
+ switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) {
+ case ELF::ELFCLASS32:
+ return IsLittleEndian ? Triple::mipsel : Triple::mips;
+ case ELF::ELFCLASS64:
+ return IsLittleEndian ? Triple::mips64el : Triple::mips64;
+ default:
+ report_fatal_error("Invalid ELFCLASS!");
+ }
case ELF::EM_PPC64:
- return (ELFT::TargetEndianness == support::little) ? Triple::ppc64le
- : Triple::ppc64;
+ return IsLittleEndian ? Triple::ppc64le : Triple::ppc64;
case ELF::EM_S390:
return Triple::systemz;
@@ -931,8 +956,8 @@ unsigned ELFObjectFile<ELFT>::getArch() const {
/// FIXME: Maybe we should have a base ElfObjectFile that is not a template
/// and make these member functions?
-inline error_code getELFRelocationAddend(const RelocationRef R,
- int64_t &Addend) {
+inline std::error_code getELFRelocationAddend(const RelocationRef R,
+ int64_t &Addend) {
const ObjectFile *Obj = R.getObjectFile();
DataRefImpl DRI = R.getRawDataRefImpl();
// Little-endian 32-bit
@@ -975,9 +1000,10 @@ getELFDynamicSymbolIterators(SymbolicFile *Obj) {
/// This is a generic interface for retrieving GNU symbol version
/// information from an ELFObjectFile.
-inline error_code GetELFSymbolVersion(const ObjectFile *Obj,
- const SymbolRef &Sym, StringRef &Version,
- bool &IsDefault) {
+inline std::error_code GetELFSymbolVersion(const ObjectFile *Obj,
+ const SymbolRef &Sym,
+ StringRef &Version,
+ bool &IsDefault) {
// Little-endian 32-bit
if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
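With the free helpers above now surfacing std::error_code, callers change only in the spelling of the error type. A minimal caller-side sketch, assuming an ELF-backed RelocationRef (the function name and error strategy are illustrative, not part of this patch):

  #include "llvm/Object/ELFObjectFile.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: read a relocation addend through the updated free helper.
  static void printAddend(const object::RelocationRef &R) {
    int64_t Addend;
    if (std::error_code EC = object::getELFRelocationAddend(R, Addend))
      report_fatal_error(EC.message());
    outs() << "addend: " << Addend << "\n";
  }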
diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h
index 524e55b..fc8cc95 100644
--- a/include/llvm/Object/ELFYAML.h
+++ b/include/llvm/Object/ELFYAML.h
@@ -16,7 +16,7 @@
#ifndef LLVM_OBJECT_ELFYAML_H
#define LLVM_OBJECT_ELFYAML_H
-#include "llvm/Object/YAML.h"
+#include "llvm/MC/YAML.h"
#include "llvm/Support/ELF.h"
namespace llvm {
@@ -44,6 +44,7 @@ LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_REL)
// Just use 64, since it can hold 32-bit values too.
LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF)
LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT)
+LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STV)
// For now, hardcode 64 bits everywhere that 32 or 64 would be needed
// since 64-bit can hold 32-bit values too.
@@ -62,6 +63,7 @@ struct Symbol {
StringRef Section;
llvm::yaml::Hex64 Value;
llvm::yaml::Hex64 Size;
+ ELF_STV Visibility;
};
struct LocalGlobalWeakSymbols {
std::vector<Symbol> Local;
@@ -76,13 +78,12 @@ struct Section {
ELF_SHF Flags;
llvm::yaml::Hex64 Address;
StringRef Link;
- StringRef Info;
llvm::yaml::Hex64 AddressAlign;
Section(SectionKind Kind) : Kind(Kind) {}
virtual ~Section();
};
struct RawContentSection : Section {
- object::yaml::BinaryRef Content;
+ yaml::BinaryRef Content;
llvm::yaml::Hex64 Size;
RawContentSection() : Section(SectionKind::RawContent) {}
static bool classof(const Section *S) {
@@ -96,6 +97,7 @@ struct Relocation {
StringRef Symbol;
};
struct RelocationSection : Section {
+ StringRef Info;
std::vector<Relocation> Relocations;
RelocationSection() : Section(SectionKind::Relocation) {}
static bool classof(const Section *S) {
@@ -168,6 +170,11 @@ struct ScalarEnumerationTraits<ELFYAML::ELF_STT> {
};
template <>
+struct ScalarEnumerationTraits<ELFYAML::ELF_STV> {
+ static void enumeration(IO &IO, ELFYAML::ELF_STV &Value);
+};
+
+template <>
struct ScalarEnumerationTraits<ELFYAML::ELF_REL> {
static void enumeration(IO &IO, ELFYAML::ELF_REL &Value);
};
diff --git a/include/llvm/Object/Error.h b/include/llvm/Object/Error.h
index 779c747..701da12 100644
--- a/include/llvm/Object/Error.h
+++ b/include/llvm/Object/Error.h
@@ -14,38 +14,32 @@
#ifndef LLVM_OBJECT_ERROR_H
#define LLVM_OBJECT_ERROR_H
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
namespace object {
-const error_category &object_category();
+const std::error_category &object_category();
-struct object_error {
- enum Impl {
- success = 0,
- arch_not_found,
- invalid_file_type,
- parse_failed,
- unexpected_eof
- };
- Impl V;
-
- object_error(Impl V) : V(V) {}
- operator Impl() const { return V; }
+enum class object_error {
+ success = 0,
+ arch_not_found,
+ invalid_file_type,
+ parse_failed,
+ unexpected_eof
};
-inline error_code make_error_code(object_error e) {
- return error_code(static_cast<int>(e), object_category());
+inline std::error_code make_error_code(object_error e) {
+ return std::error_code(static_cast<int>(e), object_category());
}
} // end namespace object.
-template <> struct is_error_code_enum<object::object_error> : std::true_type {};
+} // end namespace llvm.
+namespace std {
template <>
-struct is_error_code_enum<object::object_error::Impl> : std::true_type {};
-
-} // end namespace llvm.
+struct is_error_code_enum<llvm::object::object_error> : std::true_type {};
+}
#endif
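Because the is_error_code_enum specialization now lives in namespace std, an object_error enumerator converts to std::error_code through the standard machinery (the converting constructor finds make_error_code by ADL). A small sketch of the resulting client idiom:

  #include "llvm/Object/Error.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: the enum class converts implicitly, and comparisons still work.
  static void reportParseFailure() {
    std::error_code EC = object::object_error::parse_failed;
    if (EC == object::object_error::parse_failed)
      errs() << "object error: " << EC.message() << "\n";
  }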
diff --git a/include/llvm/Object/IRObjectFile.h b/include/llvm/Object/IRObjectFile.h
index 78f5b2b..b33cc26 100644
--- a/include/llvm/Object/IRObjectFile.h
+++ b/include/llvm/Object/IRObjectFile.h
@@ -25,20 +25,33 @@ namespace object {
class IRObjectFile : public SymbolicFile {
std::unique_ptr<Module> M;
std::unique_ptr<Mangler> Mang;
+ std::vector<std::pair<std::string, uint32_t>> AsmSymbols;
public:
- IRObjectFile(MemoryBuffer *Object, error_code &EC, LLVMContext &Context,
- bool BufferOwned);
+ IRObjectFile(std::unique_ptr<MemoryBuffer> Object, std::unique_ptr<Module> M);
+ ~IRObjectFile();
void moveSymbolNext(DataRefImpl &Symb) const override;
- error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override;
+ std::error_code printSymbolName(raw_ostream &OS,
+ DataRefImpl Symb) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
- const GlobalValue &getSymbolGV(DataRefImpl Symb) const;
+ const GlobalValue *getSymbolGV(DataRefImpl Symb) const;
basic_symbol_iterator symbol_begin_impl() const override;
basic_symbol_iterator symbol_end_impl() const override;
+ const Module &getModule() const {
+ return const_cast<IRObjectFile*>(this)->getModule();
+ }
+ Module &getModule() {
+ return *M;
+ }
+
static inline bool classof(const Binary *v) {
return v->isIR();
}
+
+ static ErrorOr<IRObjectFile *>
+ createIRObjectFile(std::unique_ptr<MemoryBuffer> Object,
+ LLVMContext &Context);
};
}
}
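The static factory replaces the old error_code-out constructor and takes ownership of the buffer. A hedged usage sketch (loadIR and its error strategy are assumptions for illustration):

  #include "llvm/IR/Module.h"
  #include "llvm/Object/IRObjectFile.h"
  #include "llvm/Support/ErrorHandling.h"
  using namespace llvm;

  // Sketch: wrap a bitcode MemoryBuffer as an IRObjectFile.
  static void loadIR(std::unique_ptr<MemoryBuffer> Buffer, LLVMContext &Ctx) {
    ErrorOr<object::IRObjectFile *> ObjOrErr =
        object::IRObjectFile::createIRObjectFile(std::move(Buffer), Ctx);
    if (std::error_code EC = ObjOrErr.getError())
      report_fatal_error(EC.message());
    std::unique_ptr<object::IRObjectFile> Obj(ObjOrErr.get());
    Module &M = Obj->getModule(); // the Module stays owned by Obj
    (void)M;
  }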
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index 710ad7e..e93ebb8 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -40,9 +40,9 @@ public:
void moveNext();
- error_code getOffset(uint32_t &Result) const;
- error_code getLength(uint16_t &Result) const;
- error_code getKind(uint16_t &Result) const;
+ std::error_code getOffset(uint32_t &Result) const;
+ std::error_code getLength(uint16_t &Result) const;
+ std::error_code getKind(uint16_t &Result) const;
DataRefImpl getRawDataRefImpl() const;
const ObjectFile *getObjectFile() const;
@@ -56,54 +56,75 @@ public:
MachO::load_command C; // The command itself.
};
- MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, bool Is64Bits,
- error_code &EC, bool BufferOwned = true);
+ MachOObjectFile(std::unique_ptr<MemoryBuffer> Object, bool IsLittleEndian,
+ bool Is64Bits, std::error_code &EC);
void moveSymbolNext(DataRefImpl &Symb) const override;
- error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const override;
- error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const override;
- error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const override;
- error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
- error_code getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Res) const override;
+ std::error_code getSymbolName(DataRefImpl Symb,
+ StringRef &Res) const override;
+
+ // MachO specific.
+ std::error_code getIndirectName(DataRefImpl Symb, StringRef &Res) const;
+
+ std::error_code getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Res) const override;
+ std::error_code getSymbolAlignment(DataRefImpl Symb,
+ uint32_t &Res) const override;
+ std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
+ std::error_code getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Res) const override;
uint32_t getSymbolFlags(DataRefImpl Symb) const override;
- error_code getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const override;
+ std::error_code getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const override;
void moveSectionNext(DataRefImpl &Sec) const override;
- error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
- error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override;
- error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const override;
- error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionRequiredForExecution(DataRefImpl Sec,
- bool &Res) const override;
- error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
- error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const override;
- error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
- bool &Result) const override;
+ std::error_code getSectionName(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAddress(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+ std::error_code getSectionContents(DataRefImpl Sec,
+ StringRef &Res) const override;
+ std::error_code getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const override;
+ std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
+ std::error_code isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Res) const override;
+ std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
+ bool &Result) const override;
relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
relocation_iterator section_rel_end(DataRefImpl Sec) const override;
void moveRelocationNext(DataRefImpl &Rel) const override;
- error_code getRelocationAddress(DataRefImpl Rel,
- uint64_t &Res) const override;
- error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const override;
+ std::error_code getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const override;
symbol_iterator getRelocationSymbol(DataRefImpl Rel) const override;
- error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const override;
- error_code getRelocationTypeName(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const override;
- error_code getRelocationValueString(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const override;
- error_code getRelocationHidden(DataRefImpl Rel, bool &Result) const override;
+ std::error_code getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const override;
+ std::error_code
+ getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const override;
+ std::error_code
+ getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const override;
+ std::error_code getRelocationHidden(DataRefImpl Rel,
+ bool &Result) const override;
+
+ std::error_code getLibraryNext(DataRefImpl LibData,
+ LibraryRef &Res) const override;
+ std::error_code getLibraryPath(DataRefImpl LibData,
+ StringRef &Res) const override;
- error_code getLibraryNext(DataRefImpl LibData,
- LibraryRef &Res) const override;
- error_code getLibraryPath(DataRefImpl LibData, StringRef &Res) const override;
+ // MachO specific.
+ std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &Res);
// TODO: Would be useful to have an iterator based version
// of the load command interface too.
@@ -180,6 +201,8 @@ public:
getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const;
MachO::version_min_command
getVersionMinLoadCommand(const LoadCommandInfo &L) const;
+ MachO::dylib_command
+ getDylibIDLoadCommand(const LoadCommandInfo &L) const;
MachO::any_relocation_info getRelocation(DataRefImpl Rel) const;
MachO::data_in_code_entry getDice(DataRefImpl Rel) const;
@@ -198,15 +221,27 @@ public:
bool is64Bit() const;
void ReadULEB128s(uint64_t Index, SmallVectorImpl<uint64_t> &Out) const;
+ static StringRef guessLibraryShortName(StringRef Name, bool &isFramework,
+ StringRef &Suffix);
+
static Triple::ArchType getArch(uint32_t CPUType);
+ static Triple getArch(uint32_t CPUType, uint32_t CPUSubType);
+ static Triple getArch(StringRef ArchFlag);
+ static Triple getHostArch();
static bool classof(const Binary *v) {
return v->isMachO();
}
+ const char *getSectionPointer(DataRefImpl Rel) const;
+
private:
- typedef SmallVector<const char*, 1> SectionList;
+ typedef SmallVector<const char *, 1> SectionList;
SectionList Sections;
+ typedef SmallVector<const char *, 1> LibraryList;
+ LibraryList Libraries;
+ typedef SmallVector<StringRef, 1> LibraryShortName;
+ LibraryShortName LibrariesShortNames;
const char *SymtabLoadCmd;
const char *DysymtabLoadCmd;
const char *DataInCodeLoadCmd;
@@ -234,7 +269,7 @@ inline void DiceRef::moveNext() {
// the OwningObject ObjectFile is a MachOObjectFile a static_cast<> is used for
// the methods that get the values of the fields of the reference.
-inline error_code DiceRef::getOffset(uint32_t &Result) const {
+inline std::error_code DiceRef::getOffset(uint32_t &Result) const {
const MachOObjectFile *MachOOF =
static_cast<const MachOObjectFile *>(OwningObject);
MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
@@ -242,7 +277,7 @@ inline error_code DiceRef::getOffset(uint32_t &Result) const {
return object_error::success;
}
-inline error_code DiceRef::getLength(uint16_t &Result) const {
+inline std::error_code DiceRef::getLength(uint16_t &Result) const {
const MachOObjectFile *MachOOF =
static_cast<const MachOObjectFile *>(OwningObject);
MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
@@ -250,7 +285,7 @@ inline error_code DiceRef::getLength(uint16_t &Result) const {
return object_error::success;
}
-inline error_code DiceRef::getKind(uint16_t &Result) const {
+inline std::error_code DiceRef::getKind(uint16_t &Result) const {
const MachOObjectFile *MachOOF =
static_cast<const MachOObjectFile *>(OwningObject);
MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
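Mechanically this is the same interface with std::error_code spelled out. A sketch of a symbol walk against the rewritten accessors (dumpSymbols is illustrative; symbol_begin/symbol_end are assumed from the ObjectFile base):

  #include "llvm/Object/MachO.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: print each Mach-O symbol name, checking the new error type.
  static void dumpSymbols(const object::MachOObjectFile &O) {
    for (object::symbol_iterator I = O.symbol_begin(), E = O.symbol_end();
         I != E; ++I) {
      StringRef Name;
      if (std::error_code EC = I->getName(Name))
        report_fatal_error(EC.message());
      outs() << Name << "\n";
    }
  }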
diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h
index d27c824..e6677f5 100644
--- a/include/llvm/Object/MachOUniversal.h
+++ b/include/llvm/Object/MachOUniversal.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/Archive.h"
+#include "llvm/Object/MachO.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MachO.h"
@@ -52,10 +53,14 @@ public:
ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); }
uint32_t getCPUType() const { return Header.cputype; }
+ std::string getArchTypeName() const {
+ Triple T = MachOObjectFile::getArch(Header.cputype, Header.cpusubtype);
+ return T.getArchName();
+ }
- error_code getAsObjectFile(std::unique_ptr<ObjectFile> &Result) const;
+ ErrorOr<std::unique_ptr<ObjectFile>> getAsObjectFile() const;
- error_code getAsArchive(std::unique_ptr<Archive> &Result) const;
+ std::error_code getAsArchive(std::unique_ptr<Archive> &Result) const;
};
class object_iterator {
@@ -79,8 +84,10 @@ public:
}
};
- MachOUniversalBinary(MemoryBuffer *Source, error_code &ec);
- static ErrorOr<MachOUniversalBinary*> create(MemoryBuffer *Source);
+ MachOUniversalBinary(std::unique_ptr<MemoryBuffer> Source,
+ std::error_code &ec);
+ static ErrorOr<MachOUniversalBinary *>
+ create(std::unique_ptr<MemoryBuffer> Source);
object_iterator begin_objects() const {
return ObjectForArch(this, 0);
@@ -96,8 +103,8 @@ public:
return V->isMachOUniversalBinary();
}
- error_code getObjectForArch(Triple::ArchType Arch,
- std::unique_ptr<ObjectFile> &Result) const;
+ ErrorOr<std::unique_ptr<ObjectFile>>
+ getObjectForArch(Triple::ArchType Arch) const;
};
}
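getAsObjectFile and getObjectForArch now hand back ErrorOr<std::unique_ptr<ObjectFile>> instead of filling an out-parameter. A sketch of extracting one slice from a fat binary (takeSlice is illustrative):

  #include "llvm/Object/MachOUniversal.h"
  #include "llvm/Support/ErrorHandling.h"
  using namespace llvm;

  // Sketch: pull the x86-64 slice out of a universal binary.
  static std::unique_ptr<object::ObjectFile>
  takeSlice(const object::MachOUniversalBinary &UB) {
    ErrorOr<std::unique_ptr<object::ObjectFile>> SliceOrErr =
        UB.getObjectForArch(Triple::x86_64);
    if (std::error_code EC = SliceOrErr.getError())
      report_fatal_error(EC.message());
    return std::move(SliceOrErr.get());
  }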
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 10209b9..646abf8 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -46,26 +46,26 @@ public:
void moveNext();
- error_code getAddress(uint64_t &Result) const;
- error_code getOffset(uint64_t &Result) const;
+ std::error_code getAddress(uint64_t &Result) const;
+ std::error_code getOffset(uint64_t &Result) const;
symbol_iterator getSymbol() const;
- error_code getType(uint64_t &Result) const;
+ std::error_code getType(uint64_t &Result) const;
/// @brief Indicates whether this relocation should be hidden when listing
/// relocations, usually because it is the trailing part of a multipart
/// relocation that will be printed as part of the leading relocation.
- error_code getHidden(bool &Result) const;
+ std::error_code getHidden(bool &Result) const;
/// @brief Get a string that represents the type of this relocation.
///
/// This is for display purposes only.
- error_code getTypeName(SmallVectorImpl<char> &Result) const;
+ std::error_code getTypeName(SmallVectorImpl<char> &Result) const;
/// @brief Get a string that represents the calculation of the value of this
/// relocation.
///
/// This is for display purposes only.
- error_code getValueString(SmallVectorImpl<char> &Result) const;
+ std::error_code getValueString(SmallVectorImpl<char> &Result) const;
DataRefImpl getRawDataRefImpl() const;
const ObjectFile *getObjectFile() const;
@@ -92,24 +92,24 @@ public:
void moveNext();
- error_code getName(StringRef &Result) const;
- error_code getAddress(uint64_t &Result) const;
- error_code getSize(uint64_t &Result) const;
- error_code getContents(StringRef &Result) const;
+ std::error_code getName(StringRef &Result) const;
+ std::error_code getAddress(uint64_t &Result) const;
+ std::error_code getSize(uint64_t &Result) const;
+ std::error_code getContents(StringRef &Result) const;
/// @brief Get the alignment of this section as the actual value (not log 2).
- error_code getAlignment(uint64_t &Result) const;
+ std::error_code getAlignment(uint64_t &Result) const;
// FIXME: Move to the normalization layer when it's created.
- error_code isText(bool &Result) const;
- error_code isData(bool &Result) const;
- error_code isBSS(bool &Result) const;
- error_code isRequiredForExecution(bool &Result) const;
- error_code isVirtual(bool &Result) const;
- error_code isZeroInit(bool &Result) const;
- error_code isReadOnlyData(bool &Result) const;
+ std::error_code isText(bool &Result) const;
+ std::error_code isData(bool &Result) const;
+ std::error_code isBSS(bool &Result) const;
+ std::error_code isRequiredForExecution(bool &Result) const;
+ std::error_code isVirtual(bool &Result) const;
+ std::error_code isZeroInit(bool &Result) const;
+ std::error_code isReadOnlyData(bool &Result) const;
- error_code containsSymbol(SymbolRef S, bool &Result) const;
+ std::error_code containsSymbol(SymbolRef S, bool &Result) const;
relocation_iterator relocation_begin() const;
relocation_iterator relocation_end() const;
@@ -141,18 +141,18 @@ public:
SymbolRef(DataRefImpl SymbolP, const ObjectFile *Owner);
- error_code getName(StringRef &Result) const;
+ std::error_code getName(StringRef &Result) const;
/// Returns the symbol virtual address (i.e. address at which it will be
/// mapped).
- error_code getAddress(uint64_t &Result) const;
+ std::error_code getAddress(uint64_t &Result) const;
/// @brief Get the alignment of this symbol as the actual value (not log 2).
- error_code getAlignment(uint32_t &Result) const;
- error_code getSize(uint64_t &Result) const;
- error_code getType(SymbolRef::Type &Result) const;
+ std::error_code getAlignment(uint32_t &Result) const;
+ std::error_code getSize(uint64_t &Result) const;
+ std::error_code getType(SymbolRef::Type &Result) const;
/// @brief Get section this symbol is defined in reference to. Result is
/// end_sections() if it is undefined or is an absolute symbol.
- error_code getSection(section_iterator &Result) const;
+ std::error_code getSection(section_iterator &Result) const;
const ObjectFile *getObject() const;
};
@@ -190,10 +190,10 @@ public:
bool operator==(const LibraryRef &Other) const;
bool operator<(const LibraryRef &Other) const;
- error_code getNext(LibraryRef &Result) const;
+ std::error_code getNext(LibraryRef &Result) const;
// Get the path to this library, as stored in the object file.
- error_code getPath(StringRef &Result) const;
+ std::error_code getPath(StringRef &Result) const;
DataRefImpl getRawDataRefImpl() const;
};
@@ -208,7 +208,7 @@ class ObjectFile : public SymbolicFile {
ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION;
protected:
- ObjectFile(unsigned int Type, MemoryBuffer *Source, bool BufferOwned = true);
+ ObjectFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
const uint8_t *base() const {
return reinterpret_cast<const uint8_t *>(Data->getBufferStart());
@@ -223,35 +223,49 @@ protected:
// Implementations assume that the DataRefImpl is valid and has not been
// modified externally. It's UB otherwise.
friend class SymbolRef;
- virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const = 0;
- error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override;
- virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const = 0;
- virtual error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const;
- virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const = 0;
- virtual error_code getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Res) const = 0;
- virtual error_code getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const = 0;
+ virtual std::error_code getSymbolName(DataRefImpl Symb,
+ StringRef &Res) const = 0;
+ std::error_code printSymbolName(raw_ostream &OS,
+ DataRefImpl Symb) const override;
+ virtual std::error_code getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Res) const = 0;
+ virtual std::error_code getSymbolAlignment(DataRefImpl Symb,
+ uint32_t &Res) const;
+ virtual std::error_code getSymbolSize(DataRefImpl Symb,
+ uint64_t &Res) const = 0;
+ virtual std::error_code getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Res) const = 0;
+ virtual std::error_code getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const = 0;
// Same as above for SectionRef.
friend class SectionRef;
virtual void moveSectionNext(DataRefImpl &Sec) const = 0;
- virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const = 0;
- virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const =0;
- virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const = 0;
- virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res)const=0;
- virtual error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res)const=0;
- virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const = 0;
- virtual error_code isSectionData(DataRefImpl Sec, bool &Res) const = 0;
- virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const = 0;
- virtual error_code isSectionRequiredForExecution(DataRefImpl Sec,
- bool &Res) const = 0;
+ virtual std::error_code getSectionName(DataRefImpl Sec,
+ StringRef &Res) const = 0;
+ virtual std::error_code getSectionAddress(DataRefImpl Sec,
+ uint64_t &Res) const = 0;
+ virtual std::error_code getSectionSize(DataRefImpl Sec,
+ uint64_t &Res) const = 0;
+ virtual std::error_code getSectionContents(DataRefImpl Sec,
+ StringRef &Res) const = 0;
+ virtual std::error_code getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const = 0;
+ virtual std::error_code isSectionText(DataRefImpl Sec, bool &Res) const = 0;
+ virtual std::error_code isSectionData(DataRefImpl Sec, bool &Res) const = 0;
+ virtual std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const = 0;
+ virtual std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
+ bool &Res) const = 0;
// A section is 'virtual' if its contents aren't present in the object image.
- virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const = 0;
- virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const = 0;
- virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const =0;
- virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
- bool &Result) const = 0;
+ virtual std::error_code isSectionVirtual(DataRefImpl Sec,
+ bool &Res) const = 0;
+ virtual std::error_code isSectionZeroInit(DataRefImpl Sec,
+ bool &Res) const = 0;
+ virtual std::error_code isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Res) const = 0;
+ virtual std::error_code sectionContainsSymbol(DataRefImpl Sec,
+ DataRefImpl Symb,
+ bool &Result) const = 0;
virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0;
virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0;
virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
@@ -259,26 +273,31 @@ protected:
// Same as above for RelocationRef.
friend class RelocationRef;
virtual void moveRelocationNext(DataRefImpl &Rel) const = 0;
- virtual error_code getRelocationAddress(DataRefImpl Rel,
- uint64_t &Res) const =0;
- virtual error_code getRelocationOffset(DataRefImpl Rel,
- uint64_t &Res) const =0;
+ virtual std::error_code getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const = 0;
+ virtual std::error_code getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const = 0;
virtual symbol_iterator getRelocationSymbol(DataRefImpl Rel) const = 0;
- virtual error_code getRelocationType(DataRefImpl Rel,
- uint64_t &Res) const = 0;
- virtual error_code getRelocationTypeName(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const = 0;
- virtual error_code getRelocationValueString(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const = 0;
- virtual error_code getRelocationHidden(DataRefImpl Rel, bool &Result) const {
+ virtual std::error_code getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const = 0;
+ virtual std::error_code
+ getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const = 0;
+ virtual std::error_code
+ getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const = 0;
+ virtual std::error_code getRelocationHidden(DataRefImpl Rel,
+ bool &Result) const {
Result = false;
return object_error::success;
}
// Same for LibraryRef
friend class LibraryRef;
- virtual error_code getLibraryNext(DataRefImpl Lib, LibraryRef &Res) const = 0;
- virtual error_code getLibraryPath(DataRefImpl Lib, StringRef &Res) const = 0;
+ virtual std::error_code getLibraryNext(DataRefImpl Lib,
+ LibraryRef &Res) const = 0;
+ virtual std::error_code getLibraryPath(DataRefImpl Lib,
+ StringRef &Res) const = 0;
public:
typedef iterator_range<symbol_iterator> symbol_iterator_range;
@@ -314,11 +333,12 @@ public:
/// return true.
/// @brief Create ObjectFile from path.
static ErrorOr<ObjectFile *> createObjectFile(StringRef ObjectPath);
- static ErrorOr<ObjectFile *> createObjectFile(MemoryBuffer *Object,
- bool BufferOwned,
- sys::fs::file_magic Type);
- static ErrorOr<ObjectFile *> createObjectFile(MemoryBuffer *Object) {
- return createObjectFile(Object, true, sys::fs::file_magic::unknown);
+ static ErrorOr<ObjectFile *>
+ createObjectFile(std::unique_ptr<MemoryBuffer> &Object,
+ sys::fs::file_magic Type);
+ static ErrorOr<ObjectFile *>
+ createObjectFile(std::unique_ptr<MemoryBuffer> &Object) {
+ return createObjectFile(Object, sys::fs::file_magic::unknown);
}
@@ -327,39 +347,39 @@ public:
}
public:
- static ErrorOr<ObjectFile *> createCOFFObjectFile(MemoryBuffer *Object,
- bool BufferOwned = true);
- static ErrorOr<ObjectFile *> createELFObjectFile(MemoryBuffer *Object,
- bool BufferOwned = true);
- static ErrorOr<ObjectFile *> createMachOObjectFile(MemoryBuffer *Object,
- bool BufferOwned = true);
+ static ErrorOr<ObjectFile *>
+ createCOFFObjectFile(std::unique_ptr<MemoryBuffer> Object);
+ static ErrorOr<ObjectFile *>
+ createELFObjectFile(std::unique_ptr<MemoryBuffer> &Object);
+ static ErrorOr<ObjectFile *>
+ createMachOObjectFile(std::unique_ptr<MemoryBuffer> &Object);
};
// Inline function definitions.
inline SymbolRef::SymbolRef(DataRefImpl SymbolP, const ObjectFile *Owner)
: BasicSymbolRef(SymbolP, Owner) {}
-inline error_code SymbolRef::getName(StringRef &Result) const {
+inline std::error_code SymbolRef::getName(StringRef &Result) const {
return getObject()->getSymbolName(getRawDataRefImpl(), Result);
}
-inline error_code SymbolRef::getAddress(uint64_t &Result) const {
+inline std::error_code SymbolRef::getAddress(uint64_t &Result) const {
return getObject()->getSymbolAddress(getRawDataRefImpl(), Result);
}
-inline error_code SymbolRef::getAlignment(uint32_t &Result) const {
+inline std::error_code SymbolRef::getAlignment(uint32_t &Result) const {
return getObject()->getSymbolAlignment(getRawDataRefImpl(), Result);
}
-inline error_code SymbolRef::getSize(uint64_t &Result) const {
+inline std::error_code SymbolRef::getSize(uint64_t &Result) const {
return getObject()->getSymbolSize(getRawDataRefImpl(), Result);
}
-inline error_code SymbolRef::getSection(section_iterator &Result) const {
+inline std::error_code SymbolRef::getSection(section_iterator &Result) const {
return getObject()->getSymbolSection(getRawDataRefImpl(), Result);
}
-inline error_code SymbolRef::getType(SymbolRef::Type &Result) const {
+inline std::error_code SymbolRef::getType(SymbolRef::Type &Result) const {
return getObject()->getSymbolType(getRawDataRefImpl(), Result);
}
@@ -391,55 +411,56 @@ inline void SectionRef::moveNext() {
return OwningObject->moveSectionNext(SectionPimpl);
}
-inline error_code SectionRef::getName(StringRef &Result) const {
+inline std::error_code SectionRef::getName(StringRef &Result) const {
return OwningObject->getSectionName(SectionPimpl, Result);
}
-inline error_code SectionRef::getAddress(uint64_t &Result) const {
+inline std::error_code SectionRef::getAddress(uint64_t &Result) const {
return OwningObject->getSectionAddress(SectionPimpl, Result);
}
-inline error_code SectionRef::getSize(uint64_t &Result) const {
+inline std::error_code SectionRef::getSize(uint64_t &Result) const {
return OwningObject->getSectionSize(SectionPimpl, Result);
}
-inline error_code SectionRef::getContents(StringRef &Result) const {
+inline std::error_code SectionRef::getContents(StringRef &Result) const {
return OwningObject->getSectionContents(SectionPimpl, Result);
}
-inline error_code SectionRef::getAlignment(uint64_t &Result) const {
+inline std::error_code SectionRef::getAlignment(uint64_t &Result) const {
return OwningObject->getSectionAlignment(SectionPimpl, Result);
}
-inline error_code SectionRef::isText(bool &Result) const {
+inline std::error_code SectionRef::isText(bool &Result) const {
return OwningObject->isSectionText(SectionPimpl, Result);
}
-inline error_code SectionRef::isData(bool &Result) const {
+inline std::error_code SectionRef::isData(bool &Result) const {
return OwningObject->isSectionData(SectionPimpl, Result);
}
-inline error_code SectionRef::isBSS(bool &Result) const {
+inline std::error_code SectionRef::isBSS(bool &Result) const {
return OwningObject->isSectionBSS(SectionPimpl, Result);
}
-inline error_code SectionRef::isRequiredForExecution(bool &Result) const {
+inline std::error_code SectionRef::isRequiredForExecution(bool &Result) const {
return OwningObject->isSectionRequiredForExecution(SectionPimpl, Result);
}
-inline error_code SectionRef::isVirtual(bool &Result) const {
+inline std::error_code SectionRef::isVirtual(bool &Result) const {
return OwningObject->isSectionVirtual(SectionPimpl, Result);
}
-inline error_code SectionRef::isZeroInit(bool &Result) const {
+inline std::error_code SectionRef::isZeroInit(bool &Result) const {
return OwningObject->isSectionZeroInit(SectionPimpl, Result);
}
-inline error_code SectionRef::isReadOnlyData(bool &Result) const {
+inline std::error_code SectionRef::isReadOnlyData(bool &Result) const {
return OwningObject->isSectionReadOnlyData(SectionPimpl, Result);
}
-inline error_code SectionRef::containsSymbol(SymbolRef S, bool &Result) const {
+inline std::error_code SectionRef::containsSymbol(SymbolRef S,
+ bool &Result) const {
return OwningObject->sectionContainsSymbol(SectionPimpl,
S.getRawDataRefImpl(), Result);
}
@@ -474,11 +495,11 @@ inline void RelocationRef::moveNext() {
return OwningObject->moveRelocationNext(RelocationPimpl);
}
-inline error_code RelocationRef::getAddress(uint64_t &Result) const {
+inline std::error_code RelocationRef::getAddress(uint64_t &Result) const {
return OwningObject->getRelocationAddress(RelocationPimpl, Result);
}
-inline error_code RelocationRef::getOffset(uint64_t &Result) const {
+inline std::error_code RelocationRef::getOffset(uint64_t &Result) const {
return OwningObject->getRelocationOffset(RelocationPimpl, Result);
}
@@ -486,21 +507,21 @@ inline symbol_iterator RelocationRef::getSymbol() const {
return OwningObject->getRelocationSymbol(RelocationPimpl);
}
-inline error_code RelocationRef::getType(uint64_t &Result) const {
+inline std::error_code RelocationRef::getType(uint64_t &Result) const {
return OwningObject->getRelocationType(RelocationPimpl, Result);
}
-inline error_code RelocationRef::getTypeName(SmallVectorImpl<char> &Result)
- const {
+inline std::error_code
+RelocationRef::getTypeName(SmallVectorImpl<char> &Result) const {
return OwningObject->getRelocationTypeName(RelocationPimpl, Result);
}
-inline error_code RelocationRef::getValueString(SmallVectorImpl<char> &Result)
- const {
+inline std::error_code
+RelocationRef::getValueString(SmallVectorImpl<char> &Result) const {
return OwningObject->getRelocationValueString(RelocationPimpl, Result);
}
-inline error_code RelocationRef::getHidden(bool &Result) const {
+inline std::error_code RelocationRef::getHidden(bool &Result) const {
return OwningObject->getRelocationHidden(RelocationPimpl, Result);
}
@@ -525,11 +546,11 @@ inline bool LibraryRef::operator<(const LibraryRef &Other) const {
return LibraryPimpl < Other.LibraryPimpl;
}
-inline error_code LibraryRef::getNext(LibraryRef &Result) const {
+inline std::error_code LibraryRef::getNext(LibraryRef &Result) const {
return OwningObject->getLibraryNext(LibraryPimpl, Result);
}
-inline error_code LibraryRef::getPath(StringRef &Result) const {
+inline std::error_code LibraryRef::getPath(StringRef &Result) const {
return OwningObject->getLibraryPath(LibraryPimpl, Result);
}
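End to end the header reads as before, with every fallible accessor now returning std::error_code. A sketch that opens a file and walks its sections, assuming the path-based factory shown above and a sections() range on ObjectFile:

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: open an object file and list section names and sizes.
  static void listSections(StringRef Path) {
    ErrorOr<object::ObjectFile *> ObjOrErr =
        object::ObjectFile::createObjectFile(Path);
    if (std::error_code EC = ObjOrErr.getError())
      report_fatal_error(EC.message());
    std::unique_ptr<object::ObjectFile> Obj(ObjOrErr.get());
    for (const object::SectionRef &Sec : Obj->sections()) {
      StringRef Name;
      uint64_t Size;
      if (!Sec.getName(Name) && !Sec.getSize(Size))
        outs() << Name << ": " << Size << " bytes\n";
    }
  }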
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index a3aaf17..5ca2450 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -253,12 +253,14 @@ private:
/// PPC64 ELF
RelocToApply visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getAddend64BE(R);
+ int64_t Addend;
+ getELFRelocationAddend(R, Addend);
uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
return RelocToApply(Res, 4);
}
RelocToApply visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getAddend64BE(R);
+ int64_t Addend;
+ getELFRelocationAddend(R, Addend);
return RelocToApply(Value + Addend, 8);
}
diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h
index 28400e1..77eef4a 100644
--- a/include/llvm/Object/SymbolicFile.h
+++ b/include/llvm/Object/SymbolicFile.h
@@ -86,7 +86,8 @@ public:
SF_Weak = 1U << 2, // Weak symbol
SF_Absolute = 1U << 3, // Absolute symbol
SF_Common = 1U << 4, // Symbol has common linkage
- SF_FormatSpecific = 1U << 5 // Specific to the object file format
+ SF_Indirect = 1U << 5, // Symbol is an alias to another symbol
+ SF_FormatSpecific = 1U << 6 // Specific to the object file format
// (e.g. section symbols)
};
@@ -98,7 +99,7 @@ public:
void moveNext();
- error_code printName(raw_ostream &OS) const;
+ std::error_code printName(raw_ostream &OS) const;
/// Get symbol flags (bitwise OR of SymbolRef::Flags)
uint32_t getFlags() const;
@@ -114,13 +115,13 @@ const uint64_t UnknownAddressOrSize = ~0ULL;
class SymbolicFile : public Binary {
public:
virtual ~SymbolicFile();
- SymbolicFile(unsigned int Type, MemoryBuffer *Source, bool BufferOwned);
+ SymbolicFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
// virtual interface.
virtual void moveSymbolNext(DataRefImpl &Symb) const = 0;
- virtual error_code printSymbolName(raw_ostream &OS,
- DataRefImpl Symb) const = 0;
+ virtual std::error_code printSymbolName(raw_ostream &OS,
+ DataRefImpl Symb) const = 0;
virtual uint32_t getSymbolFlags(DataRefImpl Symb) const = 0;
@@ -135,20 +136,19 @@ public:
basic_symbol_iterator symbol_end() const {
return symbol_end_impl();
}
+ typedef iterator_range<basic_symbol_iterator> basic_symbol_iterator_range;
+ basic_symbol_iterator_range symbols() const {
+ return basic_symbol_iterator_range(symbol_begin(), symbol_end());
+ }
// construction aux.
- static ErrorOr<SymbolicFile *> createIRObjectFile(MemoryBuffer *Object,
- LLVMContext &Context,
- bool BufferOwned = true);
-
- static ErrorOr<SymbolicFile *> createSymbolicFile(MemoryBuffer *Object,
- bool BufferOwned,
- sys::fs::file_magic Type,
- LLVMContext *Context);
-
- static ErrorOr<SymbolicFile *> createSymbolicFile(MemoryBuffer *Object) {
- return createSymbolicFile(Object, true, sys::fs::file_magic::unknown,
- nullptr);
+ static ErrorOr<SymbolicFile *>
+ createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object,
+ sys::fs::file_magic Type, LLVMContext *Context);
+
+ static ErrorOr<SymbolicFile *>
+ createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object) {
+ return createSymbolicFile(Object, sys::fs::file_magic::unknown, nullptr);
}
static ErrorOr<SymbolicFile *> createSymbolicFile(StringRef ObjectPath);
@@ -173,7 +173,7 @@ inline void BasicSymbolRef::moveNext() {
return OwningObject->moveSymbolNext(SymbolPimpl);
}
-inline error_code BasicSymbolRef::printName(raw_ostream &OS) const {
+inline std::error_code BasicSymbolRef::printName(raw_ostream &OS) const {
return OwningObject->printSymbolName(OS, SymbolPimpl);
}
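The new symbols() range turns the symbol_begin/symbol_end pair into a range-for. A short sketch (printAll is illustrative):

  #include "llvm/Object/SymbolicFile.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: print every symbol of any SymbolicFile via the new range.
  static void printAll(const object::SymbolicFile &SF) {
    for (const object::BasicSymbolRef &Sym : SF.symbols()) {
      Sym.printName(outs()); // returns std::error_code; ignored in this sketch
      outs() << "\n";
    }
  }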
diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h
index ab40a1a..d46b0e8 100644
--- a/include/llvm/Option/ArgList.h
+++ b/include/llvm/Option/ArgList.h
@@ -150,6 +150,12 @@ public:
return arg_iterator(Args.end(), *this);
}
+ iterator_range<arg_iterator> filtered(OptSpecifier Id0 = 0U,
+ OptSpecifier Id1 = 0U,
+ OptSpecifier Id2 = 0U) const {
+ return make_range(filtered_begin(Id0, Id1, Id2), filtered_end());
+ }
+
/// @}
/// @name Arg Removal
/// @{
@@ -328,6 +334,7 @@ public:
unsigned MakeIndex(StringRef String0) const;
unsigned MakeIndex(StringRef String0, StringRef String1) const;
+ using ArgList::MakeArgString;
const char *MakeArgString(StringRef Str) const override;
/// @}
@@ -365,6 +372,7 @@ public:
/// (to be freed).
void AddSynthesizedArg(Arg *A);
+ using ArgList::MakeArgString;
const char *MakeArgString(StringRef Str) const override;
/// AddFlagArg - Construct a new FlagArg for the given option \p Id and
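filtered() wraps the existing filtered_begin/filtered_end pair so option scans become range-fors. A sketch, with the option id passed in rather than taken from a real tablegen'd option table:

  #include "llvm/Option/Arg.h"
  #include "llvm/Option/ArgList.h"

  // Sketch: claim every occurrence of one option (OptExample: placeholder id).
  static void claimAll(const llvm::opt::ArgList &Args, unsigned OptExample) {
    for (llvm::opt::Arg *A : Args.filtered(OptExample))
      A->claim();
  }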
diff --git a/include/llvm/PassInfo.h b/include/llvm/PassInfo.h
new file mode 100644
index 0000000..d53daf1
--- /dev/null
+++ b/include/llvm/PassInfo.h
@@ -0,0 +1,147 @@
+//===- llvm/PassInfo.h - Pass Info class ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines and implements the PassInfo class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_PASSINFO_H
+#define LLVM_PASSINFO_H
+
+#include <cassert>
+#include <vector>
+
+namespace llvm {
+
+class Pass;
+class TargetMachine;
+
+//===---------------------------------------------------------------------------
+/// PassInfo class - An instance of this class exists for every pass known by
+/// the system, and can be obtained from a live Pass by calling its
+/// getPassInfo() method. These objects are set up by the RegisterPass<>
+/// template.
+///
+class PassInfo {
+public:
+ typedef Pass* (*NormalCtor_t)();
+ typedef Pass *(*TargetMachineCtor_t)(TargetMachine *);
+
+private:
+ const char *const PassName; // Nice name for Pass
+ const char *const PassArgument; // Command Line argument to run this pass
+ const void *PassID;
+ const bool IsCFGOnlyPass; // Pass only looks at the CFG.
+ const bool IsAnalysis; // True if an analysis pass.
+ const bool IsAnalysisGroup; // True if an analysis group.
+ std::vector<const PassInfo*> ItfImpl;// Interfaces implemented by this pass
+
+ NormalCtor_t NormalCtor;
+ TargetMachineCtor_t TargetMachineCtor;
+
+public:
+ /// PassInfo ctor - Do not call this directly, this should only be invoked
+ /// through RegisterPass.
+ PassInfo(const char *name, const char *arg, const void *pi,
+ NormalCtor_t normal, bool isCFGOnly, bool is_analysis,
+ TargetMachineCtor_t machine = nullptr)
+ : PassName(name), PassArgument(arg), PassID(pi),
+ IsCFGOnlyPass(isCFGOnly),
+ IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal),
+ TargetMachineCtor(machine) {}
+ /// PassInfo ctor - Do not call this directly, this should only be invoked
+ /// through RegisterPass. This version is for use by analysis groups; it
+ /// does not auto-register the pass.
+ PassInfo(const char *name, const void *pi)
+ : PassName(name), PassArgument(""), PassID(pi),
+ IsCFGOnlyPass(false),
+ IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr),
+ TargetMachineCtor(nullptr) {}
+
+ /// getPassName - Return the friendly name for the pass; never returns null
+ ///
+ const char *getPassName() const { return PassName; }
+
+ /// getPassArgument - Return the command line option that may be passed to
+ /// 'opt' that will cause this pass to be run. This will return null if there
+ /// is no argument.
+ ///
+ const char *getPassArgument() const { return PassArgument; }
+
+ /// getTypeInfo - Return the id object for the pass...
+ /// TODO : Rename
+ const void *getTypeInfo() const { return PassID; }
+
+ /// Return true if this PassID implements the specified ID pointer.
+ bool isPassID(const void *IDPtr) const {
+ return PassID == IDPtr;
+ }
+
+ /// isAnalysisGroup - Return true if this is an analysis group, not a normal
+ /// pass.
+ ///
+ bool isAnalysisGroup() const { return IsAnalysisGroup; }
+ bool isAnalysis() const { return IsAnalysis; }
+
+ /// isCFGOnlyPass - return true if this pass only looks at the CFG for the
+ /// function.
+ bool isCFGOnlyPass() const { return IsCFGOnlyPass; }
+
+ /// getNormalCtor - Return a pointer to a function, that when called, creates
+ /// an instance of the pass and returns it. This pointer may be null if there
+ /// is no default constructor for the pass.
+ ///
+ NormalCtor_t getNormalCtor() const {
+ return NormalCtor;
+ }
+ void setNormalCtor(NormalCtor_t Ctor) {
+ NormalCtor = Ctor;
+ }
+
+ /// getTargetMachineCtor - Return a pointer to a function, that when called
+ /// with a TargetMachine, creates an instance of the pass and returns it.
+ /// This pointer may be null if there is no constructor with a TargetMachine
+ /// for the pass.
+ ///
+ TargetMachineCtor_t getTargetMachineCtor() const { return TargetMachineCtor; }
+ void setTargetMachineCtor(TargetMachineCtor_t Ctor) {
+ TargetMachineCtor = Ctor;
+ }
+
+ /// createPass() - Use this method to create an instance of this pass.
+ Pass *createPass() const {
+ assert((!isAnalysisGroup() || NormalCtor) &&
+ "No default implementation found for analysis group!");
+ assert(NormalCtor &&
+ "Cannot call createPass on PassInfo without default ctor!");
+ return NormalCtor();
+ }
+
+ /// addInterfaceImplemented - This method is called when this pass is
+ /// registered as a member of an analysis group with the RegisterAnalysisGroup
+ /// template.
+ ///
+ void addInterfaceImplemented(const PassInfo *ItfPI) {
+ ItfImpl.push_back(ItfPI);
+ }
+
+ /// getInterfacesImplemented - Return a list of all of the analysis group
+ /// interfaces implemented by this pass.
+ ///
+ const std::vector<const PassInfo*> &getInterfacesImplemented() const {
+ return ItfImpl;
+ }
+
+private:
+ void operator=(const PassInfo &) LLVM_DELETED_FUNCTION;
+ PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION;
+};
+
+}
+
+#endif
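For context (not part of this patch): PassInfo objects are normally manufactured by the RegisterPass<> template rather than constructed by hand. A minimal legacy-style registration sketch (DemoPass is hypothetical):

  #include "llvm/Pass.h"
  using namespace llvm;

  namespace {
  struct DemoPass : FunctionPass {
    static char ID;
    DemoPass() : FunctionPass(ID) {}
    bool runOnFunction(Function &) override { return false; }
  };
  }
  char DemoPass::ID = 0;
  // RegisterPass<> builds the PassInfo and hands it to the PassRegistry.
  static RegisterPass<DemoPass> X("demopass", "Demo pass registration");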
diff --git a/include/llvm/PassRegistry.h b/include/llvm/PassRegistry.h
index 7f2a014..1558c51 100644
--- a/include/llvm/PassRegistry.h
+++ b/include/llvm/PassRegistry.h
@@ -18,8 +18,14 @@
#define LLVM_PASSREGISTRY_H
#include "llvm-c/Core.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/PassInfo.h"
#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/RWMutex.h"
+#include <vector>
namespace llvm {
@@ -33,11 +39,26 @@ struct PassRegistrationListener;
/// threads simultaneously, you will need to use a separate PassRegistry on
/// each thread.
class PassRegistry {
- mutable void *pImpl;
- void *getImpl() const;
+ mutable sys::SmartRWMutex<true> Lock;
+
+ /// PassInfoMap - Keep track of the PassInfo object for each registered pass.
+ typedef DenseMap<const void*, const PassInfo*> MapType;
+ MapType PassInfoMap;
+
+ typedef StringMap<const PassInfo*> StringMapType;
+ StringMapType PassInfoStringMap;
+
+ /// AnalysisGroupInfo - Keep track of information for each analysis group.
+ struct AnalysisGroupInfo {
+ SmallPtrSet<const PassInfo *, 8> Implementations;
+ };
+ DenseMap<const PassInfo*, AnalysisGroupInfo> AnalysisGroupInfoMap;
+
+ std::vector<std::unique_ptr<const PassInfo>> ToFree;
+ std::vector<PassRegistrationListener*> Listeners;
public:
- PassRegistry() : pImpl(nullptr) { }
+ PassRegistry() { }
~PassRegistry();
/// getPassRegistry - Access the global registry object, which is
diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index 8efb45f..449bc92 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h
@@ -23,6 +23,7 @@
#include "Pass.h"
#include "llvm/InitializePasses.h"
+#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/Atomic.h"
#include "llvm/Support/Valgrind.h"
@@ -31,120 +32,6 @@
namespace llvm {
class TargetMachine;
-//===---------------------------------------------------------------------------
-/// PassInfo class - An instance of this class exists for every pass known by
-/// the system, and can be obtained from a live Pass by calling its
-/// getPassInfo() method. These objects are set up by the RegisterPass<>
-/// template, defined below.
-///
-class PassInfo {
-public:
- typedef Pass* (*NormalCtor_t)();
- typedef Pass *(*TargetMachineCtor_t)(TargetMachine *);
-
-private:
- const char *const PassName; // Nice name for Pass
- const char *const PassArgument; // Command Line argument to run this pass
- const void *PassID;
- const bool IsCFGOnlyPass; // Pass only looks at the CFG.
- const bool IsAnalysis; // True if an analysis pass.
- const bool IsAnalysisGroup; // True if an analysis group.
- std::vector<const PassInfo*> ItfImpl;// Interfaces implemented by this pass
-
- NormalCtor_t NormalCtor;
- TargetMachineCtor_t TargetMachineCtor;
-
-public:
- /// PassInfo ctor - Do not call this directly, this should only be invoked
- /// through RegisterPass.
- PassInfo(const char *name, const char *arg, const void *pi,
- NormalCtor_t normal, bool isCFGOnly, bool is_analysis,
- TargetMachineCtor_t machine = nullptr)
- : PassName(name), PassArgument(arg), PassID(pi),
- IsCFGOnlyPass(isCFGOnly),
- IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal),
- TargetMachineCtor(machine) {}
- /// PassInfo ctor - Do not call this directly, this should only be invoked
- /// through RegisterPass. This version is for use by analysis groups; it
- /// does not auto-register the pass.
- PassInfo(const char *name, const void *pi)
- : PassName(name), PassArgument(""), PassID(pi),
- IsCFGOnlyPass(false),
- IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr),
- TargetMachineCtor(nullptr) {}
-
- /// getPassName - Return the friendly name for the pass, never returns null
- ///
- const char *getPassName() const { return PassName; }
-
- /// getPassArgument - Return the command line option that may be passed to
- /// 'opt' that will cause this pass to be run. This will return null if there
- /// is no argument.
- ///
- const char *getPassArgument() const { return PassArgument; }
-
- /// getTypeInfo - Return the id object for the pass...
- /// TODO : Rename
- const void *getTypeInfo() const { return PassID; }
-
- /// Return true if this PassID implements the specified ID pointer.
- bool isPassID(const void *IDPtr) const {
- return PassID == IDPtr;
- }
-
- /// isAnalysisGroup - Return true if this is an analysis group, not a normal
- /// pass.
- ///
- bool isAnalysisGroup() const { return IsAnalysisGroup; }
- bool isAnalysis() const { return IsAnalysis; }
-
- /// isCFGOnlyPass - return true if this pass only looks at the CFG for the
- /// function.
- bool isCFGOnlyPass() const { return IsCFGOnlyPass; }
-
- /// getNormalCtor - Return a pointer to a function, that when called, creates
- /// an instance of the pass and returns it. This pointer may be null if there
- /// is no default constructor for the pass.
- ///
- NormalCtor_t getNormalCtor() const {
- return NormalCtor;
- }
- void setNormalCtor(NormalCtor_t Ctor) {
- NormalCtor = Ctor;
- }
-
- /// getTargetMachineCtor - Return a pointer to a function, that when called
- /// with a TargetMachine, creates an instance of the pass and returns it.
- /// This pointer may be null if there is no constructor with a TargetMachine
- /// for the pass.
- ///
- TargetMachineCtor_t getTargetMachineCtor() const { return TargetMachineCtor; }
- void setTargetMachineCtor(TargetMachineCtor_t Ctor) {
- TargetMachineCtor = Ctor;
- }
-
- /// createPass() - Use this method to create an instance of this pass.
- Pass *createPass() const;
-
- /// addInterfaceImplemented - This method is called when this pass is
- /// registered as a member of an analysis group with the RegisterAnalysisGroup
- /// template.
- ///
- void addInterfaceImplemented(const PassInfo *ItfPI) {
- ItfImpl.push_back(ItfPI);
- }
-
- /// getInterfacesImplemented - Return a list of all of the analysis group
- /// interfaces implemented by this pass.
- ///
- const std::vector<const PassInfo*> &getInterfacesImplemented() const {
- return ItfImpl;
- }
-
-private:
- void operator=(const PassInfo &) LLVM_DELETED_FUNCTION;
- PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION;
-};
#define CALL_ONCE_INITIALIZATION(function) \
static volatile sys::cas_flag initialized = 0; \
@@ -325,19 +212,12 @@ struct RegisterAnalysisGroup : public RegisterAGBase {
/// clients that are interested in which passes get registered and unregistered
/// at runtime (which can be because of the RegisterPass constructors being run
/// as the program starts up, or because a shared object just got
-/// loaded). Deriving from the PassRegistrationListener class automatically
-/// registers your object to receive callbacks indicating when passes are loaded
-/// and removed.
+/// loaded).
///
struct PassRegistrationListener {
- /// PassRegistrationListener ctor - Add the current object to the list of
- /// PassRegistrationListeners...
- PassRegistrationListener();
-
- /// dtor - Remove object from list of listeners...
- ///
- virtual ~PassRegistrationListener();
+ PassRegistrationListener() {}
+ virtual ~PassRegistrationListener() {}
/// Callback functions - These functions are invoked whenever a pass is loaded
/// or removed from the current executable.
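Since the base class no longer self-registers, listeners must now be attached explicitly. A hedged sketch; addRegistrationListener and the passRegistered callback are the PassRegistry/listener hooks this code assumes:

  #include "llvm/PassRegistry.h"
  #include "llvm/PassSupport.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Sketch: observe registrations now that the auto-registering ctor is gone.
  struct Tracer : PassRegistrationListener {
    void passRegistered(const PassInfo *PI) override {
      errs() << "registered: " << PI->getPassName() << "\n";
    }
  };
  static Tracer T;
  // During startup: PassRegistry::getPassRegistry()->addRegistrationListener(&T);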
diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h
index 8457678..eafb768 100644
--- a/include/llvm/ProfileData/InstrProf.h
+++ b/include/llvm/ProfileData/InstrProf.h
@@ -16,14 +16,12 @@
#ifndef LLVM_PROFILEDATA_INSTRPROF_H_
#define LLVM_PROFILEDATA_INSTRPROF_H_
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
+const std::error_category &instrprof_category();
-const error_category &instrprof_category();
-
-struct instrprof_error {
- enum ErrorType {
+enum class instrprof_error {
success = 0,
eof,
bad_magic,
@@ -37,21 +35,17 @@ struct instrprof_error {
hash_mismatch,
count_mismatch,
counter_overflow
- };
- ErrorType V;
-
- instrprof_error(ErrorType V) : V(V) {}
- operator ErrorType() const { return V; }
};
-inline error_code make_error_code(instrprof_error E) {
- return error_code(static_cast<int>(E), instrprof_category());
+inline std::error_code make_error_code(instrprof_error E) {
+ return std::error_code(static_cast<int>(E), instrprof_category());
}
-template <> struct is_error_code_enum<instrprof_error> : std::true_type {};
-template <> struct is_error_code_enum<instrprof_error::ErrorType>
- : std::true_type {};
-
} // end namespace llvm
+namespace std {
+template <>
+struct is_error_code_enum<llvm::instrprof_error> : std::true_type {};
+}
+
#endif // LLVM_PROFILEDATA_INSTRPROF_H_
diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index 3e18c76..7a5a71d 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -60,28 +60,29 @@ public:
/// Base class and interface for reading profiling data of any known instrprof
/// format. Provides an iterator over InstrProfRecords.
class InstrProfReader {
- error_code LastError;
+ std::error_code LastError;
+
public:
InstrProfReader() : LastError(instrprof_error::success) {}
virtual ~InstrProfReader() {}
/// Read the header. Required before reading first record.
- virtual error_code readHeader() = 0;
+ virtual std::error_code readHeader() = 0;
/// Read a single record.
- virtual error_code readNextRecord(InstrProfRecord &Record) = 0;
+ virtual std::error_code readNextRecord(InstrProfRecord &Record) = 0;
/// Iterator over profile data.
InstrProfIterator begin() { return InstrProfIterator(this); }
InstrProfIterator end() { return InstrProfIterator(); }
protected:
- /// Set the current error_code and return same.
- error_code error(error_code EC) {
+ /// Set the current std::error_code and return same.
+ std::error_code error(std::error_code EC) {
LastError = EC;
return EC;
}
/// Clear the current error code and return a successful one.
- error_code success() { return error(instrprof_error::success); }
+ std::error_code success() { return error(instrprof_error::success); }
public:
/// Return true if the reader has finished reading the profile data.
@@ -89,12 +90,12 @@ public:
/// Return true if the reader encountered an error reading profiling data.
bool hasError() { return LastError && !isEOF(); }
/// Get the current error code.
- error_code getError() { return LastError; }
+ std::error_code getError() { return LastError; }
/// Factory method to create an appropriately typed reader for the given
/// instrprof file.
- static error_code create(std::string Path,
- std::unique_ptr<InstrProfReader> &Result);
+ static std::error_code create(std::string Path,
+ std::unique_ptr<InstrProfReader> &Result);
};
/// Reader for the simple text based instrprof format.
@@ -122,9 +123,9 @@ public:
: DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, '#') {}
/// Read the header.
- error_code readHeader() override { return success(); }
+ std::error_code readHeader() override { return success(); }
/// Read a single record.
- error_code readNextRecord(InstrProfRecord &Record) override;
+ std::error_code readNextRecord(InstrProfRecord &Record) override;
};
/// Reader for the raw instrprof binary format from runtime.
@@ -167,23 +168,23 @@ private:
const char *NamesStart;
const char *ProfileEnd;
- RawInstrProfReader(const TextInstrProfReader &) LLVM_DELETED_FUNCTION;
- RawInstrProfReader &operator=(const TextInstrProfReader &)
+ RawInstrProfReader(const RawInstrProfReader &) LLVM_DELETED_FUNCTION;
+ RawInstrProfReader &operator=(const RawInstrProfReader &)
LLVM_DELETED_FUNCTION;
public:
RawInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
: DataBuffer(std::move(DataBuffer)) { }
static bool hasFormat(const MemoryBuffer &DataBuffer);
- error_code readHeader() override;
- error_code readNextRecord(InstrProfRecord &Record) override;
+ std::error_code readHeader() override;
+ std::error_code readNextRecord(InstrProfRecord &Record) override;
private:
- error_code readNextHeader(const char *CurrentPos);
- error_code readHeader(const RawHeader &Header);
+ std::error_code readNextHeader(const char *CurrentPos);
+ std::error_code readHeader(const RawHeader &Header);
template <class IntT>
IntT swap(IntT Int) const {
- return ShouldSwapBytes ? sys::SwapByteOrder(Int) : Int;
+ return ShouldSwapBytes ? sys::getSwappedBytes(Int) : Int;
}
const uint64_t *getCounter(IntPtrT CounterPtr) const {
ptrdiff_t Offset = (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t);
@@ -281,19 +282,19 @@ public:
static bool hasFormat(const MemoryBuffer &DataBuffer);
/// Read the file header.
- error_code readHeader() override;
+ std::error_code readHeader() override;
/// Read a single record.
- error_code readNextRecord(InstrProfRecord &Record) override;
+ std::error_code readNextRecord(InstrProfRecord &Record) override;
/// Fill Counts with the profile data for the given function name.
- error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
- std::vector<uint64_t> &Counts);
+ std::error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
+ std::vector<uint64_t> &Counts);
/// Return the maximum of all known function counts.
uint64_t getMaximumFunctionCount() { return MaxFunctionCount; }
/// Factory method to create an indexed reader.
- static error_code create(std::string Path,
- std::unique_ptr<IndexedInstrProfReader> &Result);
+ static std::error_code
+ create(std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result);
};
} // end namespace llvm
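
A hedged usage sketch of the reader protocol above, using only APIs visible in this hunk (create(), range iteration, hasError(), getError()); the profile path comes from the caller, and field access on InstrProfRecord is deliberately omitted:

#include "llvm/ProfileData/InstrProfReader.h"
#include <memory>
#include <string>
#include <system_error>

static std::error_code countRecords(std::string Path, unsigned &NumRecords) {
  std::unique_ptr<llvm::InstrProfReader> Reader;
  if (std::error_code EC = llvm::InstrProfReader::create(Path, Reader))
    return EC; // could not open or recognize the profile
  NumRecords = 0;
  for (const auto &Record : *Reader) {
    (void)Record; // a real client would inspect the record here
    ++NumRecords;
  }
  // Iteration stops on EOF or on a real error; distinguish the two.
  return Reader->hasError() ? Reader->getError() : std::error_code();
}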
diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h
index fa37bf1..6e68bee 100644
--- a/include/llvm/ProfileData/InstrProfWriter.h
+++ b/include/llvm/ProfileData/InstrProfWriter.h
@@ -38,8 +38,9 @@ public:
/// Add function counts for the given function. If there are already counts
/// for this function and the hash and number of counts match, each counter is
/// summed.
- error_code addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash,
- ArrayRef<uint64_t> Counters);
+ std::error_code addFunctionCounts(StringRef FunctionName,
+ uint64_t FunctionHash,
+ ArrayRef<uint64_t> Counters);
/// Ensure that all data is written to disk.
void write(raw_fd_ostream &OS);
};
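
And the matching writer-side sketch (assumes this header plus llvm/Support/raw_ostream.h; the function name, hash, and counter values are hypothetical):

#include "llvm/ProfileData/InstrProfWriter.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <system_error>

static std::error_code emitProfile(llvm::raw_fd_ostream &OS) {
  llvm::InstrProfWriter Writer;
  uint64_t Counters[] = {1, 0, 42}; // hypothetical per-region counts
  if (std::error_code EC = Writer.addFunctionCounts(
          "main", /*FunctionHash=*/0x1234, Counters))
    return EC; // e.g. hash or count mismatch with an earlier record
  Writer.write(OS); // nothing is persisted until this call
  return std::error_code();
}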
diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h
index 1631200..f63e0a6 100644
--- a/include/llvm/Support/ARMBuildAttributes.h
+++ b/include/llvm/Support/ARMBuildAttributes.h
@@ -159,6 +159,11 @@ enum {
AddressDirect = 1, // Address imported data directly
AddressGOT = 2, // Address imported data indirectly (via GOT)
+ // Tag_ABI_PCS_wchar_t, (=18), uleb128
+ WCharProhibited = 0, // wchar_t is not used
+ WCharWidth2Bytes = 2, // sizeof(wchar_t) == 2
+ WCharWidth4Bytes = 4, // sizeof(wchar_t) == 4
+
// Tag_ABI_FP_denormal, (=20), uleb128
PreserveFPSign = 2, // sign when flushed-to-zero is preserved
@@ -166,6 +171,16 @@ enum {
AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI])
AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings
+ // Tag_ABI_enum_size, (=26), uleb128
+ EnumProhibited = 0, // The user prohibited the use of enums when building
+ // this entity.
+ EnumSmallest = 1, // Enum is smallest container big enough to hold all
+ // values.
+ Enum32Bit = 2, // Enum is at least 32 bits.
+ Enum32BitABI = 3, // Every enumeration visible across an ABI-complying
+ // interface contains a value needing 32 bits to encode
+ // it; other enums can be containerized.
+
// Tag_ABI_HardFP_use, (=27), uleb128
HardFPImplied = 0, // FP use should be implied by Tag_FP_arch
HardFPSinglePrecision = 1, // Single-precision only
diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h
new file mode 100644
index 0000000..78deb8d
--- /dev/null
+++ b/include/llvm/Support/ARMWinEH.h
@@ -0,0 +1,384 @@
+//===-- llvm/Support/ARMWinEH.h - Windows on ARM EH Constants ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_WINARMEH_H
+#define LLVM_SUPPORT_WINARMEH_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm {
+namespace ARM {
+namespace WinEH {
+enum class RuntimeFunctionFlag {
+ RFF_Unpacked, /// unpacked entry
+ RFF_Packed, /// packed entry
+ RFF_PackedFragment, /// packed entry representing a fragment
+ RFF_Reserved, /// reserved
+};
+
+enum class ReturnType {
+ RT_POP, /// return via pop {pc} (L flag must be set)
+ RT_B, /// 16-bit branch
+ RT_BW, /// 32-bit branch
+ RT_NoEpilogue, /// no epilogue (fragment)
+};
+
+/// RuntimeFunction - An entry in the table of procedure data (.pdata)
+///
+/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +---------------------------------------------------------------+
+/// | Function Start RVA |
+/// +-------------------+-+-+-+-----+-+---+---------------------+---+
+/// | Stack Adjust |C|L|R| Reg |H|Ret| Function Length |Flg|
+/// +-------------------+-+-+-+-----+-+---+---------------------+---+
+///
+/// Flag : 2-bit field with the following meanings:
+/// - 00 = packed unwind data not used; remaining bits point to .xdata record
+/// - 01 = packed unwind data
+/// - 10 = packed unwind data, function assumed to have no prologue; useful
+/// for function fragments that are discontiguous with the start of the
+/// function
+/// - 11 = reserved
+/// Function Length : 11-bit field providing the length of the entire function
+/// in bytes, divided by 2; if the function is greater than
+/// 4KB, a full .xdata record must be used instead
+/// Ret : 2-bit field indicating how the function returns
+/// - 00 = return via pop {pc} (the L bit must be set)
+/// - 01 = return via 16-bit branch
+/// - 10 = return via 32-bit branch
+/// - 11 = no epilogue; useful for function fragments that may only contain a
+/// prologue but the epilogue is elsewhere
+/// H : 1-bit flag indicating whether the function "homes" the integer parameter
+/// registers (r0-r3), allocating 16 bytes on the stack
+/// Reg : 3-bit field indicating the index of the last saved non-volatile
+/// register. If the R bit is set to 0, then only integer registers are
+/// saved (r4-rN, where N is 4 + Reg). If the R bit is set to 1, then
+/// only floating-point registers are being saved (d8-dN, where N is
+/// 8 + Reg). The special case of the R bit being set to 1 and Reg equal
+/// to 7 indicates that no registers are saved.
+/// R : 1-bit flag indicating whether the non-volatile registers are integer or
+/// floating-point. 0 indicates integer, 1 indicates floating-point. The
+/// special case of the R-flag being set and Reg being set to 7 indicates
+/// that no non-volatile registers are saved.
+/// L : 1-bit flag indicating whether the function saves/restores the link
+/// register (LR)
+/// C : 1-bit flag indicating whether the function includes extra instructions
+/// to setup a frame chain for fast walking. If this flag is set, r11 is
+/// implicitly added to the list of saved non-volatile integer registers.
+/// Stack Adjust : 10-bit field indicating the number of bytes of stack that are
+/// allocated for this function. Only values between 0x000 and
+/// 0x3f3 can be directly encoded. If the value is 0x3f4 or
+/// greater, then the low 4 bits have special meaning as follows:
+/// - Bit 0-1
+/// indicate the number of words of adjustment (1-4), minus 1
+/// - Bit 2
+/// indicates if the prologue combined adjustment into push
+/// - Bit 3
+/// indicates if the epilogue combined adjustment into pop
+///
+/// RESTRICTIONS:
+/// - IF C is SET:
+/// + L flag must be set since frame chaining requires r11 and lr
+/// + r11 must NOT be included in the set of registers described by Reg
+/// - IF Ret is 0:
+/// + L flag must be set
+
+// NOTE: RuntimeFunction is meant to be a simple class that provides raw access
+// to all fields in the structure. The accessor methods reflect the names of
+// the bitfields that they correspond to. Although some obvious simplifications
+// are possible via merging of methods, it would prevent the use of this class
+// to fully inspect the contents of the data structure, which is particularly
+// useful for tools such as llvm-readobj and for testing.
+
+class RuntimeFunction {
+public:
+ const support::ulittle32_t BeginAddress;
+ const support::ulittle32_t UnwindData;
+
+ RuntimeFunction(const support::ulittle32_t *Data)
+ : BeginAddress(Data[0]), UnwindData(Data[1]) {}
+
+ RuntimeFunction(const support::ulittle32_t BeginAddress,
+ const support::ulittle32_t UnwindData)
+ : BeginAddress(BeginAddress), UnwindData(UnwindData) {}
+
+ RuntimeFunctionFlag Flag() const {
+ return RuntimeFunctionFlag(UnwindData & 0x3);
+ }
+
+ uint32_t ExceptionInformationRVA() const {
+ assert(Flag() == RuntimeFunctionFlag::RFF_Unpacked &&
+ "unpacked form required for this operation");
+ return (UnwindData & ~0x3);
+ }
+
+ uint32_t PackedUnwindData() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return (UnwindData & ~0x3);
+ }
+ uint32_t FunctionLength() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return (((UnwindData & 0x00001ffc) >> 2) << 1);
+ }
+ ReturnType Ret() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ assert(((UnwindData & 0x00006000) || L()) && "L must be set to 1");
+ return ReturnType((UnwindData & 0x00006000) >> 13);
+ }
+ bool H() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return ((UnwindData & 0x00008000) >> 15);
+ }
+ uint8_t Reg() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return ((UnwindData & 0x00070000) >> 16);
+ }
+ bool R() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return ((UnwindData & 0x00080000) >> 19);
+ }
+ bool L() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return ((UnwindData & 0x00100000) >> 20);
+ }
+ bool C() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ assert(((~UnwindData & 0x00200000) || L()) &&
+ "L flag must be set, chaining requires r11 and LR");
+ assert(((~UnwindData & 0x00200000) || (Reg() < 7) || R()) &&
+ "r11 must not be included in Reg; C implies r11");
+ return ((UnwindData & 0x00200000) >> 21);
+ }
+ uint16_t StackAdjust() const {
+ assert((Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "packed form required for this operation");
+ return ((UnwindData & 0xffc00000) >> 22);
+ }
+};
+
+/// PrologueFolding - pseudo-flag derived from Stack Adjust indicating that the
+/// prologue has stack adjustment combined into the push
+inline bool PrologueFolding(const RuntimeFunction &RF) {
+ return RF.StackAdjust() >= 0x3f4 && (RF.StackAdjust() & 0x4);
+}
+/// Epilogue - pseudo-flag derived from Stack Adjust indicating that the
+/// epilogue has stack adjustment combined into the pop
+inline bool EpilogueFolding(const RuntimeFunction &RF) {
+ return RF.StackAdjust() >= 0x3f4 && (RF.StackAdjust() & 0x8);
+}
+/// StackAdjustment - calculated stack adjustment in words. The stack
+/// adjustment should be determined via this function to account for the
+/// special handling of the encoding when the value is >= 0x3f4.
+inline uint16_t StackAdjustment(const RuntimeFunction &RF) {
+ uint16_t Adjustment = RF.StackAdjust();
+ if (Adjustment >= 0x3f4)
+ return (Adjustment & 0x3) + 1;
+ return Adjustment;
+}
+
+/// SavedRegisterMask - Utility function to calculate the set of saved general
+/// purpose (r0-r15) and VFP (d0-d31) registers.
+std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
+
+/// ExceptionDataRecord - An entry in the table of exception data (.xdata)
+///
+/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +-------+---------+-+-+-+---+-----------------------------------+
+/// | C Wrd | Epi Cnt |F|E|X|Ver| Function Length |
+/// +-------+--------+'-'-'-'---'---+-------------------------------+
+/// | Reserved |Ex. Code Words| (Extended Epilogue Count) |
+/// +-------+--------+--------------+-------------------------------+
+///
+/// Function Length : 18-bit field indicating the total length of the function
+/// in bytes divided by 2. If a function is larger than
+/// 512KB, then multiple pdata and xdata records must be used.
+/// Vers : 2-bit field describing the version of the remaining structure. Only
+/// version 0 is currently defined (values 1-3 are not permitted).
+/// X : 1-bit field indicating the presence of exception data
+/// E : 1-bit field indicating that the single epilogue is packed into the
+/// header
+/// F : 1-bit field indicating that the record describes a function fragment
+/// (implies that no prologue is present, and prologue processing should be
+/// skipped)
+/// Epilogue Count : 5-bit field that differs in meaning based on the E field.
+///
+/// If E is set, then this field specifies the index of the
+/// first unwind code describing the (only) epilogue.
+///
+/// Otherwise, this field indicates the number of epilogue
+/// scopes. If more than 31 scopes exist, then this field and
+/// the Code Words field must both be set to 0 to indicate that
+/// an extension word is required.
+/// Code Words : 4-bit field that specifies the number of 32-bit words needed to
+/// contain all the unwind codes. If more than 15 words (63 code
+/// bytes) are required, then this field and the Epilogue Count
+/// field must both be set to 0 to indicate that an extension word
+/// is required.
+/// Extended Epilogue Count, Extended Code Words :
+/// Valid only if Epilogue Count and Code Words are both
+/// set to 0. Provides an 8-bit extended code word
+/// count and a 16-bit epilogue count.
+///
+/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +----------------+------+---+---+-------------------------------+
+/// | Ep Start Idx | Cond |Res| Epilogue Start Offset |
+/// +----------------+------+---+-----------------------------------+
+///
+/// If the E bit is unset in the header, the header is followed by a series of
+/// epilogue scopes, which are sorted by their offset.
+///
+/// Epilogue Start Offset: 18-bit field encoding the offset of the epilogue
+/// relative to the start of the function, in bytes divided by two
+/// Res : 2-bit field reserved for future expansion (must be set to 0)
+/// Condition : 4-bit field providing the condition under which the epilogue is
+/// executed. Unconditional epilogues should set this field to 0xe.
+/// Epilogues must be entirely conditional or unconditional, and in
+/// Thumb-2 mode. The epilogue begins with the first instruction
+/// after the IT opcode.
+/// Epilogue Start Index : 8-bit field indicating the byte index of the first
+/// unwind code describing the epilogue
+///
+/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +---------------+---------------+---------------+---------------+
+/// | Unwind Code 3 | Unwind Code 2 | Unwind Code 1 | Unwind Code 0 |
+/// +---------------+---------------+---------------+---------------+
+///
+/// The epilogue scopes are followed by the byte code describing the
+/// unwinding. This is padded up to word alignment, and the bytes are stored
+/// in little-endian order.
+///
+/// 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +---------------------------------------------------------------+
+/// | Exception Handler RVA (requires X = 1) |
+/// +---------------------------------------------------------------+
+/// | (possibly followed by data required for exception handler) |
+/// +---------------------------------------------------------------+
+///
+/// If the X bit is set in the header, the unwind byte code is followed by the
+/// exception handler information. This consists of one Exception Handler RVA,
+/// which is the address of the exception handler, followed immediately by the
+/// variable length data associated with the exception handler.
+///
+
+struct EpilogueScope {
+ const support::ulittle32_t ES;
+
+ EpilogueScope(const support::ulittle32_t Data) : ES(Data) {}
+ uint32_t EpilogueStartOffset() const {
+ return (ES & 0x0003ffff);
+ }
+ uint8_t Res() const {
+ return ((ES & 0x000c0000) >> 18);
+ }
+ uint8_t Condition() const {
+ return ((ES & 0x00f00000) >> 20);
+ }
+ uint8_t EpilogueStartIndex() const {
+ return ((ES & 0xff000000) >> 24);
+ }
+};
+
+struct ExceptionDataRecord;
+inline size_t HeaderWords(const ExceptionDataRecord &XR);
+
+struct ExceptionDataRecord {
+ const support::ulittle32_t *Data;
+
+ ExceptionDataRecord(const support::ulittle32_t *Data) : Data(Data) {}
+
+ uint32_t FunctionLength() const {
+ return (Data[0] & 0x0003ffff);
+ }
+
+ uint8_t Vers() const {
+ return (Data[0] & 0x000C0000) >> 18;
+ }
+
+ bool X() const {
+ return ((Data[0] & 0x00100000) >> 20);
+ }
+
+ bool E() const {
+ return ((Data[0] & 0x00200000) >> 21);
+ }
+
+ bool F() const {
+ return ((Data[0] & 0x00400000) >> 22);
+ }
+
+ uint8_t EpilogueCount() const {
+ if (HeaderWords(*this) == 1)
+ return (Data[0] & 0x0f800000) >> 23;
+ return Data[1] & 0x0000ffff;
+ }
+
+ uint8_t CodeWords() const {
+ if (HeaderWords(*this) == 1)
+ return (Data[0] & 0xf0000000) >> 28;
+ return (Data[1] & 0x00ff0000) >> 16;
+ }
+
+ ArrayRef<support::ulittle32_t> EpilogueScopes() const {
+ assert(E() == 0 && "epilogue scopes are only present when the E bit is 0");
+ size_t Offset = HeaderWords(*this);
+ return ArrayRef<support::ulittle32_t>(&Data[Offset], EpilogueCount());
+ }
+
+ ArrayRef<support::ulittle8_t> UnwindByteCode() const {
+ const size_t Offset = HeaderWords(*this)
+ + (E() ? 0 : EpilogueCount());
+ const support::ulittle8_t *ByteCode =
+ reinterpret_cast<const support::ulittle8_t *>(&Data[Offset]);
+ return ArrayRef<support::ulittle8_t>(ByteCode,
+ CodeWords() * sizeof(uint32_t));
+ }
+
+ uint32_t ExceptionHandlerRVA() const {
+ assert(X() && "Exception Handler RVA is only valid if the X bit is set");
+ return Data[HeaderWords(*this) + EpilogueCount() + CodeWords()];
+ }
+
+ uint32_t ExceptionHandlerParameter() const {
+ assert(X() && "Exception Handler RVA is only valid if the X bit is set");
+ return Data[HeaderWords(*this) + EpilogueCount() + CodeWords() + 1];
+ }
+};
+
+inline size_t HeaderWords(const ExceptionDataRecord &XR) {
+ return (XR.Data[0] & 0xff800000) ? 1 : 2;
+}
+}
+}
+}
+
+#endif
+
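To make the packed .pdata layout above concrete, here is a self-contained sketch that decodes one hypothetical UnwindData word with the documented masks (plain bit operations mirroring the accessors in RuntimeFunction; the sample value is invented):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t UnwindData = 0x10500801; // hypothetical packed entry
  assert((UnwindData & 0x3) == 1 && "Flag must say 'packed unwind data'");

  uint32_t FunctionLength = ((UnwindData & 0x00001ffc) >> 2) << 1; // bytes
  uint32_t Ret         = (UnwindData & 0x00006000) >> 13; // 0 = pop {pc}
  bool H               = (UnwindData & 0x00008000) >> 15; // homes r0-r3
  uint32_t Reg         = (UnwindData & 0x00070000) >> 16; // last saved reg
  bool R               = (UnwindData & 0x00080000) >> 19; // 1 = VFP regs
  bool L               = (UnwindData & 0x00100000) >> 20; // saves LR
  bool C               = (UnwindData & 0x00200000) >> 21; // frame chaining
  uint32_t StackAdjust = (UnwindData & 0xffc00000) >> 22; // see note above

  assert((Ret != 0 || L) && "Ret == 0 requires the L bit");
  std::printf("len=%u ret=%u H=%d Reg=%u R=%d L=%d C=%d adjust=%u\n",
              FunctionLength, Ret, H, Reg, R, L, C, StackAdjust);
}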
diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index f0e5c7d..e09ef07 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h
@@ -553,7 +553,8 @@ namespace COFF {
IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE = 0x0040,
/// Code integrity checks are enforced.
IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY = 0x0080,
- IMAGE_DLL_CHARACTERISTICS_NX_COMPAT = 0x0100, ///< Image is NX compatible.
+ ///< Image is NX compatible.
+ IMAGE_DLL_CHARACTERISTICS_NX_COMPAT = 0x0100,
/// Isolation aware, but do not isolate the image.
IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION = 0x0200,
/// Does not use structured exception handling (SEH). No SEH handler may be
@@ -561,7 +562,12 @@ namespace COFF {
IMAGE_DLL_CHARACTERISTICS_NO_SEH = 0x0400,
/// Do not bind the image.
IMAGE_DLL_CHARACTERISTICS_NO_BIND = 0x0800,
- IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER = 0x2000, ///< A WDM driver.
+ ///< Image should execute in an AppContainer.
+ IMAGE_DLL_CHARACTERISTICS_APPCONTAINER = 0x1000,
+ ///< A WDM driver.
+ IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER = 0x2000,
+ ///< Image supports Control Flow Guard.
+ IMAGE_DLL_CHARACTERISTICS_GUARD_CF = 0x4000,
/// Terminal Server aware.
IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE = 0x8000
};
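
The characteristics are a plain bitmask, so the new values compose with the existing ones; a small sketch (assumes this header; the Characteristics value is invented):

#include "llvm/Support/COFF.h"
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t Characteristics = // hypothetical optional-header field
      llvm::COFF::IMAGE_DLL_CHARACTERISTICS_APPCONTAINER |
      llvm::COFF::IMAGE_DLL_CHARACTERISTICS_GUARD_CF;
  if (Characteristics & llvm::COFF::IMAGE_DLL_CHARACTERISTICS_GUARD_CF)
    std::puts("image supports Control Flow Guard");
}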
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 1edcd45..25bf32a 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -61,6 +61,12 @@
#define LLVM_MSC_PREREQ(version) 0
#endif
+#ifndef _MSC_VER
+#define LLVM_NOEXCEPT noexcept
+#else
+#define LLVM_NOEXCEPT
+#endif
+
/// \brief Does the compiler support r-value reference *this?
///
/// Sadly, this is separate from just r-value reference support because GCC
diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h
index 2820366..a184d0d 100644
--- a/include/llvm/Support/ConvertUTF.h
+++ b/include/llvm/Support/ConvertUTF.h
@@ -136,7 +136,19 @@ ConversionResult ConvertUTF8toUTF16 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
-ConversionResult ConvertUTF8toUTF32 (
+/**
+ * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
+ * incomplete code unit sequence, returns \c sourceExhausted.
+ */
+ConversionResult ConvertUTF8toUTF32Partial(
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
+
+/**
+ * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
+ * incomplete code unit sequence, returns \c sourceIllegal.
+ */
+ConversionResult ConvertUTF8toUTF32(
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
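
The difference between the two entry points only shows at a truncated input; a usage sketch for the partial variant (assumes this header; the bytes encode U+20AC and are complete here):

#include "llvm/Support/ConvertUTF.h"
#include <cstdio>

int main() {
  const UTF8 Src[] = {0xE2, 0x82, 0xAC}; // UTF-8 for U+20AC (EURO SIGN)
  const UTF8 *SrcStart = Src;
  UTF32 Dst[4];
  UTF32 *DstStart = Dst;
  // Had Src ended mid-sequence, the Partial variant would return
  // sourceExhausted instead of sourceIllegal.
  ConversionResult CR = ConvertUTF8toUTF32Partial(
      &SrcStart, Src + sizeof(Src), &DstStart, Dst + 4, strictConversion);
  if (CR == conversionOK)
    std::printf("decoded U+%04X\n", (unsigned)Dst[0]);
}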
diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h
index c132373..3869ebd 100644
--- a/include/llvm/Support/CrashRecoveryContext.h
+++ b/include/llvm/Support/CrashRecoveryContext.h
@@ -87,6 +87,9 @@ public:
/// requested stack size).
///
/// See RunSafely() and llvm_execute_on_thread().
+ ///
+ /// On Darwin, if PRIO_DARWIN_BG is set on the calling thread, it will be
+ /// propagated to the new thread as well.
bool RunSafelyOnThread(function_ref<void()>, unsigned RequestedStackSize = 0);
bool RunSafelyOnThread(void (*Fn)(void*), void *UserData,
unsigned RequestedStackSize = 0) {
diff --git a/include/llvm/Support/DataTypes.h.cmake b/include/llvm/Support/DataTypes.h.cmake
index a26070c..1f0c8eb 100644
--- a/include/llvm/Support/DataTypes.h.cmake
+++ b/include/llvm/Support/DataTypes.h.cmake
@@ -37,6 +37,16 @@
#include <math.h>
#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#else
+#error "Compiler must provide an implementation of stdint.h"
+#endif
+
#ifndef _MSC_VER
/* Note that this header's correct operation depends on __STDC_LIMIT_MACROS
@@ -55,14 +65,6 @@
/* Note that <inttypes.h> includes <stdint.h>, if this is a C99 system. */
#include <sys/types.h>
-#ifdef HAVE_INTTYPES_H
-#include <inttypes.h>
-#endif
-
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#endif
-
#ifdef _AIX
#include "llvm/Support/AIXDataTypesFix.h"
#endif
@@ -77,11 +79,6 @@ typedef u_int64_t uint64_t;
#endif
#else /* _MSC_VER */
-/* Visual C++ doesn't provide standard integer headers, but it does provide
- built-in data types. */
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#endif
#include <stdlib.h>
#include <stddef.h>
#include <sys/types.h>
@@ -90,93 +87,21 @@ typedef u_int64_t uint64_t;
#else
#include <math.h>
#endif
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-typedef signed int int32_t;
-typedef unsigned int uint32_t;
-typedef short int16_t;
-typedef unsigned short uint16_t;
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
+
#if defined(_WIN64)
- typedef signed __int64 ssize_t;
+typedef signed __int64 ssize_t;
#else
- typedef signed int ssize_t;
-#endif
-#ifndef INT8_MAX
-# define INT8_MAX 127
-#endif
-#ifndef INT8_MIN
-# define INT8_MIN -128
-#endif
-#ifndef UINT8_MAX
-# define UINT8_MAX 255
-#endif
-#ifndef INT16_MAX
-# define INT16_MAX 32767
-#endif
-#ifndef INT16_MIN
-# define INT16_MIN -32768
-#endif
-#ifndef UINT16_MAX
-# define UINT16_MAX 65535
-#endif
-#ifndef INT32_MAX
-# define INT32_MAX 2147483647
-#endif
-#ifndef INT32_MIN
-/* MSC treats -2147483648 as -(2147483648U). */
-# define INT32_MIN (-INT32_MAX - 1)
-#endif
-#ifndef UINT32_MAX
-# define UINT32_MAX 4294967295U
-#endif
-/* Certain compatibility updates to VC++ introduce the `cstdint'
- * header, which defines the INT*_C macros. On default installs they
- * are absent. */
-#ifndef INT8_C
-# define INT8_C(C) C##i8
-#endif
-#ifndef UINT8_C
-# define UINT8_C(C) C##ui8
-#endif
-#ifndef INT16_C
-# define INT16_C(C) C##i16
-#endif
-#ifndef UINT16_C
-# define UINT16_C(C) C##ui16
-#endif
-#ifndef INT32_C
-# define INT32_C(C) C##i32
-#endif
-#ifndef UINT32_C
-# define UINT32_C(C) C##ui32
-#endif
-#ifndef INT64_C
-# define INT64_C(C) C##i64
-#endif
-#ifndef UINT64_C
-# define UINT64_C(C) C##ui64
-#endif
-
-#ifndef PRId64
-# define PRId64 "I64d"
-#endif
-#ifndef PRIi64
-# define PRIi64 "I64i"
-#endif
-#ifndef PRIo64
-# define PRIo64 "I64o"
-#endif
-#ifndef PRIu64
-# define PRIu64 "I64u"
-#endif
-#ifndef PRIx64
-# define PRIx64 "I64x"
-#endif
-#ifndef PRIX64
-# define PRIX64 "I64X"
-#endif
+typedef signed int ssize_t;
+#endif /* _WIN64 */
+
+#ifndef HAVE_INTTYPES_H
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#endif /* HAVE_INTTYPES_H */
#endif /* _MSC_VER */
diff --git a/include/llvm/Support/DataTypes.h.in b/include/llvm/Support/DataTypes.h.in
index 7fc9b72..09cfcdf 100644
--- a/include/llvm/Support/DataTypes.h.in
+++ b/include/llvm/Support/DataTypes.h.in
@@ -37,6 +37,16 @@
#include <math.h>
#endif
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#else
+#error "Compiler must provide an implementation of stdint.h"
+#endif
+
#ifndef _MSC_VER
/* Note that this header's correct operation depends on __STDC_LIMIT_MACROS
@@ -55,14 +65,6 @@
/* Note that <inttypes.h> includes <stdint.h>, if this is a C99 system. */
#include <sys/types.h>
-#ifdef HAVE_INTTYPES_H
-#include <inttypes.h>
-#endif
-
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#endif
-
#ifdef _AIX
#include "llvm/Support/AIXDataTypesFix.h"
#endif
@@ -77,8 +79,6 @@ typedef u_int64_t uint64_t;
#endif
#else /* _MSC_VER */
-/* Visual C++ doesn't provide standard integer headers, but it does provide
- built-in data types. */
#include <stdlib.h>
#include <stddef.h>
#include <sys/types.h>
@@ -87,94 +87,21 @@ typedef u_int64_t uint64_t;
#else
#include <math.h>
#endif
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-typedef signed int int32_t;
-typedef unsigned int uint32_t;
-typedef short int16_t;
-typedef unsigned short uint16_t;
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
+
#if defined(_WIN64)
- typedef signed __int64 ssize_t;
+typedef signed __int64 ssize_t;
#else
- typedef signed int ssize_t;
-#endif
-
-#ifndef INT8_MAX
-# define INT8_MAX 127
-#endif
-#ifndef INT8_MIN
-# define INT8_MIN -128
-#endif
-#ifndef UINT8_MAX
-# define UINT8_MAX 255
-#endif
-#ifndef INT16_MAX
-# define INT16_MAX 32767
-#endif
-#ifndef INT16_MIN
-# define INT16_MIN -32768
-#endif
-#ifndef UINT16_MAX
-# define UINT16_MAX 65535
-#endif
-#ifndef INT32_MAX
-# define INT32_MAX 2147483647
-#endif
-#ifndef INT32_MIN
-/* MSC treats -2147483648 as -(2147483648U). */
-# define INT32_MIN (-INT32_MAX - 1)
-#endif
-#ifndef UINT32_MAX
-# define UINT32_MAX 4294967295U
-#endif
-/* Certain compatibility updates to VC++ introduce the `cstdint'
- * header, which defines the INT*_C macros. On default installs they
- * are absent. */
-#ifndef INT8_C
-# define INT8_C(C) C##i8
-#endif
-#ifndef UINT8_C
-# define UINT8_C(C) C##ui8
-#endif
-#ifndef INT16_C
-# define INT16_C(C) C##i16
-#endif
-#ifndef UINT16_C
-# define UINT16_C(C) C##ui16
-#endif
-#ifndef INT32_C
-# define INT32_C(C) C##i32
-#endif
-#ifndef UINT32_C
-# define UINT32_C(C) C##ui32
-#endif
-#ifndef INT64_C
-# define INT64_C(C) C##i64
-#endif
-#ifndef UINT64_C
-# define UINT64_C(C) C##ui64
-#endif
-
-#ifndef PRId64
-# define PRId64 "I64d"
-#endif
-#ifndef PRIi64
-# define PRIi64 "I64i"
-#endif
-#ifndef PRIo64
-# define PRIo64 "I64o"
-#endif
-#ifndef PRIu64
-# define PRIu64 "I64u"
-#endif
-#ifndef PRIx64
-# define PRIx64 "I64x"
-#endif
-#ifndef PRIX64
-# define PRIX64 "I64X"
-#endif
+typedef signed int ssize_t;
+#endif /* _WIN64 */
+
+#ifndef HAVE_INTTYPES_H
+#define PRId64 "I64d"
+#define PRIi64 "I64i"
+#define PRIo64 "I64o"
+#define PRIu64 "I64u"
+#define PRIx64 "I64x"
+#define PRIX64 "I64X"
+#endif /* HAVE_INTTYPES_H */
#endif /* _MSC_VER */
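
The net effect of this hunk (and its CMake twin above): the build now requires a real <stdint.h>, and only falls back to the MSVC "I64" length modifiers when <inttypes.h> is absent. A tiny sketch of what this buys client code (plain C++; <cinttypes> supplies PRId64):

#include <cinttypes>
#include <cstdio>

int main() {
  int64_t Value = INT64_C(1) << 40; // fixed-width literal from <cstdint>
  // PRId64 expands to the right length modifier on every platform.
  std::printf("value = %" PRId64 "\n", Value);
}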
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index ca31644..cd9f756 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -57,7 +57,6 @@ enum LLVMConstants : uint32_t {
DW_TAG_user_base = 0x1000, // Recommended base for user tags.
DWARF_VERSION = 4, // Default dwarf version we output.
- DW_CIE_VERSION = 1, // Common frame information version.
DW_PUBTYPES_VERSION = 2, // Section version number for .debug_pubtypes.
DW_PUBNAMES_VERSION = 2, // Section version number for .debug_pubnames.
DW_ARANGES_VERSION = 2 // Section version number for .debug_aranges.
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 0b3e55b..67cc651 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -124,6 +124,8 @@ enum {
};
// Machine architectures
+// See current registered ELF machine architectures at:
+// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html
enum {
EM_NONE = 0, // No machine
EM_M32 = 1, // AT&T WE 32100
@@ -287,7 +289,26 @@ enum {
EM_RL78 = 197, // Renesas RL78 family
EM_VIDEOCORE5 = 198, // Broadcom VideoCore V processor
EM_78KOR = 199, // Renesas 78KOR family
- EM_56800EX = 200 // Freescale 56800EX Digital Signal Controller (DSC)
+ EM_56800EX = 200, // Freescale 56800EX Digital Signal Controller (DSC)
+ EM_BA1 = 201, // Beyond BA1 CPU architecture
+ EM_BA2 = 202, // Beyond BA2 CPU architecture
+ EM_XCORE = 203, // XMOS xCORE processor family
+ EM_MCHP_PIC = 204, // Microchip 8-bit PIC(r) family
+ EM_INTEL205 = 205, // Reserved by Intel
+ EM_INTEL206 = 206, // Reserved by Intel
+ EM_INTEL207 = 207, // Reserved by Intel
+ EM_INTEL208 = 208, // Reserved by Intel
+ EM_INTEL209 = 209, // Reserved by Intel
+ EM_KM32 = 210, // KM211 KM32 32-bit processor
+ EM_KMX32 = 211, // KM211 KMX32 32-bit processor
+ EM_KMX16 = 212, // KM211 KMX16 16-bit processor
+ EM_KMX8 = 213, // KM211 KMX8 8-bit processor
+ EM_KVARC = 214, // KM211 KVARC processor
+ EM_CDP = 215, // Paneve CDP architecture family
+ EM_COGE = 216, // Cognitive Smart Memory Processor
+ EM_COOL = 217, // iCelero CoolEngine
+ EM_NORC = 218, // Nanoradio Optimized RISC
+ EM_CSR_KALIMBA = 219 // CSR Kalimba architecture family
};
// Object file classes.
@@ -1278,6 +1299,7 @@ enum : unsigned {
SHT_MIPS_REGINFO = 0x70000006, // Register usage information
SHT_MIPS_OPTIONS = 0x7000000d, // General options
+ SHT_MIPS_ABIFLAGS = 0x7000002a, // ABI information.
SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
SHT_LOUSER = 0x80000000, // Lowest type reserved for applications.
@@ -1595,7 +1617,8 @@ enum {
// MIPS program header types.
PT_MIPS_REGINFO = 0x70000000, // Register usage information.
PT_MIPS_RTPROC = 0x70000001, // Runtime procedure table.
- PT_MIPS_OPTIONS = 0x70000002 // Options segment.
+ PT_MIPS_OPTIONS = 0x70000002, // Options segment.
+ PT_MIPS_ABIFLAGS = 0x70000003 // Abiflags segment.
};
// Segment flag bits.
diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h
index 2c5ab74..455d0fc 100644
--- a/include/llvm/Support/Endian.h
+++ b/include/llvm/Support/Endian.h
@@ -38,7 +38,7 @@ namespace endian {
template<typename value_type, endianness endian>
inline value_type byte_swap(value_type value) {
if (endian != native && sys::IsBigEndianHost != (endian == big))
- return sys::SwapByteOrder(value);
+ sys::swapByteOrder(value);
return value;
}
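
A quick contrast of the two helpers involved in this rename, both of which appear elsewhere in this patch (assumes llvm/Support/SwapByteOrder.h): getSwappedBytes returns the swapped value, while swapByteOrder mutates its argument in place -- hence the dropped return above.

#include "llvm/Support/SwapByteOrder.h"
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t A = 0x11223344;
  uint32_t B = llvm::sys::getSwappedBytes(A); // A is left untouched
  llvm::sys::swapByteOrder(A);                // A is swapped in place
  std::printf("%08x %08x\n", A, B);           // prints 44332211 twice
}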
diff --git a/include/llvm/Support/Errc.h b/include/llvm/Support/Errc.h
new file mode 100644
index 0000000..80bfe2a
--- /dev/null
+++ b/include/llvm/Support/Errc.h
@@ -0,0 +1,86 @@
+//===- llvm/Support/Errc.h - Defines the llvm::errc enum --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// While std::error_code works OK on all platforms we use, there are some
+// problems with std::errc that can be avoided by using our own
+// enumeration:
+//
+// * std::errc is a namespace in some implementations. That means that ADL
+// doesn't work and it is sometimes necessary to write std::make_error_code
+// or in templates:
+// using std::make_error_code;
+// make_error_code(...);
+//
+// with this enum it is safe to always just use make_error_code.
+//
+// * Some implementations define fewer names than others. This header has
+// the intersection of all the ones we support.
+//
+// * std::errc is just marked with is_error_condition_enum. This means that
+// common patterns like AnErrorCode == errc::no_such_file_or_directory take
+// four virtual calls instead of two comparisons.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ERRC_H
+#define LLVM_SUPPORT_ERRC_H
+
+#include <system_error>
+
+namespace llvm {
+enum class errc {
+ argument_list_too_long = int(std::errc::argument_list_too_long),
+ argument_out_of_domain = int(std::errc::argument_out_of_domain),
+ bad_address = int(std::errc::bad_address),
+ bad_file_descriptor = int(std::errc::bad_file_descriptor),
+ broken_pipe = int(std::errc::broken_pipe),
+ device_or_resource_busy = int(std::errc::device_or_resource_busy),
+ directory_not_empty = int(std::errc::directory_not_empty),
+ executable_format_error = int(std::errc::executable_format_error),
+ file_exists = int(std::errc::file_exists),
+ file_too_large = int(std::errc::file_too_large),
+ filename_too_long = int(std::errc::filename_too_long),
+ function_not_supported = int(std::errc::function_not_supported),
+ illegal_byte_sequence = int(std::errc::illegal_byte_sequence),
+ inappropriate_io_control_operation =
+ int(std::errc::inappropriate_io_control_operation),
+ interrupted = int(std::errc::interrupted),
+ invalid_argument = int(std::errc::invalid_argument),
+ invalid_seek = int(std::errc::invalid_seek),
+ io_error = int(std::errc::io_error),
+ is_a_directory = int(std::errc::is_a_directory),
+ no_child_process = int(std::errc::no_child_process),
+ no_lock_available = int(std::errc::no_lock_available),
+ no_space_on_device = int(std::errc::no_space_on_device),
+ no_such_device_or_address = int(std::errc::no_such_device_or_address),
+ no_such_device = int(std::errc::no_such_device),
+ no_such_file_or_directory = int(std::errc::no_such_file_or_directory),
+ no_such_process = int(std::errc::no_such_process),
+ not_a_directory = int(std::errc::not_a_directory),
+ not_enough_memory = int(std::errc::not_enough_memory),
+ operation_not_permitted = int(std::errc::operation_not_permitted),
+ permission_denied = int(std::errc::permission_denied),
+ read_only_file_system = int(std::errc::read_only_file_system),
+ resource_deadlock_would_occur = int(std::errc::resource_deadlock_would_occur),
+ resource_unavailable_try_again =
+ int(std::errc::resource_unavailable_try_again),
+ result_out_of_range = int(std::errc::result_out_of_range),
+ too_many_files_open_in_system = int(std::errc::too_many_files_open_in_system),
+ too_many_files_open = int(std::errc::too_many_files_open),
+ too_many_links = int(std::errc::too_many_links)
+};
+
+inline std::error_code make_error_code(errc E) {
+ return std::error_code(static_cast<int>(E), std::generic_category());
+}
+}
+
+namespace std {
+template <> struct is_error_code_enum<llvm::errc> : std::true_type {};
+}
+#endif
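
A usage sketch of the enum (assumes this header; openIt is a hypothetical helper). Both sides of the comparison live in std::generic_category(), so the == below really is the cheap value-plus-category compare the comment block promises:

#include "llvm/Support/Errc.h"
#include <system_error>

std::error_code openIt(bool Exists) { // hypothetical helper
  if (!Exists)
    return llvm::errc::no_such_file_or_directory; // implicit conversion
  return std::error_code();                       // success
}

bool isMissing() {
  std::error_code EC = openIt(false);
  return EC == llvm::errc::no_such_file_or_directory; // value + category
}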
diff --git a/include/llvm/Support/ErrorHandling.h b/include/llvm/Support/ErrorHandling.h
index ac3a4d8..9afd52d 100644
--- a/include/llvm/Support/ErrorHandling.h
+++ b/include/llvm/Support/ErrorHandling.h
@@ -30,9 +30,6 @@ namespace llvm {
/// install_fatal_error_handler - Installs a new error handler to be used
/// whenever a serious (non-recoverable) error is encountered by LLVM.
///
- /// If you are using llvm_start_multithreaded, you should register the handler
- /// before doing that.
- ///
/// If no error handler is installed the default is to print the error message
/// to stderr, and call exit(1). If an error handler is installed then it is
/// the handler's responsibility to log the message, it will no longer be
@@ -50,8 +47,6 @@ namespace llvm {
void *user_data = nullptr);
/// Restores default error handling behaviour.
- /// This must not be called between llvm_start_multithreaded() and
- /// llvm_stop_multithreaded().
void remove_fatal_error_handler();
/// ScopedFatalErrorHandler - This is a simple helper class which just
diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h
index becd957..0742a2d 100644
--- a/include/llvm/Support/ErrorOr.h
+++ b/include/llvm/Support/ErrorOr.h
@@ -18,8 +18,8 @@
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/Support/AlignOf.h"
-#include "llvm/Support/system_error.h"
#include <cassert>
+#include <system_error>
#include <type_traits>
namespace llvm {
@@ -94,15 +94,16 @@ private:
public:
template <class E>
- ErrorOr(E ErrorCode, typename std::enable_if<is_error_code_enum<E>::value ||
- is_error_condition_enum<E>::value,
- void *>::type = 0)
+ ErrorOr(E ErrorCode,
+ typename std::enable_if<std::is_error_code_enum<E>::value ||
+ std::is_error_condition_enum<E>::value,
+ void *>::type = 0)
: HasError(true) {
- new (getErrorStorage()) error_code(make_error_code(ErrorCode));
+ new (getErrorStorage()) std::error_code(make_error_code(ErrorCode));
}
- ErrorOr(llvm::error_code EC) : HasError(true) {
- new (getErrorStorage()) error_code(EC);
+ ErrorOr(std::error_code EC) : HasError(true) {
+ new (getErrorStorage()) std::error_code(EC);
}
ErrorOr(T Val) : HasError(false) {
@@ -162,8 +163,8 @@ public:
reference get() { return *getStorage(); }
const_reference get() const { return const_cast<ErrorOr<T> >(this)->get(); }
- error_code getError() const {
- return HasError ? *getErrorStorage() : error_code::success();
+ std::error_code getError() const {
+ return HasError ? *getErrorStorage() : std::error_code();
}
pointer operator ->() {
@@ -184,7 +185,7 @@ private:
} else {
// Get other's error.
HasError = true;
- new (getErrorStorage()) error_code(Other.getError());
+ new (getErrorStorage()) std::error_code(Other.getError());
}
}
@@ -216,7 +217,7 @@ private:
} else {
// Get other's error.
HasError = true;
- new (getErrorStorage()) error_code(Other.getError());
+ new (getErrorStorage()) std::error_code(Other.getError());
}
}
@@ -247,28 +248,29 @@ private:
return reinterpret_cast<const storage_type*>(TStorage.buffer);
}
- error_code *getErrorStorage() {
+ std::error_code *getErrorStorage() {
assert(HasError && "Cannot get error when a value exists!");
- return reinterpret_cast<error_code*>(ErrorStorage.buffer);
+ return reinterpret_cast<std::error_code *>(ErrorStorage.buffer);
}
- const error_code *getErrorStorage() const {
+ const std::error_code *getErrorStorage() const {
return const_cast<ErrorOr<T> *>(this)->getErrorStorage();
}
union {
AlignedCharArrayUnion<storage_type> TStorage;
- AlignedCharArrayUnion<error_code> ErrorStorage;
+ AlignedCharArrayUnion<std::error_code> ErrorStorage;
};
bool HasError : 1;
};
-template<class T, class E>
-typename std::enable_if<is_error_code_enum<E>::value ||
- is_error_condition_enum<E>::value, bool>::type
-operator ==(ErrorOr<T> &Err, E Code) {
- return error_code(Err) == Code;
+template <class T, class E>
+typename std::enable_if<std::is_error_code_enum<E>::value ||
+ std::is_error_condition_enum<E>::value,
+ bool>::type
+operator==(ErrorOr<T> &Err, E Code) {
+ return std::error_code(Err) == Code;
}
} // end namespace llvm
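
A usage sketch of ErrorOr after the migration (assumes this header; divide is a hypothetical helper). Only members visible in the hunks above are used:

#include "llvm/Support/ErrorOr.h"
#include <system_error>

llvm::ErrorOr<int> divide(int Num, int Den) { // hypothetical helper
  if (Den == 0)
    return std::make_error_code(std::errc::invalid_argument);
  return Num / Den;
}

int tryIt() {
  llvm::ErrorOr<int> R = divide(10, 2);
  if (std::error_code EC = R.getError())
    return -1; // EC.message() would describe the failure
  return R.get();
}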
diff --git a/include/llvm/Support/FEnv.h b/include/llvm/Support/FEnv.h
deleted file mode 100644
index 8560ee0..0000000
--- a/include/llvm/Support/FEnv.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//===- llvm/Support/FEnv.h - Host floating-point exceptions ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides an operating system independent interface to
-// floating-point exception interfaces.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_FENV_H
-#define LLVM_SUPPORT_FENV_H
-
-#include "llvm/Config/config.h"
-#include <cerrno>
-#ifdef HAVE_FENV_H
-#include <fenv.h>
-#endif
-
-// FIXME: Clang's #include handling apparently doesn't work for libstdc++'s
-// fenv.h; see PR6907 for details.
-#if defined(__clang__) && defined(_GLIBCXX_FENV_H)
-#undef HAVE_FENV_H
-#endif
-
-namespace llvm {
-namespace sys {
-
-/// llvm_fenv_clearexcept - Clear the floating-point exception state.
-static inline void llvm_fenv_clearexcept() {
-#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT
- feclearexcept(FE_ALL_EXCEPT);
-#endif
- errno = 0;
-}
-
-/// llvm_fenv_testexcept - Test if a floating-point exception was raised.
-static inline bool llvm_fenv_testexcept() {
- int errno_val = errno;
- if (errno_val == ERANGE || errno_val == EDOM)
- return true;
-#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT
- if (fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT))
- return true;
-#endif
- return false;
-}
-
-} // End sys namespace
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index a8a48fa..0a9a979 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -20,8 +20,6 @@
#include "llvm/Support/FileSystem.h"
namespace llvm {
-class error_code;
-
/// FileOutputBuffer - This interface provides simple way to create an in-memory
/// buffer which will be written to a file. During the lifetime of these
/// objects, the content or existence of the specified file is undefined. That
@@ -39,9 +37,9 @@ public:
/// Factory method to create an OutputBuffer object which manages a read/write
/// buffer of the specified size. When committed, the buffer will be written
/// to the file at the specified path.
- static error_code create(StringRef FilePath, size_t Size,
- std::unique_ptr<FileOutputBuffer> &Result,
- unsigned Flags = 0);
+ static std::error_code create(StringRef FilePath, size_t Size,
+ std::unique_ptr<FileOutputBuffer> &Result,
+ unsigned Flags = 0);
/// Returns a pointer to the start of the buffer.
uint8_t *getBufferStart() {
@@ -68,7 +66,7 @@ public:
/// is called, the file is deleted in the destructor. The optional parameter
/// is used if it turns out you want the file size to be smaller than
/// initially requested.
- error_code commit(int64_t NewSmallerSize = -1);
+ std::error_code commit(int64_t NewSmallerSize = -1);
/// If this object was previously committed, the destructor just deletes
/// this object. If this object was not committed, the destructor
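
A sketch of the create/mutate/commit protocol this class documents (assumes this header; path and payload are illustrative):

#include "llvm/Support/FileOutputBuffer.h"
#include <cstddef>
#include <cstring>
#include <memory>
#include <system_error>

std::error_code writeBlob(const char *Path, const char *Data, size_t Size) {
  std::unique_ptr<llvm::FileOutputBuffer> Buf;
  if (std::error_code EC = llvm::FileOutputBuffer::create(Path, Size, Buf))
    return EC;
  std::memcpy(Buf->getBufferStart(), Data, Size);
  return Buf->commit(); // nothing reaches Path until commit() succeeds
}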
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index 806a3e3..556701c 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -33,11 +33,11 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TimeValue.h"
-#include "llvm/Support/system_error.h"
#include <ctime>
#include <iterator>
#include <stack>
#include <string>
+#include <system_error>
#include <tuple>
#include <vector>
@@ -49,26 +49,18 @@ namespace llvm {
namespace sys {
namespace fs {
-/// An "enum class" enumeration for the file system's view of the type.
-struct file_type {
- enum Impl {
- status_error,
- file_not_found,
- regular_file,
- directory_file,
- symlink_file,
- block_file,
- character_file,
- fifo_file,
- socket_file,
- type_unknown
- };
-
- file_type(Impl V) : V(V) {}
- operator Impl() const { return V; }
-
-private:
- Impl V;
+/// An enumeration for the file system's view of the type.
+enum class file_type {
+ status_error,
+ file_not_found,
+ regular_file,
+ directory_file,
+ symlink_file,
+ block_file,
+ character_file,
+ fifo_file,
+ socket_file,
+ type_unknown
};
/// space_info - Self explanatory.
@@ -142,7 +134,7 @@ public:
};
/// file_status - Represents the result of a call to stat and friends. It has
-/// a platform specific member to store the result.
+/// a platform-specific member to store the result.
class file_status
{
#if defined(LLVM_ON_UNIX)
@@ -281,8 +273,8 @@ private:
///
/// @param path A path that is modified to be an absolute path.
/// @returns errc::success if \a path has been made absolute, otherwise a
-/// platform specific error_code.
-error_code make_absolute(SmallVectorImpl<char> &path);
+/// platform-specific error_code.
+std::error_code make_absolute(SmallVectorImpl<char> &path);
/// @brief Normalize path separators in \a Path
///
@@ -290,7 +282,7 @@ error_code make_absolute(SmallVectorImpl<char> &path);
/// This is particularly useful when cross-compiling Windows on Linux, but is
/// safe to invoke on Windows, which accepts both characters as a path
/// separator.
-error_code normalize_separators(SmallVectorImpl<char> &Path);
+std::error_code normalize_separators(SmallVectorImpl<char> &Path);
/// @brief Create all the non-existent directories in path.
///
@@ -298,7 +290,8 @@ error_code normalize_separators(SmallVectorImpl<char> &Path);
/// @returns errc::success if is_directory(path), otherwise a platform
/// specific error_code. If IgnoreExisting is false, also returns
/// error if the directory already existed.
-error_code create_directories(const Twine &path, bool IgnoreExisting = true);
+std::error_code create_directories(const Twine &path,
+ bool IgnoreExisting = true);
/// @brief Create the directory in path.
///
@@ -306,7 +299,7 @@ error_code create_directories(const Twine &path, bool IgnoreExisting = true);
/// @returns errc::success if is_directory(path), otherwise a platform
/// specific error_code. If IgnoreExisting is false, also returns
/// error if the directory already existed.
-error_code create_directory(const Twine &path, bool IgnoreExisting = true);
+std::error_code create_directory(const Twine &path, bool IgnoreExisting = true);
/// @brief Create a link from \a from to \a to.
///
@@ -319,36 +312,42 @@ error_code create_directory(const Twine &path, bool IgnoreExisting = true);
/// @param from The path to hard link from. This is created.
/// @returns errc::success if the link was created, otherwise a platform
/// specific error_code.
-error_code create_link(const Twine &to, const Twine &from);
+std::error_code create_link(const Twine &to, const Twine &from);
/// @brief Get the current path.
///
/// @param result Holds the current path on return.
/// @returns errc::success if the current path has been stored in result,
-/// otherwise a platform specific error_code.
-error_code current_path(SmallVectorImpl<char> &result);
+/// otherwise a platform-specific error_code.
+std::error_code current_path(SmallVectorImpl<char> &result);
/// @brief Remove path. Equivalent to POSIX remove().
///
/// @param path Input path.
/// @returns errc::success if path has been removed or didn't exist, otherwise a
-/// platform specific error code. If IgnoreNonExisting is false, also
+/// platform-specific error code. If IgnoreNonExisting is false, also
/// returns error if the file didn't exist.
-error_code remove(const Twine &path, bool IgnoreNonExisting = true);
+std::error_code remove(const Twine &path, bool IgnoreNonExisting = true);
/// @brief Rename \a from to \a to. Files are renamed as if by POSIX rename().
///
/// @param from The path to rename from.
/// @param to The path to rename to. This is created.
-error_code rename(const Twine &from, const Twine &to);
+std::error_code rename(const Twine &from, const Twine &to);
+
+/// @brief Copy the contents of \a From to \a To.
+///
+/// @param From The path to copy from.
+/// @param To The path to copy to. This is created.
+std::error_code copy_file(const Twine &From, const Twine &To);
/// @brief Resize path to size. File is resized as if by POSIX truncate().
///
/// @param path Input path.
/// @param size Size to resize to.
/// @returns errc::success if \a path has been resized to \a size, otherwise a
-/// platform specific error_code.
-error_code resize_file(const Twine &path, uint64_t size);
+/// platform-specific error_code.
+std::error_code resize_file(const Twine &path, uint64_t size);
/// @}
/// @name Physical Observers
@@ -367,8 +366,8 @@ bool exists(file_status status);
/// @param result Set to true if the file represented by status exists, false if
/// it does not. Undefined otherwise.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code exists(const Twine &path, bool &result);
+/// platform-specific error_code.
+std::error_code exists(const Twine &path, bool &result);
/// @brief Simpler version of exists for clients that don't need to
/// differentiate between an error and false.
@@ -409,8 +408,8 @@ bool equivalent(file_status A, file_status B);
/// @param result Set to true if stat(A) and stat(B) have the same device and
/// inode (or equivalent).
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code equivalent(const Twine &A, const Twine &B, bool &result);
+/// platform-specific error_code.
+std::error_code equivalent(const Twine &A, const Twine &B, bool &result);
/// @brief Simpler version of equivalent for clients that don't need to
/// differentiate between an error and false.
@@ -431,8 +430,8 @@ bool is_directory(file_status status);
/// @param result Set to true if \a path is a directory, false if it is not.
/// Undefined otherwise.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code is_directory(const Twine &path, bool &result);
+/// platform-specific error_code.
+std::error_code is_directory(const Twine &path, bool &result);
/// @brief Simpler version of is_directory for clients that don't need to
/// differentiate between an error and false.
@@ -453,8 +452,8 @@ bool is_regular_file(file_status status);
/// @param result Set to true if \a path is a regular file, false if it is not.
/// Undefined otherwise.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code is_regular_file(const Twine &path, bool &result);
+/// platform-specific error_code.
+std::error_code is_regular_file(const Twine &path, bool &result);
/// @brief Simpler version of is_regular_file for clients that don't need to
/// differentiate between an error and false.
@@ -479,41 +478,41 @@ bool is_other(file_status status);
/// @param result Set to true if \a path exists, but is not a directory, regular
/// file, or a symlink, false if it does not. Undefined otherwise.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code is_other(const Twine &path, bool &result);
+/// platform-specific error_code.
+std::error_code is_other(const Twine &path, bool &result);
/// @brief Get file status as if by POSIX stat().
///
/// @param path Input path.
/// @param result Set to the file status.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code status(const Twine &path, file_status &result);
+/// platform-specific error_code.
+std::error_code status(const Twine &path, file_status &result);
/// @brief A version for when a file descriptor is already available.
-error_code status(int FD, file_status &Result);
+std::error_code status(int FD, file_status &Result);
/// @brief Get file size.
///
/// @param Path Input path.
/// @param Result Set to the size of the file in \a Path.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-inline error_code file_size(const Twine &Path, uint64_t &Result) {
+/// platform-specific error_code.
+inline std::error_code file_size(const Twine &Path, uint64_t &Result) {
file_status Status;
- error_code EC = status(Path, Status);
+ std::error_code EC = status(Path, Status);
if (EC)
return EC;
Result = Status.getSize();
- return error_code::success();
+ return std::error_code();
}
/// @brief Set the file modification and access time.
///
/// @returns errc::success if the file times were successfully set, otherwise a
-/// platform specific error_code or errc::not_supported on platforms
-/// where the functionality isn't available.
-error_code setLastModificationAndAccessTime(int FD, TimeValue Time);
+/// platform-specific error_code or errc::function_not_supported on
+/// platforms where the functionality isn't available.
+std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time);
/// @brief Is status available?
///
@@ -526,8 +525,8 @@ bool status_known(file_status s);
/// @param path Input path.
/// @param result Set to true if status() != status_error.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code status_known(const Twine &path, bool &result);
+/// platform-specific error_code.
+std::error_code status_known(const Twine &path, bool &result);
/// @brief Create a uniquely named file.
///
@@ -549,14 +548,14 @@ error_code status_known(const Twine &path, bool &result);
/// @param ResultFD Set to the opened file's file descriptor.
/// @param ResultPath Set to the opened file's absolute path.
/// @returns errc::success if Result{FD,Path} have been successfully set,
-/// otherwise a platform specific error_code.
-error_code createUniqueFile(const Twine &Model, int &ResultFD,
- SmallVectorImpl<char> &ResultPath,
- unsigned Mode = all_read | all_write);
+/// otherwise a platform-specific error_code.
+std::error_code createUniqueFile(const Twine &Model, int &ResultFD,
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode = all_read | all_write);
/// @brief Simpler version for clients that don't want an open file.
-error_code createUniqueFile(const Twine &Model,
- SmallVectorImpl<char> &ResultPath);
+std::error_code createUniqueFile(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath);
/// @brief Create a file in the system temporary directory.
///
@@ -566,16 +565,16 @@ error_code createUniqueFile(const Twine &Model,
///
/// This should be used for things like a temporary .s that is removed after
/// running the assembler.
-error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
- int &ResultFD,
- SmallVectorImpl<char> &ResultPath);
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ int &ResultFD,
+ SmallVectorImpl<char> &ResultPath);
/// @brief Simpler version for clients that don't want an open file.
-error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
- SmallVectorImpl<char> &ResultPath);
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath);
-error_code createUniqueDirectory(const Twine &Prefix,
- SmallVectorImpl<char> &ResultPath);
+std::error_code createUniqueDirectory(const Twine &Prefix,
+ SmallVectorImpl<char> &ResultPath);
enum OpenFlags : unsigned {
F_None = 0,
@@ -606,31 +605,10 @@ inline OpenFlags &operator|=(OpenFlags &A, OpenFlags B) {
return A;
}
-error_code openFileForWrite(const Twine &Name, int &ResultFD, OpenFlags Flags,
- unsigned Mode = 0666);
+std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
+ OpenFlags Flags, unsigned Mode = 0666);
-error_code openFileForRead(const Twine &Name, int &ResultFD);
-
-/// @brief Are \a path's first bytes \a magic?
-///
-/// @param path Input path.
-/// @param magic Byte sequence to compare \a path's first len(magic) bytes to.
-/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code has_magic(const Twine &path, const Twine &magic, bool &result);
-
-/// @brief Get \a path's first \a len bytes.
-///
-/// @param path Input path.
-/// @param len Number of magic bytes to get.
-/// @param result Set to the first \a len bytes in the file pointed to by
-/// \a path. Or the entire file if file_size(path) < len, in which
-/// case result.size() returns the size of the file.
-/// @returns errc::success if result has been successfully set,
-/// errc::value_too_large if len is larger then the file pointed to by
-/// \a path, otherwise a platform specific error_code.
-error_code get_magic(const Twine &path, uint32_t len,
- SmallVectorImpl<char> &result);
+std::error_code openFileForRead(const Twine &Name, int &ResultFD);
/// @brief Identify the type of a binary file based on how magical it is.
file_magic identify_magic(StringRef magic);
@@ -640,10 +618,10 @@ file_magic identify_magic(StringRef magic);
/// @param path Input path.
/// @param result Set to the type of file, or file_magic::unknown.
/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code identify_magic(const Twine &path, file_magic &result);
+/// platform-specific error_code.
+std::error_code identify_magic(const Twine &path, file_magic &result);
-error_code getUniqueID(const Twine Path, UniqueID &Result);
+std::error_code getUniqueID(const Twine Path, UniqueID &Result);
/// This class represents a memory mapped file. It is based on
/// boost::iostreams::mapped_file.
@@ -660,7 +638,7 @@ public:
};
private:
- /// Platform specific mapping state.
+ /// Platform-specific mapping state.
mapmode Mode;
uint64_t Size;
void *Mapping;
@@ -670,7 +648,7 @@ private:
void *FileMappingHandle;
#endif
- error_code init(int FD, bool CloseFD, uint64_t Offset);
+ std::error_code init(int FD, bool CloseFD, uint64_t Offset);
public:
typedef char char_type;
@@ -692,21 +670,14 @@ public:
/// mapped_file_region::alignment().
/// \param ec This is set to errc::success if the map was constructed
/// successfully. Otherwise it is set to a platform dependent error.
- mapped_file_region(const Twine &path,
- mapmode mode,
- uint64_t length,
- uint64_t offset,
- error_code &ec);
+ mapped_file_region(const Twine &path, mapmode mode, uint64_t length,
+ uint64_t offset, std::error_code &ec);
/// \param fd An open file descriptor to map. mapped_file_region takes
/// ownership if closefd is true. It must have been opened in the correct
/// mode.
- mapped_file_region(int fd,
- bool closefd,
- mapmode mode,
- uint64_t length,
- uint64_t offset,
- error_code &ec);
+ mapped_file_region(int fd, bool closefd, mapmode mode, uint64_t length,
+ uint64_t offset, std::error_code &ec);
~mapped_file_region();
@@ -722,30 +693,6 @@ public:
static int alignment();
};
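[Editor's note] A sketch of constructing a read-only mapping with the reflowed constructor; this assumes the readonly mapmode enumerator, and the path and length are hypothetical:

    #include "llvm/Support/FileSystem.h"
    using namespace llvm::sys::fs;

    void mapWholeFile(uint64_t FileLength) {
      std::error_code EC;
      mapped_file_region Region("/tmp/example.bin", mapped_file_region::readonly,
                                FileLength, /*offset=*/0, EC);
      if (EC)
        return; // Mapping failed; EC carries the platform-specific error.
      // const_data()/size() give read access to the mapped bytes.
    }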
-/// @brief Memory maps the contents of a file
-///
-/// @param path Path to file to map.
-/// @param file_offset Byte offset in file where mapping should begin.
-/// @param size Byte length of range of the file to map.
-/// @param map_writable If true, the file will be mapped in r/w such
-/// that changes to the mapped buffer will be flushed back
-/// to the file. If false, the file will be mapped read-only
-/// and the buffer will be read-only.
-/// @param result Set to the start address of the mapped buffer.
-/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
- bool map_writable, void *&result);
-
-
-/// @brief Memory unmaps the contents of a file
-///
-/// @param base Pointer to the start of the buffer.
-/// @param size Byte length of the range to unmmap.
-/// @returns errc::success if result has been successfully set, otherwise a
-/// platform specific error_code.
-error_code unmap_file_pages(void *base, size_t size);
-
/// Return the path to the main executable, given the value of argv[0] from
/// program startup and the address of main itself. In extremis, this function
/// may fail and return an empty path.
@@ -777,7 +724,7 @@ public:
void replace_filename(const Twine &filename, file_status st = file_status());
const std::string &path() const { return Path; }
- error_code status(file_status &result) const;
+ std::error_code status(file_status &result) const;
bool operator==(const directory_entry& rhs) const { return Path == rhs.Path; }
bool operator!=(const directory_entry& rhs) const { return !(*this == rhs); }
@@ -790,9 +737,9 @@ public:
namespace detail {
struct DirIterState;
- error_code directory_iterator_construct(DirIterState&, StringRef);
- error_code directory_iterator_increment(DirIterState&);
- error_code directory_iterator_destruct(DirIterState&);
+ std::error_code directory_iterator_construct(DirIterState &, StringRef);
+ std::error_code directory_iterator_increment(DirIterState &);
+ std::error_code directory_iterator_destruct(DirIterState &);
/// DirIterState - Keeps state for the directory_iterator. It is reference
/// counted in order to preserve InputIterator semantics on copy.
@@ -816,14 +763,14 @@ class directory_iterator {
IntrusiveRefCntPtr<detail::DirIterState> State;
public:
- explicit directory_iterator(const Twine &path, error_code &ec) {
+ explicit directory_iterator(const Twine &path, std::error_code &ec) {
State = new detail::DirIterState;
SmallString<128> path_storage;
ec = detail::directory_iterator_construct(*State,
path.toStringRef(path_storage));
}
- explicit directory_iterator(const directory_entry &de, error_code &ec) {
+ explicit directory_iterator(const directory_entry &de, std::error_code &ec) {
State = new detail::DirIterState;
ec = detail::directory_iterator_construct(*State, de.path());
}
@@ -832,7 +779,7 @@ public:
directory_iterator() : State(nullptr) {}
// No operator++ because we need error_code.
- directory_iterator &increment(error_code &ec) {
+ directory_iterator &increment(std::error_code &ec) {
ec = directory_iterator_increment(*State);
return *this;
}
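[Editor's note] The increment(ec) idiom replaces operator++ so errors can surface; a minimal directory walk might look like the sketch below (the wrapper function is illustrative):

    #include "llvm/Support/FileSystem.h"
    using namespace llvm;

    void listDirectory(const char *Dir) {
      std::error_code EC;
      // A default-constructed iterator is the end sentinel.
      for (sys::fs::directory_iterator I(Dir, EC), E; I != E && !EC;
           I.increment(EC))
        (void)I->path(); // Full path of the current entry.
    }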
@@ -878,14 +825,14 @@ class recursive_directory_iterator {
public:
recursive_directory_iterator() {}
- explicit recursive_directory_iterator(const Twine &path, error_code &ec)
- : State(new detail::RecDirIterState) {
+ explicit recursive_directory_iterator(const Twine &path, std::error_code &ec)
+ : State(new detail::RecDirIterState) {
State->Stack.push(directory_iterator(path, ec));
if (State->Stack.top() == directory_iterator())
State.reset();
}
// No operator++ because we need error_code.
- recursive_directory_iterator &increment(error_code &ec) {
+ recursive_directory_iterator &increment(std::error_code &ec) {
const directory_iterator end_itr;
if (State->HasNoPushRequest)
@@ -934,7 +881,7 @@ public:
assert(State->Level > 0 && "Cannot pop an iterator with level < 1");
const directory_iterator end_itr;
- error_code ec;
+ std::error_code ec;
do {
if (ec)
report_fatal_error("Error incrementing directory iterator.");
diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h
index a62801f..b713cc7 100644
--- a/include/llvm/Support/Format.h
+++ b/include/llvm/Support/Format.h
@@ -36,23 +36,23 @@
namespace llvm {
-/// format_object_base - This is a helper class used for handling formatted
-/// output. It is the abstract base class of a templated derived class.
+/// This is a helper class used for handling formatted output. It is the
+/// abstract base class of a templated derived class.
class format_object_base {
protected:
const char *Fmt;
virtual void home(); // Out of line virtual method.
- /// snprint - Call snprintf() for this object, on the given buffer and size.
+ /// Call snprintf() for this object, on the given buffer and size.
virtual int snprint(char *Buffer, unsigned BufferSize) const = 0;
public:
format_object_base(const char *fmt) : Fmt(fmt) {}
virtual ~format_object_base() {}
- /// print - Format the object into the specified buffer. On success, this
- /// returns the length of the formatted string. If the buffer is too small,
- /// this returns a length to retry with, which will be larger than BufferSize.
+ /// Format the object into the specified buffer. On success, this returns
+ /// the length of the formatted string. If the buffer is too small, this
+ /// returns a length to retry with, which will be larger than BufferSize.
unsigned print(char *Buffer, unsigned BufferSize) const {
assert(BufferSize && "Invalid buffer size!");
@@ -61,21 +61,23 @@ public:
// VC++ and old GlibC return negative on overflow, just double the size.
if (N < 0)
- return BufferSize*2;
+ return BufferSize * 2;
- // Other impls yield number of bytes needed, not including the final '\0'.
+ // Other implementations yield the number of bytes needed, not including
+ // the final '\0'.
if (unsigned(N) >= BufferSize)
- return N+1;
+ return N + 1;
// Otherwise N is the length of output (not including the final '\0').
return N;
}
};
-/// format_object1 - This is a templated helper class used by the format
-/// function that captures the object to be formated and the format string. When
-/// actually printed, this synthesizes the string into a temporary buffer
-/// provided and returns whether or not it is big enough.
+/// These are templated helper classes used by the format function that
+/// capture the object to be formatted and the format string. When actually
+/// printed, they synthesize the string into a temporary buffer provided and
+/// return whether or not it is big enough.
+
template <typename T>
class format_object1 : public format_object_base {
T Val;
@@ -89,10 +91,6 @@ public:
}
};
-/// format_object2 - This is a templated helper class used by the format
-/// function that captures the object to be formated and the format string. When
-/// actually printed, this synthesizes the string into a temporary buffer
-/// provided and returns whether or not it is big enough.
template <typename T1, typename T2>
class format_object2 : public format_object_base {
T1 Val1;
@@ -107,10 +105,6 @@ public:
}
};
-/// format_object3 - This is a templated helper class used by the format
-/// function that captures the object to be formated and the format string. When
-/// actually printed, this synthesizes the string into a temporary buffer
-/// provided and returns whether or not it is big enough.
template <typename T1, typename T2, typename T3>
class format_object3 : public format_object_base {
T1 Val1;
@@ -126,10 +120,6 @@ public:
}
};
-/// format_object4 - This is a templated helper class used by the format
-/// function that captures the object to be formated and the format string. When
-/// actually printed, this synthesizes the string into a temporary buffer
-/// provided and returns whether or not it is big enough.
template <typename T1, typename T2, typename T3, typename T4>
class format_object4 : public format_object_base {
T1 Val1;
@@ -147,10 +137,6 @@ public:
}
};
-/// format_object5 - This is a templated helper class used by the format
-/// function that captures the object to be formated and the format string. When
-/// actually printed, this synthesizes the string into a temporary buffer
-/// provided and returns whether or not it is big enough.
template <typename T1, typename T2, typename T3, typename T4, typename T5>
class format_object5 : public format_object_base {
T1 Val1;
@@ -170,47 +156,52 @@ public:
}
};
-/// This is a helper function that is used to produce formatted output.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+class format_object6 : public format_object_base {
+ T1 Val1;
+ T2 Val2;
+ T3 Val3;
+ T4 Val4;
+ T5 Val5;
+ T6 Val6;
+public:
+ format_object6(const char *Fmt, const T1 &Val1, const T2 &Val2,
+ const T3 &Val3, const T4 &Val4, const T5 &Val5, const T6 &Val6)
+ : format_object_base(Fmt), Val1(Val1), Val2(Val2), Val3(Val3), Val4(Val4),
+ Val5(Val5), Val6(Val6) { }
+
+ int snprint(char *Buffer, unsigned BufferSize) const override {
+ return snprintf(Buffer, BufferSize, Fmt, Val1, Val2, Val3, Val4, Val5, Val6);
+ }
+};
+
+/// These are helper functions used to produce formatted output. They use
+/// template type deduction to construct the appropriate instance of the
+/// format_object class to simplify their construction.
///
/// This is typically used like:
/// \code
/// OS << format("%0.4f", myfloat) << '\n';
/// \endcode
+
template <typename T>
inline format_object1<T> format(const char *Fmt, const T &Val) {
return format_object1<T>(Fmt, Val);
}
-/// This is a helper function that is used to produce formatted output.
-///
-/// This is typically used like:
-/// \code
-/// OS << format("%0.4f", myfloat) << '\n';
-/// \endcode
template <typename T1, typename T2>
inline format_object2<T1, T2> format(const char *Fmt, const T1 &Val1,
const T2 &Val2) {
return format_object2<T1, T2>(Fmt, Val1, Val2);
}
-/// This is a helper function that is used to produce formatted output.
-///
-/// This is typically used like:
-/// \code
-/// OS << format("%0.4f", myfloat) << '\n';
-/// \endcode
template <typename T1, typename T2, typename T3>
inline format_object3<T1, T2, T3> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3) {
return format_object3<T1, T2, T3>(Fmt, Val1, Val2, Val3);
}
-/// This is a helper function that is used to produce formatted output.
-///
-/// This is typically used like:
-/// \code
-/// OS << format("%0.4f", myfloat) << '\n';
-/// \endcode
template <typename T1, typename T2, typename T3, typename T4>
inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3,
@@ -218,12 +209,6 @@ inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
return format_object4<T1, T2, T3, T4>(Fmt, Val1, Val2, Val3, Val4);
}
-/// This is a helper function that is used to produce formatted output.
-///
-/// This is typically used like:
-/// \code
-/// OS << format("%0.4f", myfloat) << '\n';
-/// \endcode
template <typename T1, typename T2, typename T3, typename T4, typename T5>
inline format_object5<T1, T2, T3, T4, T5> format(const char *Fmt,const T1 &Val1,
const T2 &Val2, const T3 &Val3,
@@ -231,6 +216,15 @@ inline format_object5<T1, T2, T3, T4, T5> format(const char *Fmt,const T1 &Val1,
return format_object5<T1, T2, T3, T4, T5>(Fmt, Val1, Val2, Val3, Val4, Val5);
}
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+inline format_object6<T1, T2, T3, T4, T5, T6>
+format(const char *Fmt, const T1 &Val1, const T2 &Val2, const T3 &Val3,
+ const T4 &Val4, const T5 &Val5, const T6 &Val6) {
+ return format_object6<T1, T2, T3, T4, T5, T6>(Fmt, Val1, Val2, Val3, Val4,
+ Val5, Val6);
+}
+
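[Editor's note] A minimal use of the new six-argument overload, in the same style as the documented pattern; the values are arbitrary:

    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printRow(raw_ostream &OS) {
      // Pass a const char * (not a string literal) so T6 deduces to a pointer.
      const char *Label = "total";
      // Deduces format_object6<int, int, int, int, int, const char *>.
      OS << format("%4d %4d %4d %4d %4d %s\n", 1, 2, 3, 4, 5, Label);
    }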
} // end namespace llvm
#endif
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h
index e344220..876ab6e 100644
--- a/include/llvm/Support/GenericDomTree.h
+++ b/include/llvm/Support/GenericDomTree.h
@@ -330,6 +330,10 @@ public:
return DomTreeNodes.lookup(BB);
}
+ inline DomTreeNodeBase<NodeT> *operator[](NodeT *BB) const {
+ return getNode(BB);
+ }
+
/// getRootNode - This returns the entry node for the CFG of the function. If
/// this tree represents the post-dominance relations for a function, however,
/// this root may be a node with the block == NULL. This is the case when
diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h
index 539673a..2f02aa7 100644
--- a/include/llvm/Support/GraphWriter.h
+++ b/include/llvm/Support/GraphWriter.h
@@ -50,7 +50,7 @@ namespace GraphProgram {
};
}
-void DisplayGraph(StringRef Filename, bool wait = true,
+bool DisplayGraph(StringRef Filename, bool wait = true,
GraphProgram::Name program = GraphProgram::DOT);
template<typename GraphType>
diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h
index 523a781..61c65da 100644
--- a/include/llvm/Support/LockFileManager.h
+++ b/include/llvm/Support/LockFileManager.h
@@ -12,11 +12,10 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
#include <utility> // for std::pair
namespace llvm {
-
/// \brief Class that manages the creation of a lock file to aid
/// implicit coordination between different processes.
///
@@ -56,7 +55,7 @@ private:
SmallString<128> UniqueLockFileName;
Optional<std::pair<std::string, int> > Owner;
- Optional<error_code> Error;
+ Optional<std::error_code> Error;
LockFileManager(const LockFileManager &) LLVM_DELETED_FUNCTION;
LockFileManager &operator=(const LockFileManager &) LLVM_DELETED_FUNCTION;
diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h
index 2a0fc7b..bd4dc2f 100644
--- a/include/llvm/Support/MachO.h
+++ b/include/llvm/Support/MachO.h
@@ -360,11 +360,28 @@ namespace llvm {
enum {
// Constant masks for the "n_desc" field in llvm::MachO::nlist and
// llvm::MachO::nlist_64
+ // The low 3 bits are for the REFERENCE_TYPE.
+ REFERENCE_TYPE = 0x7,
+ REFERENCE_FLAG_UNDEFINED_NON_LAZY = 0,
+ REFERENCE_FLAG_UNDEFINED_LAZY = 1,
+ REFERENCE_FLAG_DEFINED = 2,
+ REFERENCE_FLAG_PRIVATE_DEFINED = 3,
+ REFERENCE_FLAG_PRIVATE_UNDEFINED_NON_LAZY = 4,
+ REFERENCE_FLAG_PRIVATE_UNDEFINED_LAZY = 5,
+ // Flag bits (some overlap with the library ordinal bits).
N_ARM_THUMB_DEF = 0x0008u,
+ REFERENCED_DYNAMICALLY = 0x0010u,
N_NO_DEAD_STRIP = 0x0020u,
N_WEAK_REF = 0x0040u,
N_WEAK_DEF = 0x0080u,
- N_SYMBOL_RESOLVER = 0x0100u
+ N_SYMBOL_RESOLVER = 0x0100u,
+ N_ALT_ENTRY = 0x0200u,
+ // For undefined symbols coming from libraries, the library ordinal is
+ // stored in the top 8 bits; see GET_LIBRARY_ORDINAL().
+ SELF_LIBRARY_ORDINAL = 0x0,
+ MAX_LIBRARY_ORDINAL = 0xfd,
+ DYNAMIC_LOOKUP_ORDINAL = 0xfe,
+ EXECUTABLE_ORDINAL = 0xff
};
enum StabType {
@@ -998,8 +1015,8 @@ namespace llvm {
enum : uint32_t {
// Capability bits used in the definition of cpusubtype.
- CPU_SUB_TYPE_MASK = 0xff000000, // Mask for architecture bits
- CPU_SUB_TYPE_LIB64 = 0x80000000, // 64 bit libraries
+ CPU_SUBTYPE_MASK = 0xff000000, // Mask for architecture bits
+ CPU_SUBTYPE_LIB64 = 0x80000000, // 64 bit libraries
// Special CPU subtype constants.
CPU_SUBTYPE_MULTIPLE = ~0u
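[Editor's note] For orientation, a sketch of how the n_desc masks added above are typically consumed; Desc stands for a hypothetical value read from an nlist entry:

    #include "llvm/Support/MachO.h"
    using namespace llvm;

    bool isWeakThumbDef(uint16_t Desc) {
      // The low three bits hold the reference type; the rest are flag bits.
      unsigned RefType = Desc & MachO::REFERENCE_TYPE;
      bool WeakDef = (Desc & MachO::N_WEAK_DEF) != 0;
      bool ThumbDef = (Desc & MachO::N_ARM_THUMB_DEF) != 0;
      return RefType == MachO::REFERENCE_FLAG_DEFINED && WeakDef && ThumbDef;
    }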
diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h
index 1bb8cea..d8fbfeb 100644
--- a/include/llvm/Support/ManagedStatic.h
+++ b/include/llvm/Support/ManagedStatic.h
@@ -103,9 +103,6 @@ void llvm_shutdown();
/// llvm_shutdown() when it is destroyed.
struct llvm_shutdown_obj {
llvm_shutdown_obj() { }
- explicit llvm_shutdown_obj(bool multithreaded) {
- if (multithreaded) llvm_start_multithreaded();
- }
~llvm_shutdown_obj() { llvm_shutdown(); }
};
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index f1f7b4f..0abba62 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -230,6 +230,9 @@ static const unsigned char BitReverseTable256[256] = {
#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
R6(0), R6(2), R6(1), R6(3)
+#undef R2
+#undef R4
+#undef R6
};
/// \brief Reverse the bits in \p Val.
@@ -258,6 +261,12 @@ inline uint32_t Lo_32(uint64_t Value) {
return static_cast<uint32_t>(Value);
}
+/// Make_64 - This functions makes a 64-bit integer from a high / low pair of
+/// 32-bit integers.
+inline uint64_t Make_64(uint32_t High, uint32_t Low) {
+ return ((uint64_t)High << 32) | (uint64_t)Low;
+}
+
/// isInt - Checks if an integer fits into the given bit width.
template<unsigned N>
inline bool isInt(int64_t x) {
diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h
index 0996adb..b4305cb 100644
--- a/include/llvm/Support/Memory.h
+++ b/include/llvm/Support/Memory.h
@@ -15,8 +15,8 @@
#define LLVM_SUPPORT_MEMORY_H
#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/system_error.h"
#include <string>
+#include <system_error>
namespace llvm {
namespace sys {
@@ -77,7 +77,7 @@ namespace sys {
static MemoryBlock allocateMappedMemory(size_t NumBytes,
const MemoryBlock *const NearBlock,
unsigned Flags,
- error_code &EC);
+ std::error_code &EC);
/// This method releases a block of memory that was allocated with the
/// allocateMappedMemory method. It should not be used to release any
@@ -88,7 +88,7 @@ namespace sys {
/// describing the failure if an error occurred.
///
/// @brief Release mapped memory.
- static error_code releaseMappedMemory(MemoryBlock &Block);
+ static std::error_code releaseMappedMemory(MemoryBlock &Block);
/// This method sets the protection flags for a block of memory to the
/// state specified by /p Flags. The behavior is not specified if the
@@ -105,8 +105,8 @@ namespace sys {
/// describing the failure if an error occurred.
///
/// @brief Set memory protection state.
- static error_code protectMappedMemory(const MemoryBlock &Block,
- unsigned Flags);
+ static std::error_code protectMappedMemory(const MemoryBlock &Block,
+ unsigned Flags);
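[Editor's note] A round-trip sketch of the std::error_code-based allocation API above; the size and flag choices are illustrative:

    #include "llvm/Support/Memory.h"
    using namespace llvm;

    void roundTripAllocation() {
      std::error_code EC;
      sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
          4096, /*NearBlock=*/nullptr,
          sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
      if (EC)
        return;
      // ... fill MB.base() ...
      EC = sys::Memory::protectMappedMemory(MB, sys::Memory::MF_READ);
      EC = sys::Memory::releaseMappedMemory(MB);
    }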
/// This method allocates a block of Read/Write/Execute memory that is
/// suitable for executing dynamically generated code (e.g. JIT). An
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 5810c47..147be47 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -19,12 +19,11 @@
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorOr.h"
#include <memory>
+#include <system_error>
namespace llvm {
-
-class error_code;
-
/// MemoryBuffer - This interface provides simple read-only access to a block
/// of memory, and provides simple methods for reading files and standard input
/// into a memory buffer. In addition to basic access to the characters in the
@@ -62,19 +61,17 @@ public:
return "Unknown buffer";
}
- /// getFile - Open the specified file as a MemoryBuffer, returning a new
- /// MemoryBuffer if successful, otherwise returning null. If FileSize is
- /// specified, this means that the client knows that the file exists and that
- /// it has the specified size.
+ /// Open the specified file as a MemoryBuffer, returning a new MemoryBuffer
+ /// if successful, otherwise returning null. If FileSize is specified, this
+ /// means that the client knows that the file exists and that it has the
+ /// specified size.
///
/// \param IsVolatileSize Set to true to indicate that the file size may be
/// changing, e.g. when libclang tries to parse while the user is
/// editing/updating the file.
- static error_code getFile(Twine Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- int64_t FileSize = -1,
- bool RequiresNullTerminator = true,
- bool IsVolatileSize = false);
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getFile(Twine Filename, int64_t FileSize = -1,
+ bool RequiresNullTerminator = true, bool IsVolatileSize = false);
/// Given an already-open file descriptor, map some slice of it into a
/// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize.
@@ -83,10 +80,9 @@ public:
/// \param IsVolatileSize Set to true to indicate that the file size may be
/// changing, e.g. when libclang tries to parse while the user is
/// editing/updating the file.
- static error_code getOpenFileSlice(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t MapSize, int64_t Offset,
- bool IsVolatileSize = false);
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize,
+ int64_t Offset, bool IsVolatileSize = false);
/// Given an already-open file descriptor, read the file and return a
/// MemoryBuffer.
@@ -94,11 +90,9 @@ public:
/// \param IsVolatileSize Set to true to indicate that the file size may be
/// changing, e.g. when libclang tries to parse while the user is
/// editing/updating the file.
- static error_code getOpenFile(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t FileSize,
- bool RequiresNullTerminator = true,
- bool IsVolatileSize = false);
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getOpenFile(int FD, const char *Filename, uint64_t FileSize,
+ bool RequiresNullTerminator = true, bool IsVolatileSize = false);
/// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note
/// that InputData must be null terminated if RequiresNullTerminator is true.
@@ -125,17 +119,13 @@ public:
static MemoryBuffer *getNewUninitMemBuffer(size_t Size,
StringRef BufferName = "");
- /// getSTDIN - Read all of stdin into a file buffer, and return it.
- /// If an error occurs, this returns null and sets ec.
- static error_code getSTDIN(std::unique_ptr<MemoryBuffer> &Result);
-
+ /// Read all of stdin into a file buffer, and return it.
+ static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN();
- /// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin
- /// if the Filename is "-". If an error occurs, this returns null and sets
- /// ec.
- static error_code getFileOrSTDIN(StringRef Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- int64_t FileSize = -1);
+ /// Open the specified file as a MemoryBuffer, or open stdin if the Filename
+ /// is "-".
+ static ErrorOr<std::unique_ptr<MemoryBuffer>>
+ getFileOrSTDIN(StringRef Filename, int64_t FileSize = -1);
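[Editor's note] With the factories now returning ErrorOr, a caller unwraps the result instead of passing an out-parameter; a minimal sketch (readInput is a hypothetical wrapper):

    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    std::unique_ptr<MemoryBuffer> readInput(StringRef Path) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
          MemoryBuffer::getFileOrSTDIN(Path);
      if (std::error_code EC = BufOrErr.getError()) {
        errs() << Path << ": " << EC.message() << "\n";
        return nullptr;
      }
      return std::move(*BufOrErr);
    }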
//===--------------------------------------------------------------------===//
// Provided for performance analysis.
diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h
index 7f6441e..30973de 100644
--- a/include/llvm/Support/Process.h
+++ b/include/llvm/Support/Process.h
@@ -31,7 +31,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/TimeValue.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
class StringRef;
@@ -171,10 +171,17 @@ public:
// string. \arg Name is assumed to be in UTF-8 encoding too.
static Optional<std::string> GetEnv(StringRef name);
+ /// This function searches for an existing file in the list of directories
+ /// in a PATH like environment variable, and returns the first file found,
+ /// according to the order of the entries in the PATH like environment
+ /// variable.
+ static Optional<std::string> FindInEnvPath(const std::string& EnvName,
+ const std::string& FileName);
+
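[Editor's note] A sketch of the new lookup helper; the environment variable and file name below are examples only:

    #include "llvm/Support/Process.h"
    using namespace llvm;

    void findTool() {
      if (Optional<std::string> P =
              sys::Process::FindInEnvPath("PATH", "clang-format"))
        (void)*P; // First match, in $PATH order.
    }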
/// This function returns a SmallVector containing the arguments passed from
/// the operating system to the program. This function expects to be handed
/// the vector passed in from main.
- static error_code
+ static std::error_code
GetArgumentVector(SmallVectorImpl<const char *> &Args,
ArrayRef<const char *> ArgsFromMain,
SpecificBumpPtrAllocator<char> &ArgAllocator);
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 9160b7d..51279a9 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -16,10 +16,9 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
-class error_code;
namespace sys {
/// This is the OS-specific separator for PATH like environment variables:
@@ -67,8 +66,8 @@ struct ProcessInfo {
// These functions change the specified standard stream (stdin or stdout) to
// binary mode. They return errc::success if the specified stream
// was changed. Otherwise a platform dependent error is returned.
- error_code ChangeStdinToBinary();
- error_code ChangeStdoutToBinary();
+ std::error_code ChangeStdinToBinary();
+ std::error_code ChangeStdoutToBinary();
/// This function executes the program using the arguments provided. The
/// invoked program will inherit the stdin, stdout, and stderr file
diff --git a/include/llvm/Support/RandomNumberGenerator.h b/include/llvm/Support/RandomNumberGenerator.h
new file mode 100644
index 0000000..cadc713
--- /dev/null
+++ b/include/llvm/Support/RandomNumberGenerator.h
@@ -0,0 +1,57 @@
+//==- llvm/Support/RandomNumberGenerator.h - RNG for diversity ---*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an abstraction for random number generation (RNG).
+// Note that the current implementation is not cryptographically secure
+// as it uses the C++11 <random> facilities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_RANDOMNUMBERGENERATOR_H_
+#define LLVM_SUPPORT_RANDOMNUMBERGENERATOR_H_
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h" // Needed for uint64_t on Windows.
+#include <random>
+
+namespace llvm {
+
+/// A random number generator.
+/// Instances of this class should not be shared across threads.
+class RandomNumberGenerator {
+public:
+ /// Seeds and salts the underlying RNG engine. The salt of type StringRef
+ /// is passed into the constructor. The seed can be set on the command
+ /// line via -rng-seed=<uint64>.
+ /// The reason for the salt is to ensure different random streams even if
+ /// the same seed is used for multiple invocations of the compiler.
+ /// A good salt value should add additional entropy and be constant across
+ /// different machines (i.e., no paths) to allow for reproducible builds.
+ /// An instance of this class can be retrieved from the current Module.
+ /// \see Module::getRNG
+ RandomNumberGenerator(StringRef Salt);
+
+ /// Returns a random number in the range [0, Max).
+ uint64_t next(uint64_t Max);
+
+private:
+ // 64-bit Mersenne Twister by Matsumoto and Nishimura, 2000
+ // http://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine
+ std::mt19937_64 Generator;
+
+ // Noncopyable.
+ RandomNumberGenerator(const RandomNumberGenerator &other)
+ LLVM_DELETED_FUNCTION;
+ RandomNumberGenerator &
+ operator=(const RandomNumberGenerator &other) LLVM_DELETED_FUNCTION;
+};
+} // end namespace llvm
+
+#endif
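[Editor's note] A minimal use of the class; the salt string below is made up:

    #include "llvm/Support/RandomNumberGenerator.h"
    using namespace llvm;

    void rollDice() {
      // The salt should be constant across machines, e.g. a pass name.
      RandomNumberGenerator RNG("my-pass-salt");
      uint64_t Die = RNG.next(6); // Uniformly distributed in [0, 6).
      (void)Die;
    }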
diff --git a/include/llvm/Support/ScaledNumber.h b/include/llvm/Support/ScaledNumber.h
new file mode 100644
index 0000000..2bd7e74
--- /dev/null
+++ b/include/llvm/Support/ScaledNumber.h
@@ -0,0 +1,897 @@
+//===- llvm/Support/ScaledNumber.h - Support for scaled numbers -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions (and a class) useful for working with scaled
+// numbers -- in particular, pairs of integers where one represents digits and
+// another represents a scale. The functions are helpers and live in the
+// namespace ScaledNumbers. The class ScaledNumber is useful for modelling
+// certain cost metrics that need simple, integer-like semantics that are easy
+// to reason about.
+//
+// These might remind you of soft-floats. If you want one of those, you're in
+// the wrong place. Look at include/llvm/ADT/APFloat.h instead.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SCALEDNUMBER_H
+#define LLVM_SUPPORT_SCALEDNUMBER_H
+
+#include "llvm/Support/MathExtras.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+
+namespace llvm {
+namespace ScaledNumbers {
+
+/// \brief Maximum scale; same as APFloat for easy debug printing.
+const int32_t MaxScale = 16383;
+
+/// \brief Minimum scale; same as APFloat for easy debug printing.
+const int32_t MinScale = -16382;
+
+/// \brief Get the width of a number.
+template <class DigitsT> inline int getWidth() { return sizeof(DigitsT) * 8; }
+
+/// \brief Conditionally round up a scaled number.
+///
+/// Given \c Digits and \c Scale, round up iff \c ShouldRound is \c true.
+/// Always returns \c Scale unless there's an overflow, in which case it
+/// returns \c 1+Scale.
+///
+/// \pre adding 1 to \c Scale will not overflow INT16_MAX.
+template <class DigitsT>
+inline std::pair<DigitsT, int16_t> getRounded(DigitsT Digits, int16_t Scale,
+ bool ShouldRound) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ if (ShouldRound)
+ if (!++Digits)
+ // Overflow.
+ return std::make_pair(DigitsT(1) << (getWidth<DigitsT>() - 1), Scale + 1);
+ return std::make_pair(Digits, Scale);
+}
+
+/// \brief Convenience helper for 32-bit rounding.
+inline std::pair<uint32_t, int16_t> getRounded32(uint32_t Digits, int16_t Scale,
+ bool ShouldRound) {
+ return getRounded(Digits, Scale, ShouldRound);
+}
+
+/// \brief Convenience helper for 64-bit rounding.
+inline std::pair<uint64_t, int16_t> getRounded64(uint64_t Digits, int16_t Scale,
+ bool ShouldRound) {
+ return getRounded(Digits, Scale, ShouldRound);
+}
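[Editor's note] A quick check of the rounding/overflow behaviour documented above:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void roundedExamples() {
      // No rounding requested: digits and scale pass through unchanged.
      auto A = getRounded32(7, 0, false);         // (7, 0)
      // Rounding UINT32_MAX overflows the digits, so the scale bumps by one.
      auto B = getRounded32(UINT32_MAX, 0, true); // (1u << 31, 1)
      (void)A; (void)B;
    }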
+
+/// \brief Adjust a 64-bit scaled number down to the appropriate width.
+///
+/// \pre Adding 64 to \c Scale will not overflow INT16_MAX.
+template <class DigitsT>
+inline std::pair<DigitsT, int16_t> getAdjusted(uint64_t Digits,
+ int16_t Scale = 0) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ const int Width = getWidth<DigitsT>();
+ if (Width == 64 || Digits <= std::numeric_limits<DigitsT>::max())
+ return std::make_pair(Digits, Scale);
+
+ // Shift right and round.
+ int Shift = 64 - Width - countLeadingZeros(Digits);
+ return getRounded<DigitsT>(Digits >> Shift, Scale + Shift,
+ Digits & (UINT64_C(1) << (Shift - 1)));
+}
+
+/// \brief Convenience helper for adjusting to 32 bits.
+inline std::pair<uint32_t, int16_t> getAdjusted32(uint64_t Digits,
+ int16_t Scale = 0) {
+ return getAdjusted<uint32_t>(Digits, Scale);
+}
+
+/// \brief Convenience helper for adjusting to 64 bits.
+inline std::pair<uint64_t, int16_t> getAdjusted64(uint64_t Digits,
+ int16_t Scale = 0) {
+ return getAdjusted<uint64_t>(Digits, Scale);
+}
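[Editor's note] One data point for the adjustment logic above:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void adjustedExample() {
      // 2^40 does not fit in 32-bit digits; it is shifted right by 9 bits,
      // giving digits 2^31 at scale 9 (2^31 * 2^9 == 2^40).
      auto X = getAdjusted32(UINT64_C(1) << 40); // (1u << 31, 9)
      (void)X;
    }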
+
+/// \brief Multiply two 64-bit integers to create a 64-bit scaled number.
+///
+/// Implemented with four 64-bit integer multiplies.
+std::pair<uint64_t, int16_t> multiply64(uint64_t LHS, uint64_t RHS);
+
+/// \brief Multiply two 32-bit integers to create a 32-bit scaled number.
+///
+/// Implemented with one 64-bit integer multiply.
+template <class DigitsT>
+inline std::pair<DigitsT, int16_t> getProduct(DigitsT LHS, DigitsT RHS) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ if (getWidth<DigitsT>() <= 32 || (LHS <= UINT32_MAX && RHS <= UINT32_MAX))
+ return getAdjusted<DigitsT>(uint64_t(LHS) * RHS);
+
+ return multiply64(LHS, RHS);
+}
+
+/// \brief Convenience helper for 32-bit product.
+inline std::pair<uint32_t, int16_t> getProduct32(uint32_t LHS, uint32_t RHS) {
+ return getProduct(LHS, RHS);
+}
+
+/// \brief Convenience helper for 64-bit product.
+inline std::pair<uint64_t, int16_t> getProduct64(uint64_t LHS, uint64_t RHS) {
+ return getProduct(LHS, RHS);
+}
+
+/// \brief Divide two 64-bit integers to create a 64-bit scaled number.
+///
+/// Implemented with long division.
+///
+/// \pre \c Dividend and \c Divisor are non-zero.
+std::pair<uint64_t, int16_t> divide64(uint64_t Dividend, uint64_t Divisor);
+
+/// \brief Divide two 32-bit integers to create a 32-bit scaled number.
+///
+/// Implemented with one 64-bit integer divide/remainder pair.
+///
+/// \pre \c Dividend and \c Divisor are non-zero.
+std::pair<uint32_t, int16_t> divide32(uint32_t Dividend, uint32_t Divisor);
+
+/// \brief Divide two 32-bit numbers to create a 32-bit scaled number.
+///
+/// Implemented with one 64-bit integer divide/remainder pair.
+///
+/// Returns \c (DigitsT_MAX, MaxScale) for divide-by-zero, and \c (0, 0) for
+/// 0/0.
+template <class DigitsT>
+std::pair<DigitsT, int16_t> getQuotient(DigitsT Dividend, DigitsT Divisor) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+ static_assert(sizeof(DigitsT) == 4 || sizeof(DigitsT) == 8,
+ "expected 32-bit or 64-bit digits");
+
+ // Check for zero.
+ if (!Dividend)
+ return std::make_pair(0, 0);
+ if (!Divisor)
+ return std::make_pair(std::numeric_limits<DigitsT>::max(), MaxScale);
+
+ if (getWidth<DigitsT>() == 64)
+ return divide64(Dividend, Divisor);
+ return divide32(Dividend, Divisor);
+}
+
+/// \brief Convenience helper for 32-bit quotient.
+inline std::pair<uint32_t, int16_t> getQuotient32(uint32_t Dividend,
+ uint32_t Divisor) {
+ return getQuotient(Dividend, Divisor);
+}
+
+/// \brief Convenience helper for 64-bit quotient.
+inline std::pair<uint64_t, int16_t> getQuotient64(uint64_t Dividend,
+ uint64_t Divisor) {
+ return getQuotient(Dividend, Divisor);
+}
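[Editor's note] The divide-by-zero conventions above, exercised directly:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void quotientExamples() {
      auto Zero = getQuotient32(0, 0); // Defined as (0, 0).
      auto Inf = getQuotient32(5, 0);  // Saturates to (UINT32_MAX, MaxScale).
      auto Half = getQuotient32(1, 2); // ~0.5: high-bit digits, negative scale.
      (void)Zero; (void)Inf; (void)Half;
    }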
+
+/// \brief Implementation of getLg() and friends.
+///
+/// Returns the rounded lg of \c Digits*2^Scale and an int specifying whether
+/// this was rounded up (1), down (-1), or exact (0).
+///
+/// Returns \c INT32_MIN when \c Digits is zero.
+template <class DigitsT>
+inline std::pair<int32_t, int> getLgImpl(DigitsT Digits, int16_t Scale) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ if (!Digits)
+ return std::make_pair(INT32_MIN, 0);
+
+ // Get the floor of the lg of Digits.
+ int32_t LocalFloor = sizeof(Digits) * 8 - countLeadingZeros(Digits) - 1;
+
+ // Get the actual floor.
+ int32_t Floor = Scale + LocalFloor;
+ if (Digits == UINT64_C(1) << LocalFloor)
+ return std::make_pair(Floor, 0);
+
+ // Round based on the next digit.
+ assert(LocalFloor >= 1);
+ bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
+ return std::make_pair(Floor + Round, Round ? 1 : -1);
+}
+
+/// \brief Get the lg (rounded) of a scaled number.
+///
+/// Get the lg of \c Digits*2^Scale.
+///
+/// Returns \c INT32_MIN when \c Digits is zero.
+template <class DigitsT> int32_t getLg(DigitsT Digits, int16_t Scale) {
+ return getLgImpl(Digits, Scale).first;
+}
+
+/// \brief Get the lg floor of a scaled number.
+///
+/// Get the floor of the lg of \c Digits*2^Scale.
+///
+/// Returns \c INT32_MIN when \c Digits is zero.
+template <class DigitsT> int32_t getLgFloor(DigitsT Digits, int16_t Scale) {
+ auto Lg = getLgImpl(Digits, Scale);
+ return Lg.first - (Lg.second > 0);
+}
+
+/// \brief Get the lg ceiling of a scaled number.
+///
+/// Get the ceiling of the lg of \c Digits*2^Scale.
+///
+/// Returns \c INT32_MIN when \c Digits is zero.
+template <class DigitsT> int32_t getLgCeiling(DigitsT Digits, int16_t Scale) {
+ auto Lg = getLgImpl(Digits, Scale);
+ return Lg.first + (Lg.second < 0);
+}
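[Editor's note] The three rounding modes, shown on lg(3) ~= 1.585:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void lgExamples() {
      int32_t Rounded = getLg<uint32_t>(3, 0);        // 2 (rounded up)
      int32_t Floor = getLgFloor<uint32_t>(3, 0);     // 1
      int32_t Ceiling = getLgCeiling<uint32_t>(3, 0); // 2
      (void)Rounded; (void)Floor; (void)Ceiling;
    }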
+
+/// \brief Implementation for comparing scaled numbers.
+///
+/// Compare two 64-bit numbers with different scales. Given that the scale of
+/// \c L is higher than that of \c R by \c ScaleDiff, compare them. Return -1,
+/// 1, and 0 for less than, greater than, and equal, respectively.
+///
+/// \pre 0 <= ScaleDiff < 64.
+int compareImpl(uint64_t L, uint64_t R, int ScaleDiff);
+
+/// \brief Compare two scaled numbers.
+///
+/// Compare two scaled numbers. Returns 0 for equal, -1 for less than, and 1
+/// for greater than.
+template <class DigitsT>
+int compare(DigitsT LDigits, int16_t LScale, DigitsT RDigits, int16_t RScale) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ // Check for zero.
+ if (!LDigits)
+ return RDigits ? -1 : 0;
+ if (!RDigits)
+ return 1;
+
+ // Check for the scale. Use getLgFloor to be sure that the scale difference
+ // is always lower than 64.
+ int32_t lgL = getLgFloor(LDigits, LScale), lgR = getLgFloor(RDigits, RScale);
+ if (lgL != lgR)
+ return lgL < lgR ? -1 : 1;
+
+ // Compare digits.
+ if (LScale < RScale)
+ return compareImpl(LDigits, RDigits, RScale - LScale);
+
+ return -compareImpl(RDigits, LDigits, LScale - RScale);
+}
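[Editor's note] Since there is no canonical form, equal values with different scales compare equal:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void compareExample() {
      // 1*2^3 and 8*2^0 represent the same number.
      int C = compare<uint32_t>(1, 3, 8, 0); // 0
      (void)C;
    }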
+
+/// \brief Match scales of two numbers.
+///
+/// Given two scaled numbers, match up their scales. Change the digits and
+/// scales in place. Shift the digits as necessary to form equivalent numbers,
+/// losing precision only when necessary.
+///
+/// If the output value of \c LDigits (\c RDigits) is \c 0, the output value of
+/// \c LScale (\c RScale) is unspecified.
+///
+/// As a convenience, returns the matching scale. If the output value of one
+/// number is zero, returns the scale of the other. If both are zero, which
+/// scale is returned is unspecified.
+template <class DigitsT>
+int16_t matchScales(DigitsT &LDigits, int16_t &LScale, DigitsT &RDigits,
+ int16_t &RScale) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ if (LScale < RScale)
+ // Swap arguments.
+ return matchScales(RDigits, RScale, LDigits, LScale);
+ if (!LDigits)
+ return RScale;
+ if (!RDigits || LScale == RScale)
+ return LScale;
+
+ // Now LScale > RScale. Get the difference.
+ int32_t ScaleDiff = int32_t(LScale) - RScale;
+ if (ScaleDiff >= 2 * getWidth<DigitsT>()) {
+ // Don't bother shifting. RDigits will get zero-ed out anyway.
+ RDigits = 0;
+ return LScale;
+ }
+
+ // Shift LDigits left as much as possible, then shift RDigits right.
+ int32_t ShiftL = std::min<int32_t>(countLeadingZeros(LDigits), ScaleDiff);
+ assert(ShiftL < getWidth<DigitsT>() && "can't shift more than width");
+
+ int32_t ShiftR = ScaleDiff - ShiftL;
+ if (ShiftR >= getWidth<DigitsT>()) {
+ // Don't bother shifting. RDigits will get zero-ed out anyway.
+ RDigits = 0;
+ return LScale;
+ }
+
+ LDigits <<= ShiftL;
+ RDigits >>= ShiftR;
+
+ LScale -= ShiftL;
+ RScale += ShiftR;
+ assert(LScale == RScale && "scales should match");
+ return LScale;
+}
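[Editor's note] matchScales in action on the same pair of representations:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void matchScalesExample() {
      uint32_t L = 1, R = 8;
      int16_t LS = 3, RS = 0;
      // L is shifted left by 3, so both operands end up as 8 at scale 0.
      int16_t S = matchScales(L, LS, R, RS); // L == 8, R == 8, S == 0
      (void)S;
    }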
+
+/// \brief Get the sum of two scaled numbers.
+///
+/// Get the sum of two scaled numbers with as much precision as possible.
+///
+/// \pre Adding 1 to \c LScale (or \c RScale) will not overflow INT16_MAX.
+template <class DigitsT>
+std::pair<DigitsT, int16_t> getSum(DigitsT LDigits, int16_t LScale,
+ DigitsT RDigits, int16_t RScale) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ // Check inputs up front. This is only relevant if addition overflows, but
+ // testing here should catch more bugs.
+ assert(LScale < INT16_MAX && "scale too large");
+ assert(RScale < INT16_MAX && "scale too large");
+
+ // Normalize digits to match scales.
+ int16_t Scale = matchScales(LDigits, LScale, RDigits, RScale);
+
+ // Compute sum.
+ DigitsT Sum = LDigits + RDigits;
+ if (Sum >= RDigits)
+ return std::make_pair(Sum, Scale);
+
+ // Adjust sum after arithmetic overflow.
+ DigitsT HighBit = DigitsT(1) << (getWidth<DigitsT>() - 1);
+ return std::make_pair(HighBit | Sum >> 1, Scale + 1);
+}
+
+/// \brief Convenience helper for 32-bit sum.
+inline std::pair<uint32_t, int16_t> getSum32(uint32_t LDigits, int16_t LScale,
+ uint32_t RDigits, int16_t RScale) {
+ return getSum(LDigits, LScale, RDigits, RScale);
+}
+
+/// \brief Convenience helper for 64-bit sum.
+inline std::pair<uint64_t, int16_t> getSum64(uint64_t LDigits, int16_t LScale,
+ uint64_t RDigits, int16_t RScale) {
+ return getSum(LDigits, LScale, RDigits, RScale);
+}
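[Editor's note] The overflow path of getSum, exercised at the 32-bit boundary:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void sumExample() {
      // UINT32_MAX + 1 overflows the digits, so the sum is renormalized to
      // the high bit with the scale bumped: 2^31 * 2^1 == 2^32, exactly.
      auto S = getSum32(UINT32_MAX, 0, 1, 0); // (1u << 31, 1)
      (void)S;
    }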
+
+/// \brief Get the difference of two scaled numbers.
+///
+/// Get LHS minus RHS with as much precision as possible.
+///
+/// Returns \c (0, 0) if the RHS is larger than the LHS.
+template <class DigitsT>
+std::pair<DigitsT, int16_t> getDifference(DigitsT LDigits, int16_t LScale,
+ DigitsT RDigits, int16_t RScale) {
+ static_assert(!std::numeric_limits<DigitsT>::is_signed, "expected unsigned");
+
+ // Normalize digits to match scales.
+ const DigitsT SavedRDigits = RDigits;
+ const int16_t SavedRScale = RScale;
+ matchScales(LDigits, LScale, RDigits, RScale);
+
+ // Compute difference.
+ if (LDigits <= RDigits)
+ return std::make_pair(0, 0);
+ if (RDigits || !SavedRDigits)
+ return std::make_pair(LDigits - RDigits, LScale);
+
+ // Check if RDigits just barely lost its last bit. E.g., for 32-bit:
+ //
+ // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
+ const auto RLgFloor = getLgFloor(SavedRDigits, SavedRScale);
+ if (!compare(LDigits, LScale, DigitsT(1), RLgFloor + getWidth<DigitsT>()))
+ return std::make_pair(std::numeric_limits<DigitsT>::max(), RLgFloor);
+
+ return std::make_pair(LDigits, LScale);
+}
+
+/// \brief Convenience helper for 32-bit difference.
+inline std::pair<uint32_t, int16_t> getDifference32(uint32_t LDigits,
+ int16_t LScale,
+ uint32_t RDigits,
+ int16_t RScale) {
+ return getDifference(LDigits, LScale, RDigits, RScale);
+}
+
+/// \brief Convenience helper for 64-bit difference.
+inline std::pair<uint64_t, int16_t> getDifference64(uint64_t LDigits,
+ int16_t LScale,
+ uint64_t RDigits,
+ int16_t RScale) {
+ return getDifference(LDigits, LScale, RDigits, RScale);
+}
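[Editor's note] Two edge cases of getDifference: equal operands in different representations, and saturation at zero:

    #include "llvm/Support/ScaledNumber.h"
    using namespace llvm::ScaledNumbers;

    void differenceExamples() {
      auto Z = getDifference32(1, 3, 8, 0); // (0, 0): 1*2^3 == 8*2^0
      auto N = getDifference32(1, 0, 2, 0); // (0, 0): RHS exceeds LHS
      (void)Z; (void)N;
    }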
+
+} // end namespace ScaledNumbers
+} // end namespace llvm
+
+namespace llvm {
+
+class raw_ostream;
+class ScaledNumberBase {
+public:
+ static const int DefaultPrecision = 10;
+
+ static void dump(uint64_t D, int16_t E, int Width);
+ static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
+ unsigned Precision);
+ static std::string toString(uint64_t D, int16_t E, int Width,
+ unsigned Precision);
+ static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
+ static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
+ static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
+
+ static std::pair<uint64_t, bool> splitSigned(int64_t N) {
+ if (N >= 0)
+ return std::make_pair(N, false);
+ uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
+ return std::make_pair(Unsigned, true);
+ }
+ static int64_t joinSigned(uint64_t U, bool IsNeg) {
+ if (U > uint64_t(INT64_MAX))
+ return IsNeg ? INT64_MIN : INT64_MAX;
+ return IsNeg ? -int64_t(U) : int64_t(U);
+ }
+};
+
+/// \brief Simple representation of a scaled number.
+///
+/// ScaledNumber is a number represented by digits and a scale. It uses simple
+/// saturation arithmetic and every operation is well-defined for every value.
+/// It's somewhat similar in behaviour to a soft-float, but is *not* a
+/// replacement for one. If you're doing numerics, look at \a APFloat instead.
+/// Nevertheless, we've found these semantics useful for modelling certain cost
+/// metrics.
+///
+/// The number is split into a signed scale and unsigned digits. The number
+/// represented is \c getDigits()*2^getScale(). In this way, the digits are
+/// much like the mantissa in the x87 long double, but there is no canonical
+/// form so the same number can be represented by many bit representations.
+///
+/// ScaledNumber is templated on the underlying integer type for digits, which
+/// is expected to be unsigned.
+///
+/// Unlike APFloat, ScaledNumber does not model architecture floating point
+/// behaviour -- while this might make it a little faster and easier to reason
+/// about, it certainly makes it more dangerous for general numerics.
+///
+/// ScaledNumber is totally ordered. However, there is no canonical form, so
+/// there are multiple representations of most scalars. E.g.:
+///
+/// ScaledNumber(8u, 0) == ScaledNumber(4u, 1)
+/// ScaledNumber(4u, 1) == ScaledNumber(2u, 2)
+/// ScaledNumber(2u, 2) == ScaledNumber(1u, 3)
+///
+/// ScaledNumber implements most arithmetic operations. Precision is kept
+/// where possible. Uses simple saturation arithmetic, so that operations
+/// saturate to 0.0 or getLargest() rather than under- or overflowing. It has
+/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
+/// Any other division by 0.0 is defined to be getLargest().
+///
+/// As a convenience for modifying the exponent, left and right shifting are
+/// both implemented, and both interpret negative shifts as positive shifts in
+/// the opposite direction.
+///
+/// Scales are limited to the range accepted by x87 long double. This makes
+/// it trivial to add functionality to convert to APFloat (this is already
+/// relied on for the implementation of printing).
+///
+/// Possible (and conflicting) future directions:
+///
+/// 1. Turn this into a wrapper around \a APFloat.
+/// 2. Share the algorithm implementations with \a APFloat.
+/// 3. Allow \a ScaledNumber to represent a signed number.
+template <class DigitsT> class ScaledNumber : ScaledNumberBase {
+public:
+ static_assert(!std::numeric_limits<DigitsT>::is_signed,
+ "only unsigned floats supported");
+
+ typedef DigitsT DigitsType;
+
+private:
+ typedef std::numeric_limits<DigitsType> DigitsLimits;
+
+ static const int Width = sizeof(DigitsType) * 8;
+ static_assert(Width <= 64, "invalid integer width for digits");
+
+private:
+ DigitsType Digits;
+ int16_t Scale;
+
+public:
+ ScaledNumber() : Digits(0), Scale(0) {}
+
+ ScaledNumber(DigitsType Digits, int16_t Scale)
+ : Digits(Digits), Scale(Scale) {}
+
+private:
+ ScaledNumber(const std::pair<uint64_t, int16_t> &X)
+ : Digits(X.first), Scale(X.second) {}
+
+public:
+ static ScaledNumber getZero() { return ScaledNumber(0, 0); }
+ static ScaledNumber getOne() { return ScaledNumber(1, 0); }
+ static ScaledNumber getLargest() {
+ return ScaledNumber(DigitsLimits::max(), ScaledNumbers::MaxScale);
+ }
+ static ScaledNumber get(uint64_t N) { return adjustToWidth(N, 0); }
+ static ScaledNumber getInverse(uint64_t N) {
+ return get(N).invert();
+ }
+ static ScaledNumber getFraction(DigitsType N, DigitsType D) {
+ return getQuotient(N, D);
+ }
+
+ int16_t getScale() const { return Scale; }
+ DigitsType getDigits() const { return Digits; }
+
+ /// \brief Convert to the given integer type.
+ ///
+ /// Convert to \c IntT using simple saturating arithmetic, truncating if
+ /// necessary.
+ template <class IntT> IntT toInt() const;
+
+ bool isZero() const { return !Digits; }
+ bool isLargest() const { return *this == getLargest(); }
+ bool isOne() const {
+ if (Scale > 0 || Scale <= -Width)
+ return false;
+ return Digits == DigitsType(1) << -Scale;
+ }
+
+ /// \brief The log base 2, rounded.
+ ///
+ /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
+ int32_t lg() const { return ScaledNumbers::getLg(Digits, Scale); }
+
+ /// \brief The log base 2, rounded towards INT32_MIN.
+ ///
+ /// Get the lg floor. lg 0 is defined to be INT32_MIN.
+ int32_t lgFloor() const { return ScaledNumbers::getLgFloor(Digits, Scale); }
+
+ /// \brief The log base 2, rounded towards INT32_MAX.
+ ///
+ /// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
+ int32_t lgCeiling() const {
+ return ScaledNumbers::getLgCeiling(Digits, Scale);
+ }
+
+ bool operator==(const ScaledNumber &X) const { return compare(X) == 0; }
+ bool operator<(const ScaledNumber &X) const { return compare(X) < 0; }
+ bool operator!=(const ScaledNumber &X) const { return compare(X) != 0; }
+ bool operator>(const ScaledNumber &X) const { return compare(X) > 0; }
+ bool operator<=(const ScaledNumber &X) const { return compare(X) <= 0; }
+ bool operator>=(const ScaledNumber &X) const { return compare(X) >= 0; }
+
+ bool operator!() const { return isZero(); }
+
+ /// \brief Convert to a decimal representation in a string.
+ ///
+ /// Convert to a string. Uses scientific notation for very large/small
+ /// numbers. Scientific notation is used roughly for numbers outside of the
+ /// range 2^-64 through 2^64.
+ ///
+ /// \c Precision indicates the number of decimal digits of precision to use;
+ /// 0 requests the maximum available.
+ ///
+ /// As a special case to make debugging easier, if the number is small enough
+ /// to convert without scientific notation and has more than \c Precision
+ /// digits before the decimal place, it's printed accurately to the first
+ /// digit past zero. E.g., assuming 10 digits of precision:
+ ///
+ /// 98765432198.7654... => 98765432198.8
+ /// 8765432198.7654... => 8765432198.8
+ /// 765432198.7654... => 765432198.8
+ /// 65432198.7654... => 65432198.77
+ /// 5432198.7654... => 5432198.765
+ std::string toString(unsigned Precision = DefaultPrecision) {
+ return ScaledNumberBase::toString(Digits, Scale, Width, Precision);
+ }
+
+ /// \brief Print a decimal representation.
+ ///
+ /// Print a string. See toString for documentation.
+ raw_ostream &print(raw_ostream &OS,
+ unsigned Precision = DefaultPrecision) const {
+ return ScaledNumberBase::print(OS, Digits, Scale, Width, Precision);
+ }
+ void dump() const { return ScaledNumberBase::dump(Digits, Scale, Width); }
+
+ ScaledNumber &operator+=(const ScaledNumber &X) {
+ std::tie(Digits, Scale) =
+ ScaledNumbers::getSum(Digits, Scale, X.Digits, X.Scale);
+ // Check for exponent past MaxScale.
+ if (Scale > ScaledNumbers::MaxScale)
+ *this = getLargest();
+ return *this;
+ }
+ ScaledNumber &operator-=(const ScaledNumber &X) {
+ std::tie(Digits, Scale) =
+ ScaledNumbers::getDifference(Digits, Scale, X.Digits, X.Scale);
+ return *this;
+ }
+ ScaledNumber &operator*=(const ScaledNumber &X);
+ ScaledNumber &operator/=(const ScaledNumber &X);
+ ScaledNumber &operator<<=(int16_t Shift) {
+ shiftLeft(Shift);
+ return *this;
+ }
+ ScaledNumber &operator>>=(int16_t Shift) {
+ shiftRight(Shift);
+ return *this;
+ }
+
+private:
+ void shiftLeft(int32_t Shift);
+ void shiftRight(int32_t Shift);
+
+ /// \brief Adjust two floats to have matching exponents.
+ ///
+ /// Adjust \c this and \c X to have matching exponents. Returns the new \c X
+ /// by value. Does nothing if \a isZero() for either.
+ ///
+ /// The value that compares smaller will lose precision, and possibly become
+ /// \a isZero().
+ ScaledNumber matchScales(ScaledNumber X) {
+ ScaledNumbers::matchScales(Digits, Scale, X.Digits, X.Scale);
+ return X;
+ }
+
+public:
+ /// \brief Scale a large number accurately.
+ ///
+ /// Scale N (multiply it by this). Uses full precision multiplication, even
+ /// if Width is smaller than 64, so information is not lost.
+ uint64_t scale(uint64_t N) const;
+ uint64_t scaleByInverse(uint64_t N) const {
+ // TODO: implement directly, rather than relying on inverse. Inverse is
+ // expensive.
+ return inverse().scale(N);
+ }
+ int64_t scale(int64_t N) const {
+ std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+ return joinSigned(scale(Unsigned.first), Unsigned.second);
+ }
+ int64_t scaleByInverse(int64_t N) const {
+ std::pair<uint64_t, bool> Unsigned = splitSigned(N);
+ return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
+ }
+
+ int compare(const ScaledNumber &X) const {
+ return ScaledNumbers::compare(Digits, Scale, X.Digits, X.Scale);
+ }
+ int compareTo(uint64_t N) const {
+ ScaledNumber Scaled = get(N);
+ int Compare = compare(Scaled);
+ if (Width == 64 || Compare != 0)
+ return Compare;
+
+ // Check for precision loss. We know *this == RoundTrip.
+ uint64_t RoundTrip = Scaled.template toInt<uint64_t>();
+ return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
+ }
+ int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
+
+ ScaledNumber &invert() { return *this = ScaledNumber::get(1) / *this; }
+ ScaledNumber inverse() const { return ScaledNumber(*this).invert(); }
+
+private:
+ static ScaledNumber getProduct(DigitsType LHS, DigitsType RHS) {
+ return ScaledNumbers::getProduct(LHS, RHS);
+ }
+ static ScaledNumber getQuotient(DigitsType Dividend, DigitsType Divisor) {
+ return ScaledNumbers::getQuotient(Dividend, Divisor);
+ }
+
+ static int countLeadingZerosWidth(DigitsType Digits) {
+ if (Width == 64)
+ return countLeadingZeros64(Digits);
+ if (Width == 32)
+ return countLeadingZeros32(Digits);
+ return countLeadingZeros32(Digits) + Width - 32;
+ }
+
+ /// \brief Adjust a number to width, rounding up if necessary.
+ ///
+ /// Should only be called for \c Shift close to zero.
+ ///
+ /// \pre Shift >= MinScale && Shift + 64 <= MaxScale.
+ static ScaledNumber adjustToWidth(uint64_t N, int32_t Shift) {
+ assert(Shift >= ScaledNumbers::MinScale && "Shift should be close to 0");
+ assert(Shift <= ScaledNumbers::MaxScale - 64 &&
+ "Shift should be close to 0");
+ auto Adjusted = ScaledNumbers::getAdjusted<DigitsT>(N, Shift);
+ return Adjusted;
+ }
+
+ static ScaledNumber getRounded(ScaledNumber P, bool Round) {
+ // Saturate.
+ if (P.isLargest())
+ return P;
+
+ return ScaledNumbers::getRounded(P.Digits, P.Scale, Round);
+ }
+};
+
+#define SCALED_NUMBER_BOP(op, base) \
+ template <class DigitsT> \
+ ScaledNumber<DigitsT> operator op(const ScaledNumber<DigitsT> &L, \
+ const ScaledNumber<DigitsT> &R) { \
+ return ScaledNumber<DigitsT>(L) base R; \
+ }
+SCALED_NUMBER_BOP(+, += )
+SCALED_NUMBER_BOP(-, -= )
+SCALED_NUMBER_BOP(*, *= )
+SCALED_NUMBER_BOP(/, /= )
+SCALED_NUMBER_BOP(<<, <<= )
+SCALED_NUMBER_BOP(>>, >>= )
+#undef SCALED_NUMBER_BOP
+
+template <class DigitsT>
+raw_ostream &operator<<(raw_ostream &OS, const ScaledNumber<DigitsT> &X) {
+ return X.print(OS, 10);
+}
+
+#define SCALED_NUMBER_COMPARE_TO_TYPE(op, T1, T2) \
+ template <class DigitsT> \
+ bool operator op(const ScaledNumber<DigitsT> &L, T1 R) { \
+ return L.compareTo(T2(R)) op 0; \
+ } \
+ template <class DigitsT> \
+ bool operator op(T1 L, const ScaledNumber<DigitsT> &R) { \
+ return 0 op R.compareTo(T2(L)); \
+ }
+#define SCALED_NUMBER_COMPARE_TO(op) \
+ SCALED_NUMBER_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \
+ SCALED_NUMBER_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \
+ SCALED_NUMBER_COMPARE_TO_TYPE(op, int64_t, int64_t) \
+ SCALED_NUMBER_COMPARE_TO_TYPE(op, int32_t, int64_t)
+SCALED_NUMBER_COMPARE_TO(< )
+SCALED_NUMBER_COMPARE_TO(> )
+SCALED_NUMBER_COMPARE_TO(== )
+SCALED_NUMBER_COMPARE_TO(!= )
+SCALED_NUMBER_COMPARE_TO(<= )
+SCALED_NUMBER_COMPARE_TO(>= )
+#undef SCALED_NUMBER_COMPARE_TO
+#undef SCALED_NUMBER_COMPARE_TO_TYPE
+
+template <class DigitsT>
+uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const {
+ if (Width == 64 || N <= DigitsLimits::max())
+ return (get(N) * *this).template toInt<uint64_t>();
+
+ // Defer to the 64-bit version.
+ return ScaledNumber<uint64_t>(Digits, Scale).scale(N);
+}
+
+template <class DigitsT>
+template <class IntT>
+IntT ScaledNumber<DigitsT>::toInt() const {
+ typedef std::numeric_limits<IntT> Limits;
+ if (*this < 1)
+ return 0;
+ if (*this >= Limits::max())
+ return Limits::max();
+
+ IntT N = Digits;
+ if (Scale > 0) {
+ assert(size_t(Scale) < sizeof(IntT) * 8);
+ return N << Scale;
+ }
+ if (Scale < 0) {
+ assert(size_t(-Scale) < sizeof(IntT) * 8);
+ return N >> -Scale;
+ }
+ return N;
+}
+
+template <class DigitsT>
+ScaledNumber<DigitsT> &ScaledNumber<DigitsT>::
+operator*=(const ScaledNumber &X) {
+ if (isZero())
+ return *this;
+ if (X.isZero())
+ return *this = X;
+
+ // Save the exponents.
+ int32_t Scales = int32_t(Scale) + int32_t(X.Scale);
+
+ // Get the raw product.
+ *this = getProduct(Digits, X.Digits);
+
+ // Combine with exponents.
+ return *this <<= Scales;
+}
+template <class DigitsT>
+ScaledNumber<DigitsT> &ScaledNumber<DigitsT>::
+operator/=(const ScaledNumber &X) {
+ if (isZero())
+ return *this;
+ if (X.isZero())
+ return *this = getLargest();
+
+ // Save the exponents.
+ int32_t Scales = int32_t(Scale) - int32_t(X.Scale);
+
+ // Get the raw quotient.
+ *this = getQuotient(Digits, X.Digits);
+
+ // Combine with exponents.
+ return *this <<= Scales;
+}
+template <class DigitsT> void ScaledNumber<DigitsT>::shiftLeft(int32_t Shift) {
+ if (!Shift || isZero())
+ return;
+ assert(Shift != INT32_MIN);
+ if (Shift < 0) {
+ shiftRight(-Shift);
+ return;
+ }
+
+ // Shift as much as we can in the exponent.
+ int32_t ScaleShift = std::min(Shift, ScaledNumbers::MaxScale - Scale);
+ Scale += ScaleShift;
+ if (ScaleShift == Shift)
+ return;
+
+ // Check this late, since it's rare.
+ if (isLargest())
+ return;
+
+ // Shift the digits themselves.
+ Shift -= ScaleShift;
+ if (Shift > countLeadingZerosWidth(Digits)) {
+ // Saturate.
+ *this = getLargest();
+ return;
+ }
+
+ Digits <<= Shift;
+ return;
+}
+
+template <class DigitsT> void ScaledNumber<DigitsT>::shiftRight(int32_t Shift) {
+ if (!Shift || isZero())
+ return;
+ assert(Shift != INT32_MIN);
+ if (Shift < 0) {
+ shiftLeft(-Shift);
+ return;
+ }
+
+ // Shift as much as we can in the exponent.
+ int32_t ScaleShift = std::min(Shift, Scale - ScaledNumbers::MinScale);
+ Scale -= ScaleShift;
+ if (ScaleShift == Shift)
+ return;
+
+ // Shift the digits themselves.
+ Shift -= ScaleShift;
+ if (Shift >= Width) {
+ // Saturate.
+ *this = getZero();
+ return;
+ }
+
+ Digits >>= Shift;
+ return;
+}
+
+template <typename T> struct isPodLike;
+template <typename T> struct isPodLike<ScaledNumber<T>> {
+ static const bool value = true;
+};
+
+} // end namespace llvm
+
+#endif
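
For illustration, a minimal usage sketch of the ScaledNumber API added above; it assumes the header lands at llvm/Support/ScaledNumber.h and only exercises members that appear in this file (get, the arithmetic operators, scale, and the stream operator):

    #include "llvm/Support/ScaledNumber.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>

    int main() {
      typedef llvm::ScaledNumber<uint32_t> Scaled32;
      Scaled32 A = Scaled32::get(100);  // digits plus a separate exponent
      Scaled32 B = Scaled32::get(3);

      Scaled32 Q = A / B;   // ~33.3; note X / 0 saturates to getLargest()
      Q *= B;               // multiplies digits, then folds the exponents

      uint64_t S = Q.scale(uint64_t(1000));  // full-precision integer scaling
      llvm::errs() << Q << " scaled by 1000: " << S << "\n";
      return 0;
    }
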
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 39f896d..4717553 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -30,7 +30,7 @@ namespace llvm {
class Twine;
class raw_ostream;
-/// SourceMgr - This owns the files read by a parser, handles include stacks,
+/// This owns the files read by a parser, handles include stacks,
/// and handles diagnostic wrangling.
class SourceMgr {
public:
@@ -40,34 +40,34 @@ public:
DK_Note
};
- /// DiagHandlerTy - Clients that want to handle their own diagnostics in a
- /// custom way can register a function pointer+context as a diagnostic
- /// handler. It gets called each time PrintMessage is invoked.
+ /// Clients that want to handle their own diagnostics in a custom way can
+ /// register a function pointer+context as a diagnostic handler.
+ /// It gets called each time PrintMessage is invoked.
typedef void (*DiagHandlerTy)(const SMDiagnostic &, void *Context);
private:
struct SrcBuffer {
- /// Buffer - The memory buffer for the file.
+ /// The memory buffer for the file.
MemoryBuffer *Buffer;
- /// IncludeLoc - This is the location of the parent include, or null if at
- /// the top level.
+ /// This is the location of the parent include, or null if at the top level.
SMLoc IncludeLoc;
};
- /// Buffers - This is all of the buffers that we are reading from.
+ /// This is all of the buffers that we are reading from.
std::vector<SrcBuffer> Buffers;
- // IncludeDirectories - This is the list of directories we should search for
- // include files in.
+ // This is the list of directories we should search for include files in.
std::vector<std::string> IncludeDirectories;
- /// LineNoCache - This is a cache for line number queries, its implementation
- /// is really private to SourceMgr.cpp.
+ /// This is a cache for line number queries, its implementation is really
+ /// private to SourceMgr.cpp.
mutable void *LineNoCache;
DiagHandlerTy DiagHandler;
void *DiagContext;
+ bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
+
SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
public:
@@ -79,8 +79,8 @@ public:
IncludeDirectories = Dirs;
}
- /// setDiagHandler - Specify a diagnostic handler to be invoked every time
- /// PrintMessage is called. Ctx is passed into the handler when it is invoked.
+ /// Specify a diagnostic handler to be invoked every time PrintMessage is
+ /// called. \p Ctx is passed into the handler when it is invoked.
void setDiagHandler(DiagHandlerTy DH, void *Ctx = nullptr) {
DiagHandler = DH;
DiagContext = Ctx;
@@ -90,60 +90,67 @@ public:
void *getDiagContext() const { return DiagContext; }
const SrcBuffer &getBufferInfo(unsigned i) const {
- assert(i < Buffers.size() && "Invalid Buffer ID!");
- return Buffers[i];
+ assert(isValidBufferID(i));
+ return Buffers[i - 1];
}
const MemoryBuffer *getMemoryBuffer(unsigned i) const {
- assert(i < Buffers.size() && "Invalid Buffer ID!");
- return Buffers[i].Buffer;
+ assert(isValidBufferID(i));
+ return Buffers[i - 1].Buffer;
}
- size_t getNumBuffers() const {
+ unsigned getNumBuffers() const {
return Buffers.size();
}
+ unsigned getMainFileID() const {
+ assert(getNumBuffers());
+ return 1;
+ }
+
SMLoc getParentIncludeLoc(unsigned i) const {
- assert(i < Buffers.size() && "Invalid Buffer ID!");
- return Buffers[i].IncludeLoc;
+ assert(isValidBufferID(i));
+ return Buffers[i - 1].IncludeLoc;
}
- /// AddNewSourceBuffer - Add a new source buffer to this source manager. This
- /// takes ownership of the memory buffer.
- size_t AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) {
+ /// Add a new source buffer to this source manager. This takes ownership of
+ /// the memory buffer.
+ unsigned AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) {
SrcBuffer NB;
NB.Buffer = F;
NB.IncludeLoc = IncludeLoc;
Buffers.push_back(NB);
- return Buffers.size() - 1;
+ return Buffers.size();
}
- /// AddIncludeFile - Search for a file with the specified name in the current
- /// directory or in one of the IncludeDirs. If no file is found, this returns
- /// ~0, otherwise it returns the buffer ID of the stacked file.
- /// The full path to the included file can be found in IncludedFile.
- size_t AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
- std::string &IncludedFile);
+ /// Search for a file with the specified name in the current directory or in
+ /// one of the IncludeDirs.
+ ///
+ /// If no file is found, this returns 0, otherwise it returns the buffer ID
+ /// of the stacked file. The full path to the included file can be found in
+ /// \p IncludedFile.
+ unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
+ std::string &IncludedFile);
- /// FindBufferContainingLoc - Return the ID of the buffer containing the
- /// specified location, returning -1 if not found.
- int FindBufferContainingLoc(SMLoc Loc) const;
+ /// Return the ID of the buffer containing the specified location.
+ ///
+ /// 0 is returned if the buffer is not found.
+ unsigned FindBufferContainingLoc(SMLoc Loc) const;
- /// FindLineNumber - Find the line number for the specified location in the
- /// specified file. This is not a fast method.
- unsigned FindLineNumber(SMLoc Loc, int BufferID = -1) const {
+ /// Find the line number for the specified location in the specified file.
+ /// This is not a fast method.
+ unsigned FindLineNumber(SMLoc Loc, unsigned BufferID = 0) const {
return getLineAndColumn(Loc, BufferID).first;
}
- /// getLineAndColumn - Find the line and column number for the specified
- /// location in the specified file. This is not a fast method.
- std::pair<unsigned, unsigned>
- getLineAndColumn(SMLoc Loc, int BufferID = -1) const;
+ /// Find the line and column number for the specified location in the
+ /// specified file. This is not a fast method.
+ std::pair<unsigned, unsigned> getLineAndColumn(SMLoc Loc,
+ unsigned BufferID = 0) const;
- /// PrintMessage - Emit a message about the specified location with the
- /// specified string.
+ /// Emit a message about the specified location with the specified string.
///
- /// @param ShowColors - Display colored messages if output is a terminal and
+ /// \param ShowColors Display colored messages if output is a terminal and
/// the default error handler is used.
void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind,
const Twine &Msg,
@@ -157,21 +164,28 @@ public:
ArrayRef<SMFixIt> FixIts = None,
bool ShowColors = true) const;
- /// GetMessage - Return an SMDiagnostic at the specified location with the
- /// specified string.
+ /// Emits a manually-constructed diagnostic to the given output stream.
+ ///
+ /// \param ShowColors Display colored messages if output is a terminal and
+ /// the default error handler is used.
+ void PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
+ bool ShowColors = true) const;
+
+ /// Return an SMDiagnostic at the specified location with the specified
+ /// string.
///
- /// @param Msg If non-null, the kind of message (e.g., "error") which is
+ /// \param Msg If non-null, the kind of message (e.g., "error") which is
/// prefixed to the message.
SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
ArrayRef<SMRange> Ranges = None,
ArrayRef<SMFixIt> FixIts = None) const;
- /// PrintIncludeStack - Prints the names of included files and the line of the
- /// file they were included from. A diagnostic handler can use this before
- /// printing its custom formatted message.
+ /// Prints the names of included files and the line of the file they were
+ /// included from. A diagnostic handler can use this before printing its
+ /// custom formatted message.
///
- /// @param IncludeLoc - The line of the include.
- /// @param OS the raw_ostream to print on.
+ /// \param IncludeLoc The location of the include.
+ /// \param OS the raw_ostream to print on.
void PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const;
};
@@ -208,8 +222,8 @@ public:
};
-/// SMDiagnostic - Instances of this class encapsulate one diagnostic report,
-/// allowing printing to a raw_ostream as a caret diagnostic.
+/// Instances of this class encapsulate one diagnostic report, allowing
+/// printing to a raw_ostream as a caret diagnostic.
class SMDiagnostic {
const SourceMgr *SM;
SMLoc Loc;
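
The net effect of the hunks above is that buffer IDs become 1-based unsigned values, with 0 reserved for "not found". A hedged sketch of client code after the change (the demo function, buffer contents, and name are illustrative):

    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/SourceMgr.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    void demo() {
      SourceMgr SM;
      // AddNewSourceBuffer takes ownership; the first buffer's ID is now 1,
      // which is also what getMainFileID() returns.
      unsigned ID = SM.AddNewSourceBuffer(
          MemoryBuffer::getMemBuffer("line one\n", "demo.src"), SMLoc());

      SMLoc Loc =
          SMLoc::getFromPointer(SM.getMemoryBuffer(ID)->getBufferStart());
      // 0 now signals "no buffer contains Loc" (previously -1 from the
      // int-returning API).
      if (SM.FindBufferContainingLoc(Loc) != 0)
        SM.PrintMessage(errs(), Loc, SourceMgr::DK_Note, "inside the main file");
    }
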
diff --git a/include/llvm/Support/SpecialCaseList.h b/include/llvm/Support/SpecialCaseList.h
new file mode 100644
index 0000000..098b9c7
--- /dev/null
+++ b/include/llvm/Support/SpecialCaseList.h
@@ -0,0 +1,96 @@
+//===-- SpecialCaseList.h - special case list for sanitizers ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class used to parse user-provided text files with
+// "special case lists" for code sanitizers. Such files are used to
+// define an "ABI list" for DataFlowSanitizer and blacklists for other
+// sanitizers like AddressSanitizer or UndefinedBehaviorSanitizer.
+//
+// Empty lines and lines starting with "#" are ignored. All remaining lines
+// should have the form:
+// section:wildcard_expression[=category]
+// If category is not specified, it is assumed to be the empty string.
+// Definitions of "section" and "category" are sanitizer-specific. For example,
+// sanitizer blacklists support sections "src", "fun" and "global".
+// Wildcard expressions define, respectively, source files, functions or
+// globals which shouldn't be instrumented.
+// Examples of categories:
+// "functional": used in DFSan to list functions with pure functional
+// semantics.
+// "init": used in ASan blacklist to disable initialization-order bugs
+// detection for certain globals or source files.
+// Full special case list file example:
+// ---
+// # Blacklisted items:
+// fun:*_ZN4base6subtle*
+// global:*global_with_bad_access_or_initialization*
+// global:*global_with_initialization_issues*=init
+// type:*Namespace::ClassName*=init
+// src:file_with_tricky_code.cc
+// src:ignore-global-initializers-issues.cc=init
+//
+// # Functions with pure functional semantics:
+// fun:cos=functional
+// fun:sin=functional
+// ---
+// Note that the wildcard is in fact an llvm::Regex, but * is automatically
+// replaced with .*
+// This is similar to the "ignore" feature of ThreadSanitizer.
+// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_SPECIALCASELIST_H
+#define LLVM_SUPPORT_SPECIALCASELIST_H
+
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+class MemoryBuffer;
+class Regex;
+class StringRef;
+
+class SpecialCaseList {
+ public:
+ /// Parses the special case list from a file. If Path is empty, returns
+ /// an empty special case list. On failure, returns 0 and writes an error
+  /// message to \p Error.
+ static SpecialCaseList *create(const StringRef Path, std::string &Error);
+ /// Parses the special case list from a memory buffer. On failure, returns
+  /// 0 and writes an error message to \p Error.
+ static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error);
+ /// Parses the special case list from a file. On failure, reports a fatal
+ /// error.
+ static SpecialCaseList *createOrDie(const StringRef Path);
+
+ ~SpecialCaseList();
+
+  /// Returns true if the special case list contains a line
+ /// \code
+ /// @Section:<E>=@Category
+ /// \endcode
+ /// and @Query satisfies a wildcard expression <E>.
+ bool inSection(const StringRef Section, const StringRef Query,
+ const StringRef Category = StringRef()) const;
+
+ private:
+ SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+ SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+
+ struct Entry;
+ StringMap<StringMap<Entry> > Entries;
+
+ SpecialCaseList();
+  /// Parses entries from a memory buffer into this just-constructed list.
+ bool parse(const MemoryBuffer *MB, std::string &Error);
+};
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_SPECIALCASELIST_H
+
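A hedged sketch of driving the new interface; the list text follows the format documented in the header comment above, and the function name is illustrative:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/SpecialCaseList.h"
    #include <memory>
    #include <string>

    using namespace llvm;

    bool isBlacklistedFunction(StringRef Name) {
      const char *List = "# demo blacklist\n"
                         "fun:*_ZN4base6subtle*\n"
                         "src:file_with_tricky_code.cc\n";
      std::string Error;
      std::unique_ptr<MemoryBuffer> MB(MemoryBuffer::getMemBuffer(List));
      std::unique_ptr<SpecialCaseList> SCL(
          SpecialCaseList::create(MB.get(), Error));
      if (!SCL)  // create() returns 0 and fills Error on a malformed list
        return false;
      return SCL->inSection("fun", Name);
    }
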
diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
index 9c9e55c..6e71ad4 100644
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ b/include/llvm/Support/StreamableMemoryObject.h
@@ -13,6 +13,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryObject.h"
#include <cassert>
#include <memory>
@@ -115,7 +116,7 @@ public:
// requiring that the bitcode size be known, or otherwise ensuring that
// the memory doesn't go away/get reallocated, but it's
// not currently necessary. Users that need the pointer don't stream.
- assert(0 && "getPointer in streaming memory objects not allowed");
+ llvm_unreachable("getPointer in streaming memory objects not allowed");
return nullptr;
}
bool isValidAddress(uint64_t address) const override;
@@ -154,8 +155,8 @@ private:
kChunkSize);
BytesRead += bytes;
if (bytes < kChunkSize) {
- if (ObjectSize && BytesRead < Pos)
- assert(0 && "Unexpected short read fetching bitcode");
+ assert((!ObjectSize || BytesRead >= Pos) &&
+ "Unexpected short read fetching bitcode");
if (BytesRead <= Pos) { // reached EOF/ran out of bytes
ObjectSize = BytesRead;
EOFReached = true;
diff --git a/include/llvm/Support/StringPool.h b/include/llvm/Support/StringPool.h
index 7e1394c..3e04653 100644
--- a/include/llvm/Support/StringPool.h
+++ b/include/llvm/Support/StringPool.h
@@ -29,6 +29,7 @@
#ifndef LLVM_SUPPORT_STRINGPOOL_H
#define LLVM_SUPPORT_STRINGPOOL_H
+#include "llvm/Support/Compiler.h"
#include "llvm/ADT/StringMap.h"
#include <cassert>
#include <new>
@@ -128,10 +129,10 @@ namespace llvm {
}
inline const char *operator*() const { return begin(); }
- inline operator bool() const { return S != nullptr; }
+ inline LLVM_EXPLICIT operator bool() const { return S != nullptr; }
- inline bool operator==(const PooledStringPtr &That) { return S == That.S; }
- inline bool operator!=(const PooledStringPtr &That) { return S != That.S; }
+ inline bool operator==(const PooledStringPtr &That) const { return S == That.S; }
+ inline bool operator!=(const PooledStringPtr &That) const { return S != That.S; }
};
} // End llvm namespace
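
Making the conversion explicit keeps boolean tests compiling while rejecting accidental integral conversions; a small sketch (the demo function is invented, and it assumes the usual StringPool::intern entry point):

    #include "llvm/Support/StringPool.h"

    using namespace llvm;

    const char *demo(StringPool &Pool) {
      PooledStringPtr P = Pool.intern("hello");
      if (P)         // fine: contextual conversion to bool
        return *P;   // dereference yields the pooled C string
      // int N = P;  // rejected once LLVM_EXPLICIT expands to 'explicit'
      return nullptr;
    }
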
diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h
index e65f9cc..340954f 100644
--- a/include/llvm/Support/SwapByteOrder.h
+++ b/include/llvm/Support/SwapByteOrder.h
@@ -68,33 +68,38 @@ inline uint64_t SwapByteOrder_64(uint64_t value) {
#endif
}
-inline unsigned char SwapByteOrder(unsigned char C) { return C; }
-inline signed char SwapByteOrder(signed char C) { return C; }
-inline char SwapByteOrder(char C) { return C; }
+inline unsigned char getSwappedBytes(unsigned char C) { return C; }
+inline signed char getSwappedBytes(signed char C) { return C; }
+inline char getSwappedBytes(char C) { return C; }
-inline unsigned short SwapByteOrder(unsigned short C) { return SwapByteOrder_16(C); }
-inline signed short SwapByteOrder( signed short C) { return SwapByteOrder_16(C); }
+inline unsigned short getSwappedBytes(unsigned short C) { return SwapByteOrder_16(C); }
+inline signed short getSwappedBytes( signed short C) { return SwapByteOrder_16(C); }
-inline unsigned int SwapByteOrder(unsigned int C) { return SwapByteOrder_32(C); }
-inline signed int SwapByteOrder( signed int C) { return SwapByteOrder_32(C); }
+inline unsigned int getSwappedBytes(unsigned int C) { return SwapByteOrder_32(C); }
+inline signed int getSwappedBytes( signed int C) { return SwapByteOrder_32(C); }
#if __LONG_MAX__ == __INT_MAX__
-inline unsigned long SwapByteOrder(unsigned long C) { return SwapByteOrder_32(C); }
-inline signed long SwapByteOrder( signed long C) { return SwapByteOrder_32(C); }
+inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_32(C); }
+inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_32(C); }
#elif __LONG_MAX__ == __LONG_LONG_MAX__
-inline unsigned long SwapByteOrder(unsigned long C) { return SwapByteOrder_64(C); }
-inline signed long SwapByteOrder( signed long C) { return SwapByteOrder_64(C); }
+inline unsigned long getSwappedBytes(unsigned long C) { return SwapByteOrder_64(C); }
+inline signed long getSwappedBytes( signed long C) { return SwapByteOrder_64(C); }
#else
#error "Unknown long size!"
#endif
-inline unsigned long long SwapByteOrder(unsigned long long C) {
+inline unsigned long long getSwappedBytes(unsigned long long C) {
return SwapByteOrder_64(C);
}
-inline signed long long SwapByteOrder(signed long long C) {
+inline signed long long getSwappedBytes(signed long long C) {
return SwapByteOrder_64(C);
}
+template<typename T>
+inline void swapByteOrder(T &Value) {
+ Value = getSwappedBytes(Value);
+}
+
} // end namespace sys
} // end namespace llvm
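
The rename to getSwappedBytes pairs with the new in-place swapByteOrder template; a minimal sketch:

    #include "llvm/Support/SwapByteOrder.h"
    #include <cstdint>

    uint32_t demo() {
      uint32_t V = 0x11223344;
      uint32_t W = llvm::sys::getSwappedBytes(V);  // 0x44332211, by value
      llvm::sys::swapByteOrder(V);                 // in place; V == W now
      return V ^ W;                                // 0
    }
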
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index fcdc604..5d5b86a 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -51,6 +51,7 @@ namespace llvm {
class raw_ostream;
class formatted_raw_ostream;
+ MCStreamer *createNullStreamer(MCContext &Ctx);
MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
bool isVerboseAsm, bool useDwarfDirectory,
MCInstPrinter *InstPrint, MCCodeEmitter *CE,
@@ -139,6 +140,7 @@ namespace llvm {
MCCodeEmitter *CE,
MCAsmBackend *TAB,
bool ShowInst);
+ typedef MCStreamer *(*NullStreamerCtorTy)(MCContext &Ctx);
typedef MCRelocationInfo *(*MCRelocationInfoCtorTy)(StringRef TT,
MCContext &Ctx);
typedef MCSymbolizer *(*MCSymbolizerCtorTy)(StringRef TT,
@@ -225,6 +227,10 @@ namespace llvm {
/// AsmStreamer, if registered (default = llvm::createAsmStreamer).
AsmStreamerCtorTy AsmStreamerCtorFn;
+ /// Construction function for this target's NullStreamer, if registered
+ /// (default = llvm::createNullStreamer).
+ NullStreamerCtorTy NullStreamerCtorFn;
+
/// MCRelocationInfoCtorFn - Construction function for this target's
/// MCRelocationInfo, if registered (default = llvm::createMCRelocationInfo)
MCRelocationInfoCtorTy MCRelocationInfoCtorFn;
@@ -235,8 +241,8 @@ namespace llvm {
public:
Target()
- : AsmStreamerCtorFn(nullptr), MCRelocationInfoCtorFn(nullptr),
- MCSymbolizerCtorFn(nullptr) {}
+ : AsmStreamerCtorFn(nullptr), NullStreamerCtorFn(nullptr),
+ MCRelocationInfoCtorFn(nullptr), MCSymbolizerCtorFn(nullptr) {}
/// @name Target Information
/// @{
@@ -447,6 +453,12 @@ namespace llvm {
InstPrint, CE, TAB, ShowInst);
}
+ MCStreamer *createNullStreamer(MCContext &Ctx) const {
+ if (NullStreamerCtorFn)
+ return NullStreamerCtorFn(Ctx);
+ return llvm::createNullStreamer(Ctx);
+ }
+
/// createMCRelocationInfo - Create a target specific MCRelocationInfo.
///
/// \param TT The target triple.
@@ -553,13 +565,6 @@ namespace llvm {
Triple &TheTriple,
std::string &Error);
- /// getClosestTargetForJIT - Pick the best target that is compatible with
- /// the current host. If no close target can be found, this returns null
- /// and sets the Error string to a reason.
- ///
- /// Maintained for compatibility through 2.6.
- static const Target *getClosestTargetForJIT(std::string &Error);
-
/// @}
/// @name Target Registration
/// @{
@@ -780,6 +785,10 @@ namespace llvm {
T.AsmStreamerCtorFn = Fn;
}
+ static void RegisterNullStreamer(Target &T, Target::NullStreamerCtorTy Fn) {
+ T.NullStreamerCtorFn = Fn;
+ }
+
/// RegisterMCRelocationInfo - Register an MCRelocationInfo
/// implementation for the given target.
///
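
A hedged sketch of how a target might opt in to the new hook; the MyTarget names are placeholders, and targets that register nothing automatically fall back to llvm::createNullStreamer:

    #include "llvm/Support/TargetRegistry.h"

    using namespace llvm;

    // Hypothetical factory matching Target::NullStreamerCtorTy.
    static MCStreamer *createMyTargetNullStreamer(MCContext &Ctx) {
      return createNullStreamer(Ctx);  // a real target would wrap this
    }

    void registerMyTargetStreamers(Target &TheMyTarget) {
      TargetRegistry::RegisterNullStreamer(TheMyTarget,
                                           createMyTargetNullStreamer);
    }
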
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index a7e8774..7e87584 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// TThis file defines llvm_start_multithreaded() and friends.
+// This file declares helper functions for running LLVM in a multi-threaded
+// environment.
//
//===----------------------------------------------------------------------===//
@@ -15,32 +16,10 @@
#define LLVM_SUPPORT_THREADING_H
namespace llvm {
- /// llvm_start_multithreaded - Allocate and initialize structures needed to
- /// make LLVM safe for multithreading. The return value indicates whether
- /// multithreaded initialization succeeded. LLVM will still be operational
- /// on "failed" return, and will still be safe for hosting threading
- /// applications in the JIT, but will not be safe for concurrent calls to the
- /// LLVM APIs.
- /// THIS MUST EXECUTE IN ISOLATION FROM ALL OTHER LLVM API CALLS.
- bool llvm_start_multithreaded();
-
- /// llvm_stop_multithreaded - Deallocate structures necessary to make LLVM
- /// safe for multithreading.
- /// THIS MUST EXECUTE IN ISOLATION FROM ALL OTHER LLVM API CALLS.
- void llvm_stop_multithreaded();
-
- /// llvm_is_multithreaded - Check whether LLVM is executing in thread-safe
- /// mode or not.
+ /// Returns true if LLVM is compiled with support for multi-threading, and
+ /// false otherwise.
bool llvm_is_multithreaded();
- /// acquire_global_lock - Acquire the global lock. This is a no-op if called
- /// before llvm_start_multithreaded().
- void llvm_acquire_global_lock();
-
- /// release_global_lock - Release the global lock. This is a no-op if called
- /// before llvm_start_multithreaded().
- void llvm_release_global_lock();
-
/// llvm_execute_on_thread - Execute the given \p UserFn on a separate
/// thread, passing it the provided \p UserData.
///
diff --git a/include/llvm/Support/WindowsError.h b/include/llvm/Support/WindowsError.h
new file mode 100644
index 0000000..0e909a0
--- /dev/null
+++ b/include/llvm/Support/WindowsError.h
@@ -0,0 +1,19 @@
+//===-- WindowsError.h - Support for mapping Windows errors to POSIX -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_WINDOWS_ERROR_H
+#define LLVM_SUPPORT_WINDOWS_ERROR_H
+
+#include <system_error>
+
+namespace llvm {
+std::error_code mapWindowsError(unsigned EV);
+}
+
+#endif
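
A hedged sketch of the intended call pattern; the wrapper name is invented, and on Windows the raw value would typically come from ::GetLastError():

    #include "llvm/Support/WindowsError.h"
    #include <system_error>

    std::error_code fromLastError(unsigned RawWinError) {
      // Maps a WinError.h value onto the portable std::errc space.
      return llvm::mapWindowsError(RawWinError);
    }
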
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 4ee05ed..a23faf6 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -24,7 +24,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
namespace yaml {
@@ -880,7 +880,7 @@ public:
~Input();
  // Check if there was a syntax or semantic error during parsing.
- llvm::error_code error();
+ std::error_code error();
private:
bool outputting() override;
@@ -982,13 +982,13 @@ public:
// These are only used by operator>>. They could be private
// if those templated things could be made friends.
bool setCurrentDocument();
- void nextDocument();
+ bool nextDocument();
private:
llvm::SourceMgr SrcMgr; // must be before Strm
std::unique_ptr<llvm::yaml::Stream> Strm;
std::unique_ptr<HNode> TopNode;
- llvm::error_code EC;
+ std::error_code EC;
llvm::BumpPtrAllocator StringAllocator;
llvm::yaml::document_iterator DocIterator;
std::vector<bool> BitValuesUsed;
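
With the switch to <system_error>, callers test Input::error() as a std::error_code; a minimal sketch (the function name is illustrative):

    #include "llvm/Support/YAMLTraits.h"

    bool parsedCleanly(llvm::yaml::Input &YIn) {
      // error() now returns std::error_code, which converts to true
      // only when parsing actually failed.
      return !YIn.error();
    }
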
diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h
deleted file mode 100644
index aa5e9f7..0000000
--- a/include/llvm/Support/system_error.h
+++ /dev/null
@@ -1,901 +0,0 @@
-//===---------------------------- system_error ------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This was lifted from libc++ and modified for C++03. This is called
-// system_error even though it does not define that class because that's what
-// it's called in C++0x. We don't define system_error because it is only used
-// for exception handling, which we don't use in LLVM.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_SYSTEM_ERROR_H
-#define LLVM_SUPPORT_SYSTEM_ERROR_H
-
-#include "llvm/Support/Compiler.h"
-
-/*
- system_error synopsis
-
-namespace std
-{
-
-class error_category
-{
-public:
- virtual ~error_category();
-
- error_category(const error_category&) = delete;
- error_category& operator=(const error_category&) = delete;
-
- virtual const char* name() const = 0;
- virtual error_condition default_error_condition(int ev) const;
- virtual bool equivalent(int code, const error_condition& condition) const;
- virtual bool equivalent(const error_code& code, int condition) const;
- virtual std::string message(int ev) const = 0;
-
- bool operator==(const error_category& rhs) const;
- bool operator!=(const error_category& rhs) const;
- bool operator<(const error_category& rhs) const;
-};
-
-const error_category& generic_category();
-const error_category& system_category();
-
-template <class T> struct is_error_code_enum
- : public std::false_type {};
-
-template <class T> struct is_error_condition_enum
- : public std::false_type {};
-
-class error_code
-{
-public:
- // constructors:
- error_code();
- error_code(int val, const error_category& cat);
- template <class ErrorCodeEnum>
- error_code(ErrorCodeEnum e);
-
- // modifiers:
- void assign(int val, const error_category& cat);
- template <class ErrorCodeEnum>
- error_code& operator=(ErrorCodeEnum e);
- void clear();
-
- // observers:
- int value() const;
- const error_category& category() const;
- error_condition default_error_condition() const;
- std::string message() const;
- explicit operator bool() const;
-};
-
-// non-member functions:
-bool operator<(const error_code& lhs, const error_code& rhs);
-template <class charT, class traits>
- basic_ostream<charT,traits>&
- operator<<(basic_ostream<charT,traits>& os, const error_code& ec);
-
-class error_condition
-{
-public:
- // constructors:
- error_condition();
- error_condition(int val, const error_category& cat);
- template <class ErrorConditionEnum>
- error_condition(ErrorConditionEnum e);
-
- // modifiers:
- void assign(int val, const error_category& cat);
- template <class ErrorConditionEnum>
- error_condition& operator=(ErrorConditionEnum e);
- void clear();
-
- // observers:
- int value() const;
- const error_category& category() const;
- std::string message() const;
- explicit operator bool() const;
-};
-
-bool operator<(const error_condition& lhs, const error_condition& rhs);
-
-class system_error
- : public runtime_error
-{
-public:
- system_error(error_code ec, const std::string& what_arg);
- system_error(error_code ec, const char* what_arg);
- system_error(error_code ec);
- system_error(int ev, const error_category& ecat, const std::string& what_arg);
- system_error(int ev, const error_category& ecat, const char* what_arg);
- system_error(int ev, const error_category& ecat);
-
- const error_code& code() const throw();
- const char* what() const throw();
-};
-
-enum class errc
-{
- address_family_not_supported, // EAFNOSUPPORT
- address_in_use, // EADDRINUSE
- address_not_available, // EADDRNOTAVAIL
- already_connected, // EISCONN
- argument_list_too_long, // E2BIG
- argument_out_of_domain, // EDOM
- bad_address, // EFAULT
- bad_file_descriptor, // EBADF
- bad_message, // EBADMSG
- broken_pipe, // EPIPE
- connection_aborted, // ECONNABORTED
- connection_already_in_progress, // EALREADY
- connection_refused, // ECONNREFUSED
- connection_reset, // ECONNRESET
- cross_device_link, // EXDEV
- destination_address_required, // EDESTADDRREQ
- device_or_resource_busy, // EBUSY
- directory_not_empty, // ENOTEMPTY
- executable_format_error, // ENOEXEC
- file_exists, // EEXIST
- file_too_large, // EFBIG
- filename_too_long, // ENAMETOOLONG
- function_not_supported, // ENOSYS
- host_unreachable, // EHOSTUNREACH
- identifier_removed, // EIDRM
- illegal_byte_sequence, // EILSEQ
- inappropriate_io_control_operation, // ENOTTY
- interrupted, // EINTR
- invalid_argument, // EINVAL
- invalid_seek, // ESPIPE
- io_error, // EIO
- is_a_directory, // EISDIR
- message_size, // EMSGSIZE
- network_down, // ENETDOWN
- network_reset, // ENETRESET
- network_unreachable, // ENETUNREACH
- no_buffer_space, // ENOBUFS
- no_child_process, // ECHILD
- no_link, // ENOLINK
- no_lock_available, // ENOLCK
- no_message_available, // ENODATA
- no_message, // ENOMSG
- no_protocol_option, // ENOPROTOOPT
- no_space_on_device, // ENOSPC
- no_stream_resources, // ENOSR
- no_such_device_or_address, // ENXIO
- no_such_device, // ENODEV
- no_such_file_or_directory, // ENOENT
- no_such_process, // ESRCH
- not_a_directory, // ENOTDIR
- not_a_socket, // ENOTSOCK
- not_a_stream, // ENOSTR
- not_connected, // ENOTCONN
- not_enough_memory, // ENOMEM
- not_supported, // ENOTSUP
- operation_canceled, // ECANCELED
- operation_in_progress, // EINPROGRESS
- operation_not_permitted, // EPERM
- operation_not_supported, // EOPNOTSUPP
- operation_would_block, // EWOULDBLOCK
- owner_dead, // EOWNERDEAD
- permission_denied, // EACCES
- protocol_error, // EPROTO
- protocol_not_supported, // EPROTONOSUPPORT
- read_only_file_system, // EROFS
- resource_deadlock_would_occur, // EDEADLK
- resource_unavailable_try_again, // EAGAIN
- result_out_of_range, // ERANGE
- state_not_recoverable, // ENOTRECOVERABLE
- stream_timeout, // ETIME
- text_file_busy, // ETXTBSY
- timed_out, // ETIMEDOUT
- too_many_files_open_in_system, // ENFILE
- too_many_files_open, // EMFILE
- too_many_links, // EMLINK
- too_many_symbolic_link_levels, // ELOOP
- value_too_large, // EOVERFLOW
- wrong_protocol_type // EPROTOTYPE
-};
-
-template <> struct is_error_condition_enum<errc> : std::true_type { }
-
-error_code make_error_code(errc e);
-error_condition make_error_condition(errc e);
-
-// Comparison operators:
-bool operator==(const error_code& lhs, const error_code& rhs);
-bool operator==(const error_code& lhs, const error_condition& rhs);
-bool operator==(const error_condition& lhs, const error_code& rhs);
-bool operator==(const error_condition& lhs, const error_condition& rhs);
-bool operator!=(const error_code& lhs, const error_code& rhs);
-bool operator!=(const error_code& lhs, const error_condition& rhs);
-bool operator!=(const error_condition& lhs, const error_code& rhs);
-bool operator!=(const error_condition& lhs, const error_condition& rhs);
-
-template <> struct hash<std::error_code>;
-
-} // std
-
-*/
-
-#include "llvm/Config/llvm-config.h"
-#include <cerrno>
-#include <string>
-
-// This must be here instead of a .inc file because it is used in the definition
-// of the enum values below.
-#ifdef LLVM_ON_WIN32
-
- // The following numbers were taken from VS2010.
-# ifndef EAFNOSUPPORT
-# define EAFNOSUPPORT 102
-# endif
-# ifndef EADDRINUSE
-# define EADDRINUSE 100
-# endif
-# ifndef EADDRNOTAVAIL
-# define EADDRNOTAVAIL 101
-# endif
-# ifndef EISCONN
-# define EISCONN 113
-# endif
-# ifndef E2BIG
-# define E2BIG 7
-# endif
-# ifndef EDOM
-# define EDOM 33
-# endif
-# ifndef EFAULT
-# define EFAULT 14
-# endif
-# ifndef EBADF
-# define EBADF 9
-# endif
-# ifndef EBADMSG
-# define EBADMSG 104
-# endif
-# ifndef EPIPE
-# define EPIPE 32
-# endif
-# ifndef ECONNABORTED
-# define ECONNABORTED 106
-# endif
-# ifndef EALREADY
-# define EALREADY 103
-# endif
-# ifndef ECONNREFUSED
-# define ECONNREFUSED 107
-# endif
-# ifndef ECONNRESET
-# define ECONNRESET 108
-# endif
-# ifndef EXDEV
-# define EXDEV 18
-# endif
-# ifndef EDESTADDRREQ
-# define EDESTADDRREQ 109
-# endif
-# ifndef EBUSY
-# define EBUSY 16
-# endif
-# ifndef ENOTEMPTY
-# define ENOTEMPTY 41
-# endif
-# ifndef ENOEXEC
-# define ENOEXEC 8
-# endif
-# ifndef EEXIST
-# define EEXIST 17
-# endif
-# ifndef EFBIG
-# define EFBIG 27
-# endif
-# ifndef ENAMETOOLONG
-# define ENAMETOOLONG 38
-# endif
-# ifndef ENOSYS
-# define ENOSYS 40
-# endif
-# ifndef EHOSTUNREACH
-# define EHOSTUNREACH 110
-# endif
-# ifndef EIDRM
-# define EIDRM 111
-# endif
-# ifndef EILSEQ
-# define EILSEQ 42
-# endif
-# ifndef ENOTTY
-# define ENOTTY 25
-# endif
-# ifndef EINTR
-# define EINTR 4
-# endif
-# ifndef EINVAL
-# define EINVAL 22
-# endif
-# ifndef ESPIPE
-# define ESPIPE 29
-# endif
-# ifndef EIO
-# define EIO 5
-# endif
-# ifndef EISDIR
-# define EISDIR 21
-# endif
-# ifndef EMSGSIZE
-# define EMSGSIZE 115
-# endif
-# ifndef ENETDOWN
-# define ENETDOWN 116
-# endif
-# ifndef ENETRESET
-# define ENETRESET 117
-# endif
-# ifndef ENETUNREACH
-# define ENETUNREACH 118
-# endif
-# ifndef ENOBUFS
-# define ENOBUFS 119
-# endif
-# ifndef ECHILD
-# define ECHILD 10
-# endif
-# ifndef ENOLINK
-# define ENOLINK 121
-# endif
-# ifndef ENOLCK
-# define ENOLCK 39
-# endif
-# ifndef ENODATA
-# define ENODATA 120
-# endif
-# ifndef ENOMSG
-# define ENOMSG 122
-# endif
-# ifndef ENOPROTOOPT
-# define ENOPROTOOPT 123
-# endif
-# ifndef ENOSPC
-# define ENOSPC 28
-# endif
-# ifndef ENOSR
-# define ENOSR 124
-# endif
-# ifndef ENXIO
-# define ENXIO 6
-# endif
-# ifndef ENODEV
-# define ENODEV 19
-# endif
-# ifndef ENOENT
-# define ENOENT 2
-# endif
-# ifndef ESRCH
-# define ESRCH 3
-# endif
-# ifndef ENOTDIR
-# define ENOTDIR 20
-# endif
-# ifndef ENOTSOCK
-# define ENOTSOCK 128
-# endif
-# ifndef ENOSTR
-# define ENOSTR 125
-# endif
-# ifndef ENOTCONN
-# define ENOTCONN 126
-# endif
-# ifndef ENOMEM
-# define ENOMEM 12
-# endif
-# ifndef ENOTSUP
-# define ENOTSUP 129
-# endif
-# ifndef ECANCELED
-# define ECANCELED 105
-# endif
-# ifndef EINPROGRESS
-# define EINPROGRESS 112
-# endif
-# ifndef EPERM
-# define EPERM 1
-# endif
-# ifndef EOPNOTSUPP
-# define EOPNOTSUPP 130
-# endif
-# ifndef EWOULDBLOCK
-# define EWOULDBLOCK 140
-# endif
-# ifndef EOWNERDEAD
-# define EOWNERDEAD 133
-# endif
-# ifndef EACCES
-# define EACCES 13
-# endif
-# ifndef EPROTO
-# define EPROTO 134
-# endif
-# ifndef EPROTONOSUPPORT
-# define EPROTONOSUPPORT 135
-# endif
-# ifndef EROFS
-# define EROFS 30
-# endif
-# ifndef EDEADLK
-# define EDEADLK 36
-# endif
-# ifndef EAGAIN
-# define EAGAIN 11
-# endif
-# ifndef ERANGE
-# define ERANGE 34
-# endif
-# ifndef ENOTRECOVERABLE
-# define ENOTRECOVERABLE 127
-# endif
-# ifndef ETIME
-# define ETIME 137
-# endif
-# ifndef ETXTBSY
-# define ETXTBSY 139
-# endif
-# ifndef ETIMEDOUT
-# define ETIMEDOUT 138
-# endif
-# ifndef ENFILE
-# define ENFILE 23
-# endif
-# ifndef EMFILE
-# define EMFILE 24
-# endif
-# ifndef EMLINK
-# define EMLINK 31
-# endif
-# ifndef ELOOP
-# define ELOOP 114
-# endif
-# ifndef EOVERFLOW
-# define EOVERFLOW 132
-# endif
-# ifndef EPROTOTYPE
-# define EPROTOTYPE 136
-# endif
-#endif
-
-namespace llvm {
-
-// is_error_code_enum
-
-template <class Tp> struct is_error_code_enum : public std::false_type {};
-
-// is_error_condition_enum
-
-template <class Tp> struct is_error_condition_enum : public std::false_type {};
-
-// Some error codes are not present on all platforms, so we provide equivalents
-// for them:
-
-//enum class errc
-struct errc {
-enum _ {
- success = 0,
- address_family_not_supported = EAFNOSUPPORT,
- address_in_use = EADDRINUSE,
- address_not_available = EADDRNOTAVAIL,
- already_connected = EISCONN,
- argument_list_too_long = E2BIG,
- argument_out_of_domain = EDOM,
- bad_address = EFAULT,
- bad_file_descriptor = EBADF,
-#ifdef EBADMSG
- bad_message = EBADMSG,
-#else
- bad_message = EINVAL,
-#endif
- broken_pipe = EPIPE,
- connection_aborted = ECONNABORTED,
- connection_already_in_progress = EALREADY,
- connection_refused = ECONNREFUSED,
- connection_reset = ECONNRESET,
- cross_device_link = EXDEV,
- destination_address_required = EDESTADDRREQ,
- device_or_resource_busy = EBUSY,
- directory_not_empty = ENOTEMPTY,
- executable_format_error = ENOEXEC,
- file_exists = EEXIST,
- file_too_large = EFBIG,
- filename_too_long = ENAMETOOLONG,
- function_not_supported = ENOSYS,
- host_unreachable = EHOSTUNREACH,
- identifier_removed = EIDRM,
- illegal_byte_sequence = EILSEQ,
- inappropriate_io_control_operation = ENOTTY,
- interrupted = EINTR,
- invalid_argument = EINVAL,
- invalid_seek = ESPIPE,
- io_error = EIO,
- is_a_directory = EISDIR,
- message_size = EMSGSIZE,
- network_down = ENETDOWN,
- network_reset = ENETRESET,
- network_unreachable = ENETUNREACH,
- no_buffer_space = ENOBUFS,
- no_child_process = ECHILD,
-#ifdef ENOLINK
- no_link = ENOLINK,
-#else
- no_link = EINVAL,
-#endif
- no_lock_available = ENOLCK,
-#ifdef ENODATA
- no_message_available = ENODATA,
-#else
- no_message_available = ENOMSG,
-#endif
- no_message = ENOMSG,
- no_protocol_option = ENOPROTOOPT,
- no_space_on_device = ENOSPC,
-#ifdef ENOSR
- no_stream_resources = ENOSR,
-#else
- no_stream_resources = ENOMEM,
-#endif
- no_such_device_or_address = ENXIO,
- no_such_device = ENODEV,
- no_such_file_or_directory = ENOENT,
- no_such_process = ESRCH,
- not_a_directory = ENOTDIR,
- not_a_socket = ENOTSOCK,
-#ifdef ENOSTR
- not_a_stream = ENOSTR,
-#else
- not_a_stream = EINVAL,
-#endif
- not_connected = ENOTCONN,
- not_enough_memory = ENOMEM,
- not_supported = ENOTSUP,
-#ifdef ECANCELED
- operation_canceled = ECANCELED,
-#else
- operation_canceled = EINVAL,
-#endif
- operation_in_progress = EINPROGRESS,
- operation_not_permitted = EPERM,
- operation_not_supported = EOPNOTSUPP,
- operation_would_block = EWOULDBLOCK,
-#ifdef EOWNERDEAD
- owner_dead = EOWNERDEAD,
-#else
- owner_dead = EINVAL,
-#endif
- permission_denied = EACCES,
-#ifdef EPROTO
- protocol_error = EPROTO,
-#else
- protocol_error = EINVAL,
-#endif
- protocol_not_supported = EPROTONOSUPPORT,
- read_only_file_system = EROFS,
- resource_deadlock_would_occur = EDEADLK,
- resource_unavailable_try_again = EAGAIN,
- result_out_of_range = ERANGE,
-#ifdef ENOTRECOVERABLE
- state_not_recoverable = ENOTRECOVERABLE,
-#else
- state_not_recoverable = EINVAL,
-#endif
-#ifdef ETIME
- stream_timeout = ETIME,
-#else
- stream_timeout = ETIMEDOUT,
-#endif
- text_file_busy = ETXTBSY,
- timed_out = ETIMEDOUT,
- too_many_files_open_in_system = ENFILE,
- too_many_files_open = EMFILE,
- too_many_links = EMLINK,
- too_many_symbolic_link_levels = ELOOP,
- value_too_large = EOVERFLOW,
- wrong_protocol_type = EPROTOTYPE
-};
-
- _ v_;
-
- errc(_ v) : v_(v) {}
- operator int() const {return v_;}
-};
-
-template <> struct is_error_condition_enum<errc> : std::true_type { };
-
-template <> struct is_error_condition_enum<errc::_> : std::true_type { };
-
-class error_condition;
-class error_code;
-
-// class error_category
-
-class _do_message;
-
-class error_category
-{
-public:
- virtual ~error_category();
-
- error_category();
-private:
- error_category(const error_category&) LLVM_DELETED_FUNCTION;
- error_category& operator=(const error_category&) LLVM_DELETED_FUNCTION;
-
-public:
- virtual const char* name() const = 0;
- virtual error_condition default_error_condition(int _ev) const;
- virtual bool equivalent(int _code, const error_condition& _condition) const;
- virtual bool equivalent(const error_code& _code, int _condition) const;
- virtual std::string message(int _ev) const = 0;
-
- bool operator==(const error_category& _rhs) const {return this == &_rhs;}
-
- bool operator!=(const error_category& _rhs) const {return !(*this == _rhs);}
-
- bool operator< (const error_category& _rhs) const {return this < &_rhs;}
-
- friend class _do_message;
-};
-
-class _do_message : public error_category
-{
-public:
- std::string message(int ev) const override;
-};
-
-const error_category& generic_category();
-const error_category& system_category();
-
-/// Get the error_category used for errno values from POSIX functions. This is
-/// the same as the system_category on POSIX systems, but is the same as the
-/// generic_category on Windows.
-const error_category& posix_category();
-
-class error_condition
-{
- int _val_;
- const error_category* _cat_;
-public:
- error_condition() : _val_(0), _cat_(&generic_category()) {}
-
- error_condition(int _val, const error_category& _cat)
- : _val_(_val), _cat_(&_cat) {}
-
- template <class E>
- error_condition(E _e, typename std::enable_if<
- is_error_condition_enum<E>::value
- >::type* = 0)
- {*this = make_error_condition(_e);}
-
- void assign(int _val, const error_category& _cat) {
- _val_ = _val;
- _cat_ = &_cat;
- }
-
- template <class E>
- typename std::enable_if<is_error_condition_enum<E>::value,
- error_condition &>::type
- operator=(E _e) {
- *this = make_error_condition(_e);
- return *this;
- }
-
- void clear() {
- _val_ = 0;
- _cat_ = &generic_category();
- }
-
- int value() const {return _val_;}
-
- const error_category& category() const {return *_cat_;}
- std::string message() const;
-
- typedef void (*unspecified_bool_type)();
- static void unspecified_bool_true() {}
-
- operator unspecified_bool_type() const { // true if error
- return _val_ == 0 ? nullptr : unspecified_bool_true;
- }
-};
-
-inline error_condition make_error_condition(errc _e) {
- return error_condition(static_cast<int>(_e), generic_category());
-}
-
-inline bool operator<(const error_condition& _x, const error_condition& _y) {
- return _x.category() < _y.category()
- || (_x.category() == _y.category() && _x.value() < _y.value());
-}
-
-// error_code
-
-class error_code {
- int _val_;
- const error_category* _cat_;
-public:
- error_code() : _val_(0), _cat_(&system_category()) {}
-
- static error_code success() {
- return error_code();
- }
-
- error_code(int _val, const error_category& _cat)
- : _val_(_val), _cat_(&_cat) {}
-
- template <class E>
- error_code(E _e, typename std::enable_if<
- is_error_code_enum<E>::value
- >::type* = 0) {
- *this = make_error_code(_e);
- }
-
- void assign(int _val, const error_category& _cat) {
- _val_ = _val;
- _cat_ = &_cat;
- }
-
- template <class E>
- typename std::enable_if<is_error_code_enum<E>::value, error_code &>::type
- operator=(E _e) {
- *this = make_error_code(_e);
- return *this;
- }
-
- void clear() {
- _val_ = 0;
- _cat_ = &system_category();
- }
-
- int value() const {return _val_;}
-
- const error_category& category() const {return *_cat_;}
-
- error_condition default_error_condition() const
- {return _cat_->default_error_condition(_val_);}
-
- std::string message() const;
-
- typedef void (*unspecified_bool_type)();
- static void unspecified_bool_true() {}
-
- operator unspecified_bool_type() const { // true if error
- return _val_ == 0 ? nullptr : unspecified_bool_true;
- }
-};
-
-inline error_code make_error_code(errc _e) {
- return error_code(static_cast<int>(_e), generic_category());
-}
-
-inline bool operator<(const error_code& _x, const error_code& _y) {
- return _x.category() < _y.category()
- || (_x.category() == _y.category() && _x.value() < _y.value());
-}
-
-inline bool operator==(const error_code& _x, const error_code& _y) {
- return _x.category() == _y.category() && _x.value() == _y.value();
-}
-
-inline bool operator==(const error_code& _x, const error_condition& _y) {
- return _x.category().equivalent(_x.value(), _y)
- || _y.category().equivalent(_x, _y.value());
-}
-
-inline bool operator==(const error_condition& _x, const error_code& _y) {
- return _y == _x;
-}
-
-inline bool operator==(const error_condition& _x, const error_condition& _y) {
- return _x.category() == _y.category() && _x.value() == _y.value();
-}
-
-inline bool operator!=(const error_code& _x, const error_code& _y) {
- return !(_x == _y);
-}
-
-inline bool operator!=(const error_code& _x, const error_condition& _y) {
- return !(_x == _y);
-}
-
-inline bool operator!=(const error_condition& _x, const error_code& _y) {
- return !(_x == _y);
-}
-
-inline bool operator!=(const error_condition& _x, const error_condition& _y) {
- return !(_x == _y);
-}
-
-// Windows errors.
-
-// To construct an error_code after an API error:
-//
-// error_code( ::GetLastError(), system_category() )
-struct windows_error {
-enum _ {
- success = 0,
- // These names and values are based on Windows WinError.h
- // This is not a complete list. Add to this list if you need to explicitly
- // check for it.
- invalid_function = 1, // ERROR_INVALID_FUNCTION,
- file_not_found = 2, // ERROR_FILE_NOT_FOUND,
- path_not_found = 3, // ERROR_PATH_NOT_FOUND,
- too_many_open_files = 4, // ERROR_TOO_MANY_OPEN_FILES,
- access_denied = 5, // ERROR_ACCESS_DENIED,
- invalid_handle = 6, // ERROR_INVALID_HANDLE,
- arena_trashed = 7, // ERROR_ARENA_TRASHED,
- not_enough_memory = 8, // ERROR_NOT_ENOUGH_MEMORY,
- invalid_block = 9, // ERROR_INVALID_BLOCK,
- bad_environment = 10, // ERROR_BAD_ENVIRONMENT,
- bad_format = 11, // ERROR_BAD_FORMAT,
- invalid_access = 12, // ERROR_INVALID_ACCESS,
- outofmemory = 14, // ERROR_OUTOFMEMORY,
- invalid_drive = 15, // ERROR_INVALID_DRIVE,
- current_directory = 16, // ERROR_CURRENT_DIRECTORY,
- not_same_device = 17, // ERROR_NOT_SAME_DEVICE,
- no_more_files = 18, // ERROR_NO_MORE_FILES,
- write_protect = 19, // ERROR_WRITE_PROTECT,
- bad_unit = 20, // ERROR_BAD_UNIT,
- not_ready = 21, // ERROR_NOT_READY,
- bad_command = 22, // ERROR_BAD_COMMAND,
- crc = 23, // ERROR_CRC,
- bad_length = 24, // ERROR_BAD_LENGTH,
- seek = 25, // ERROR_SEEK,
- not_dos_disk = 26, // ERROR_NOT_DOS_DISK,
- sector_not_found = 27, // ERROR_SECTOR_NOT_FOUND,
- out_of_paper = 28, // ERROR_OUT_OF_PAPER,
- write_fault = 29, // ERROR_WRITE_FAULT,
- read_fault = 30, // ERROR_READ_FAULT,
- gen_failure = 31, // ERROR_GEN_FAILURE,
- sharing_violation = 32, // ERROR_SHARING_VIOLATION,
- lock_violation = 33, // ERROR_LOCK_VIOLATION,
- wrong_disk = 34, // ERROR_WRONG_DISK,
- sharing_buffer_exceeded = 36, // ERROR_SHARING_BUFFER_EXCEEDED,
- handle_eof = 38, // ERROR_HANDLE_EOF,
- handle_disk_full = 39, // ERROR_HANDLE_DISK_FULL,
- rem_not_list = 51, // ERROR_REM_NOT_LIST,
- dup_name = 52, // ERROR_DUP_NAME,
- bad_net_path = 53, // ERROR_BAD_NETPATH,
- network_busy = 54, // ERROR_NETWORK_BUSY,
- file_exists = 80, // ERROR_FILE_EXISTS,
- cannot_make = 82, // ERROR_CANNOT_MAKE,
- broken_pipe = 109, // ERROR_BROKEN_PIPE,
- open_failed = 110, // ERROR_OPEN_FAILED,
- buffer_overflow = 111, // ERROR_BUFFER_OVERFLOW,
- disk_full = 112, // ERROR_DISK_FULL,
- insufficient_buffer = 122, // ERROR_INSUFFICIENT_BUFFER,
- lock_failed = 167, // ERROR_LOCK_FAILED,
- busy = 170, // ERROR_BUSY,
- cancel_violation = 173, // ERROR_CANCEL_VIOLATION,
- already_exists = 183 // ERROR_ALREADY_EXISTS
-};
- _ v_;
-
- windows_error(_ v) : v_(v) {}
- explicit windows_error(int v) : v_(_(v)) {}
- operator int() const {return v_;}
-};
-
-
-template <> struct is_error_code_enum<windows_error> : std::true_type { };
-
-template <> struct is_error_code_enum<windows_error::_> : std::true_type { };
-
-inline error_code make_error_code(windows_error e) {
- return error_code(static_cast<int>(e), system_category());
-}
-
-} // end namespace llvm
-
-#endif
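
The deletion above removes the C++03 shim outright; the mechanical migration it implies looks like the following sketch (doWork is a hypothetical stand-in for any API that previously reported llvm::error_code):

    #include <system_error>

    std::error_code doWork();  // hypothetical caller-side declaration

    bool failed() {
      std::error_code EC = doWork();
      return static_cast<bool>(EC);  // same "true means error" convention
    }
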
diff --git a/utils/TableGen/SetTheory.h b/include/llvm/TableGen/SetTheory.h
index 5baed79..5baed79 100644
--- a/utils/TableGen/SetTheory.h
+++ b/include/llvm/TableGen/SetTheory.h
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 7d1f19c..f77cc7a 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -587,6 +587,11 @@ class Operand<ValueType ty> : DAGOperand {
string OperandType = "OPERAND_UNKNOWN";
dag MIOperandInfo = (ops);
+  // MCOperandPredicate - Optionally, a code fragment operating on
+  // const MCOperand &MCOp and returning a bool, to indicate whether
+  // the value of MCOp is valid for this specific subclass of Operand.
+ code MCOperandPredicate;
+
// ParserMatchClass - The "match class" that operands of this type fit
// in. Match classes are used to define the order in which instructions are
  // matched, to ensure that which instruction gets matched is deterministic.
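
The fragment stored in MCOperandPredicate is C++ operating on MCOp; a hedged illustration of the kind of body a target might supply (the u8imm operand class and its 8-bit immediate check are invented):

    // Illustrative predicate body for a hypothetical u8imm operand class;
    // in a .td file this would sit inside: let MCOperandPredicate = [{ ... }];
    #include "llvm/MC/MCInst.h"
    #include "llvm/Support/MathExtras.h"

    bool isU8Imm(const llvm::MCOperand &MCOp) {
      return MCOp.isImm() && llvm::isUInt<8>(MCOp.getImm());
    }
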
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h
index 7c42e23..bfddd06 100644
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/Target/TargetFrameLowering.h
@@ -93,6 +93,19 @@ public:
/// stack pointer.
virtual bool isFPCloseToIncomingSP() const { return true; }
+  /// assignCalleeSavedSpillSlots - Allows the target to override spill slot
+  /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should
+  /// assign frame slots to all CSI entries and return true. If this method
+  /// returns false, spill slots will be assigned using the generic
+  /// implementation. assignCalleeSavedSpillSlots() may add, delete or
+  /// rearrange elements of CSI.
+ virtual bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ return false;
+ }
+
/// getCalleeSavedSpillSlots - This method returns a pointer to an array of
/// pairs, that contains an entry for each callee saved register that must be
/// spilled to a particular stack location if it is spilled.
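
A hedged sketch of a target override of the new hook (MyFrameLowering and its slot policy are invented); returning true tells the prologue/epilogue inserter that the frame indexes in CSI are final, while returning false keeps the generic assignment:

    #include "llvm/CodeGen/MachineFrameInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/Target/TargetFrameLowering.h"
    #include <vector>

    using namespace llvm;

    class MyFrameLowering : public TargetFrameLowering {  // hypothetical
    public:
      MyFrameLowering() : TargetFrameLowering(StackGrowsDown, 8, 0) {}

      bool assignCalleeSavedSpillSlots(
          MachineFunction &MF, const TargetRegisterInfo *TRI,
          std::vector<CalleeSavedInfo> &CSI) const override {
        MachineFrameInfo *MFI = MF.getFrameInfo();
        for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
          // Invented policy: one 8-byte spill slot per callee-saved reg.
          int FI = MFI->CreateStackObject(8, 8, /*isSS=*/true);
          CSI[I].setFrameIdx(FI);
        }
        return true;  // slots are final; skip the generic path
      }
    };
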
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 165b35f..87e7c14 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -29,6 +29,7 @@ class MachineRegisterInfo;
class MDNode;
class MCInst;
class MCSchedModel;
+class MCSymbolRefExpr;
class SDNode;
class ScheduleHazardRecognizer;
class SelectionDAG;
@@ -36,6 +37,7 @@ class ScheduleDAG;
class TargetRegisterClass;
class TargetRegisterInfo;
class BranchProbability;
+class TargetSubtargetInfo;
template<class T> class SmallVectorImpl;
@@ -321,6 +323,20 @@ public:
virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock *NewDest) const;
+ /// getUnconditionalBranch - Get an instruction that performs an unconditional
+ /// branch to the given symbol.
+ virtual void
+ getUnconditionalBranch(MCInst &MI,
+ const MCSymbolRefExpr *BranchTarget) const {
+ llvm_unreachable("Target didn't implement "
+ "TargetInstrInfo::getUnconditionalBranch!");
+ }
+
+  /// getTrap - Get a machine trap instruction.
+ virtual void getTrap(MCInst &MI) const {
+ llvm_unreachable("Target didn't implement TargetInstrInfo::getTrap!");
+ }
+
/// isLegalToSplitMBBAt - Return true if it's legal to split the given basic
/// block at the specified instruction (i.e. instruction would be the start
/// of a new basic block).
@@ -728,7 +744,7 @@ public:
/// use for this target when scheduling the machine instructions before
/// register allocation.
virtual ScheduleHazardRecognizer*
- CreateTargetHazardRecognizer(const TargetMachine *TM,
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const;
/// CreateTargetMIHazardRecognizer - Allocate and return a hazard recognizer
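
A hedged sketch of what a target-side implementation of the two new hooks might look like; the opcode names are invented (a real target would use its generated instruction enum):

    #include "llvm/MC/MCExpr.h"
    #include "llvm/MC/MCInst.h"

    using namespace llvm;

    enum { MYTGT_JMP = 1, MYTGT_TRAP = 2 };  // stand-ins for real opcodes

    void getUnconditionalBranchImpl(MCInst &MI,
                                    const MCSymbolRefExpr *BranchTarget) {
      MI.setOpcode(MYTGT_JMP);
      MI.addOperand(MCOperand::CreateExpr(BranchTarget));
    }

    void getTrapImpl(MCInst &MI) { MI.setOpcode(MYTGT_TRAP); }
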
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 60a4079..5e9978d 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -185,10 +185,15 @@ public:
/// Return true if the target has BitExtract instructions.
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
- /// Return true if a vector of the given type should be split
- /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
- /// legalization.
- virtual bool shouldSplitVectorType(EVT /*VT*/) const { return false; }
+ /// Return the preferred vector type legalization action.
+ virtual TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const {
+    // The default action for one-element vectors is to scalarize.
+ if (VT.getVectorNumElements() == 1)
+ return TypeScalarizeVector;
+    // The default action for other vectors is to promote.
+ return TypePromoteInteger;
+ }
// There are two general methods for expanding a BUILD_VECTOR node:
// 1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
@@ -279,8 +284,17 @@ public:
/// selects between the two kinds. For example on X86 a scalar boolean should
/// be zero extended from i1, while the elements of a vector of booleans
/// should be sign extended from i1.
- BooleanContent getBooleanContents(bool isVec) const {
- return isVec ? BooleanVectorContents : BooleanContents;
+ ///
+ /// Some CPUs also treat floating-point types the same way they treat
+ /// vectors, rather than the way they treat scalars.
+ BooleanContent getBooleanContents(bool isVec, bool isFloat) const {
+ if (isVec)
+ return BooleanVectorContents;
+ return isFloat ? BooleanFloatContents : BooleanContents;
+ }
+
+ BooleanContent getBooleanContents(EVT Type) const {
+ return getBooleanContents(Type.isVector(), Type.isFloatingPoint());
}
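Callers can now hand over the whole type instead of pre-computing a flag; a minimal before/after sketch:

// Previously: BooleanContent BC = TLI.getBooleanContents(VT.isVector());
// Now the type itself selects integer, floating point, or vector contents:
BooleanContent BC = TLI.getBooleanContents(VT);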
/// Return target scheduling preference.
@@ -711,6 +725,13 @@ public:
/// reduce runtime.
virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
+ /// When splitting a value of the specified type into parts, does the Lo
+ /// or Hi part come first? This usually follows the endianness, except
+ /// for ppcf128, where the Hi part always comes first.
+ bool hasBigEndianPartOrdering(EVT VT) const {
+ return isBigEndian() || VT == MVT::ppcf128;
+ }
+
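A sketch of the intended use in the legalizer, assuming Lo and Hi already hold the two halves of a value of type VT:

SDValue Parts[2] = {Lo, Hi};
if (TLI.hasBigEndianPartOrdering(VT))
  std::swap(Parts[0], Parts[1]); // emit the Hi half first (BE, or ppcf128)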
/// If true, the target has custom DAG combine transformations that it can
/// perform for the specified node.
bool hasTargetDAGCombine(ISD::NodeType NT) const {
@@ -938,9 +959,19 @@ public:
virtual void resetOperationActions() {}
protected:
- /// Specify how the target extends the result of a boolean value from i1 to a
- /// wider type. See getBooleanContents.
- void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; }
+ /// Specify how the target extends the result of integer and floating point
+ /// boolean values from i1 to a wider type. See getBooleanContents.
+ void setBooleanContents(BooleanContent Ty) {
+ BooleanContents = Ty;
+ BooleanFloatContents = Ty;
+ }
+
+ /// Specify separately how the target extends the result of integer and
+ /// floating point boolean values from i1 to a wider type. See
+ /// getBooleanContents.
+ void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) {
+ BooleanContents = IntTy;
+ BooleanFloatContents = FloatTy;
+ }
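In a target's TargetLowering constructor the two kinds can now diverge; the particular values below are an illustrative choice, not taken from any backend:

setBooleanContents(ZeroOrOneBooleanContent,          // integer setcc results
                   ZeroOrNegativeOneBooleanContent); // floating-point results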
/// Specify how the target extends the result of a vector boolean value from a
/// vector of i1 to a wider type. See getBooleanContents.
@@ -1484,6 +1515,10 @@ private:
/// a type wider than i1. See getBooleanContents.
BooleanContent BooleanContents;
+ /// Information about the contents of the high-bits in floating point
+ /// boolean values held in a type wider than i1. See getBooleanContents.
+ BooleanContent BooleanFloatContents;
+
/// Information about the contents of the high-bits in boolean vector values
/// when the element type is wider than i1. See getBooleanContents.
BooleanContent BooleanVectorContents;
@@ -2111,7 +2146,7 @@ public:
unsigned NumFixedArgs;
CallingConv::ID CallConv;
SDValue Callee;
- ArgListTy *Args;
+ ArgListTy Args;
SelectionDAG &DAG;
SDLoc DL;
ImmutableCallSite *CS;
@@ -2123,7 +2158,7 @@ public:
: RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false),
IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
- Args(nullptr), DAG(DAG), CS(nullptr) {}
+ DAG(DAG), CS(nullptr) {}
CallLoweringInfo &setDebugLoc(SDLoc dl) {
DL = dl;
@@ -2136,19 +2171,19 @@ public:
}
CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
- SDValue Target, ArgListTy *ArgsList,
+ SDValue Target, ArgListTy &&ArgsList,
unsigned FixedArgs = -1) {
RetTy = ResultType;
Callee = Target;
CallConv = CC;
NumFixedArgs =
- (FixedArgs == static_cast<unsigned>(-1) ? Args->size() : FixedArgs);
- Args = ArgsList;
+ (FixedArgs == static_cast<unsigned>(-1) ? ArgsList.size() : FixedArgs);
+ Args = std::move(ArgsList);
return *this;
}
CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
- SDValue Target, ArgListTy *ArgsList,
+ SDValue Target, ArgListTy &&ArgsList,
ImmutableCallSite &Call) {
RetTy = ResultType;
@@ -2163,7 +2198,7 @@ public:
CallConv = Call.getCallingConv();
NumFixedArgs = FTy->getNumParams();
- Args = ArgsList;
+ Args = std::move(ArgsList);
CS = &Call;
@@ -2206,8 +2241,7 @@ public:
}
ArgListTy &getArgs() {
- assert(Args && "Arguments must be set before accessing them");
- return *Args;
+ return Args;
}
};
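Call sites now build the argument list locally and move ownership into CallLoweringInfo; a sketch of the updated idiom (setChain and LowerCallTo are assumed from the surrounding API of this release):

TargetLowering::ArgListTy Args;
// ... push one ArgListEntry per actual argument into Args ...
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
    .setChain(Chain)
    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);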
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 374a163..419eced 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -131,14 +131,12 @@ public:
MCStreamer &Streamer) const;
virtual const MCSection *getStaticCtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const {
+ const MCSymbol *KeySym) const {
return StaticCtorSection;
}
virtual const MCSection *getStaticDtorSection(unsigned Priority,
- const MCSymbol *KeySym,
- const MCSection *KeySec) const {
+ const MCSymbol *KeySym) const {
return StaticDtorSection;
}
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 636eaf5..922fae5 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -39,6 +39,17 @@ namespace llvm {
};
}
+ namespace JumpTable {
+ enum JumpTableType {
+ Single, // Use a single table for all indirect jumptable calls.
+ Arity, // Use one table per number of function parameters.
+ Simplified, // Use one table per function type, with types projected
+ // into 4 types: pointer to non-function, struct,
+ // primitive, and function pointer.
+ Full // Use one table per unique function type.
+ };
+ }
+
class TargetOptions {
public:
TargetOptions()
@@ -54,7 +65,7 @@ namespace llvm {
CompressDebugSections(false), FunctionSections(false),
DataSections(false), TrapUnreachable(false), TrapFuncName(""),
FloatABIType(FloatABI::Default),
- AllowFPOpFusion(FPOpFusion::Standard) {}
+ AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single) {}
/// PrintMachineCode - This flag is enabled when the -print-machineinstrs
/// option is specified on the command line, and should enable debugging
@@ -205,6 +216,10 @@ namespace llvm {
/// the value of this option.
FPOpFusion::FPOpFusionMode AllowFPOpFusion;
+ /// JTType - This flag specifies the type of jump-instruction table to
+ /// create for functions that have the jumptable attribute.
+ JumpTable::JumpTableType JTType;
+
/// Machine level options.
MCTargetOptions MCOptions;
};
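Clients select the table flavor through TargetOptions before constructing the TargetMachine; a minimal sketch:

TargetOptions Options;
Options.JTType = JumpTable::Simplified; // one table per simplified func type
// Options is then passed to createTargetMachine() as usual.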
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index a162297..c6f3fbf 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -813,12 +813,6 @@ public:
/// getFrameRegister - This method should return the register used as a base
/// for values allocated in the current stack frame.
virtual unsigned getFrameRegister(const MachineFunction &MF) const = 0;
-
- /// getCompactUnwindRegNum - This function maps the register to the number for
- /// compact unwind encoding. Return -1 if the register isn't valid.
- virtual int getCompactUnwindRegNum(unsigned, bool) const {
- return -1;
- }
};
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index 98a5149..78a2db1 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -37,7 +37,7 @@ protected:
const DataLayout *getDataLayout() const { return DL; }
public:
- explicit TargetSelectionDAGInfo(const TargetMachine &TM);
+ explicit TargetSelectionDAGInfo(const DataLayout *DL);
virtual ~TargetSelectionDAGInfo();
/// EmitTargetCodeForMemcpy - Emit target-specific code that performs a
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index c0c342b..bbb83ef 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -66,6 +66,16 @@ public:
/// scheduler. It does not yet disable the postRA scheduler.
virtual bool enableMachineScheduler() const;
+ /// \brief True if the subtarget should run PostMachineScheduler.
+ ///
+ /// This only takes effect if the target has configured the
+ /// PostMachineScheduler pass to run, or if the global cl::opt flag,
+ /// MISchedPostRA, is set.
+ virtual bool enablePostMachineScheduler() const;
+
+ /// \brief True if the subtarget should run the atomic expansion pass.
+ virtual bool enableAtomicExpandLoadLinked() const;
+
/// \brief Override generic scheduling policy within a region.
///
/// This is a convenient way for targets that don't provide any custom
@@ -90,6 +100,12 @@ public:
AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
+ /// \brief True if the subtarget should run the local reassignment
+ /// heuristic of the register allocator.
+ /// This heuristic can be compile-time intensive; \p OptLevel provides
+ /// a finer grain with which to tune the register allocator.
+ virtual bool enableRALocalReassignment(CodeGenOpt::Level OptLevel) const;
+
/// \brief Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
virtual bool useAA() const;
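A subtarget opts in by overriding the new hooks; XYZSubtarget and the chosen policies are invented for the sketch:

bool XYZSubtarget::enablePostMachineScheduler() const { return true; }
bool XYZSubtarget::enableAtomicExpandLoadLinked() const { return true; }
bool XYZSubtarget::enableRALocalReassignment(
    CodeGenOpt::Level OptLevel) const {
  // The heuristic can be compile-time intensive, so gate it on -O2 and up.
  return OptLevel >= CodeGenOpt::Default;
}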
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 023de08..50877d0 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -117,6 +117,7 @@ public:
bool SLPVectorize;
bool LoopVectorize;
bool RerollLoops;
+ bool LoadCombine;
private:
/// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 61d5c26..c6a339b 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -64,18 +64,14 @@ ModulePass *createGCOVProfilerPass(const GCOVOptions &Options =
GCOVOptions::getDefault());
// Insert AddressSanitizer (address sanity checking) instrumentation
-FunctionPass *createAddressSanitizerFunctionPass(
- bool CheckInitOrder = true, bool CheckUseAfterReturn = false,
- bool CheckLifetime = false, StringRef BlacklistFile = StringRef());
-ModulePass *createAddressSanitizerModulePass(
- bool CheckInitOrder = true, StringRef BlacklistFile = StringRef());
+FunctionPass *createAddressSanitizerFunctionPass();
+ModulePass *createAddressSanitizerModulePass();
// Insert MemorySanitizer instrumentation (detection of uninitialized reads)
-FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0,
- StringRef BlacklistFile = StringRef());
+FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0);
// Insert ThreadSanitizer (race detection) instrumentation
-FunctionPass *createThreadSanitizerPass(StringRef BlacklistFile = StringRef());
+FunctionPass *createThreadSanitizerPass();
// Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation
ModulePass *createDataFlowSanitizerPass(StringRef ABIListFile = StringRef(),
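Pass construction therefore no longer carries per-pass configuration; assuming the dropped knobs now come from the passes' own command-line flags (not shown in this patch), client code shrinks to:

PassManager PM;
PM.add(createAddressSanitizerFunctionPass());
PM.add(createAddressSanitizerModulePass());
PM.add(createMemorySanitizerPass(/*TrackOrigins=*/0));
PM.add(createThreadSanitizerPass());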
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index cf1d655..8ecfd80 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -19,6 +19,7 @@
namespace llvm {
+class BasicBlockPass;
class FunctionPass;
class Pass;
class GetElementPtrInst;
@@ -381,6 +382,12 @@ FunctionPass *createAddDiscriminatorsPass();
//
FunctionPass *createSeparateConstOffsetFromGEPPass();
+//===----------------------------------------------------------------------===//
+//
+// LoadCombine - Combine loads into bigger loads.
+//
+BasicBlockPass *createLoadCombinePass();
+
} // End llvm namespace
#endif
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 6f64269..c0c6906 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -148,7 +148,7 @@ bool FlattenCFG(BasicBlock *BB, AliasAnalysis *AA = nullptr);
/// and if a predecessor branches to us and one of our successors, fold the
/// setcc into the predecessor and use logical operations to pick the right
/// destination.
-bool FoldBranchToCommonDest(BranchInst *BI);
+bool FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL = nullptr);
/// DemoteRegToStack - This function takes a virtual register computed by an
/// Instruction and replaces it with a slot in the stack frame, allocated via
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index ee26d83..7e3a74a 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -17,6 +17,7 @@
namespace llvm {
class AliasAnalysis;
class BasicBlock;
+class DataLayout;
class DominatorTree;
class Loop;
class LoopInfo;
@@ -32,7 +33,8 @@ BasicBlock *InsertPreheaderForLoop(Loop *L, Pass *P);
/// will optionally update \c AliasAnalysis and \c ScalarEvolution analyses if
/// passed into it.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
- AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr);
+ AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr,
+ const DataLayout *DL = nullptr);
/// \brief Put loop into LCSSA form.
///
diff --git a/include/llvm/Transforms/Utils/SpecialCaseList.h b/include/llvm/Transforms/Utils/SpecialCaseList.h
deleted file mode 100644
index 508a6df..0000000
--- a/include/llvm/Transforms/Utils/SpecialCaseList.h
+++ /dev/null
@@ -1,114 +0,0 @@
-//===-- SpecialCaseList.h - special case list for sanitizers ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions or global
-// variables based on a user-supplied list.
-//
-// The list can also specify categories for specific globals, which can be used
-// to instruct an instrumentation pass to treat certain functions or global
-// variables in a specific way, such as by omitting certain aspects of
-// instrumentation while keeping others, or informing the instrumentation pass
-// that a specific uninstrumentable function has certain semantics, thus
-// allowing the pass to instrument callers according to those semantics.
-//
-// For example, AddressSanitizer uses the "init" category for globals whose
-// initializers should not be instrumented, but which in all other respects
-// should be instrumented.
-//
-// Each line contains a prefix, followed by a colon and a wild card expression,
-// followed optionally by an equals sign and an instrumentation-specific
-// category. Empty lines and lines starting with "#" are ignored.
-// ---
-// # Blacklisted items:
-// fun:*_ZN4base6subtle*
-// global:*global_with_bad_access_or_initialization*
-// global:*global_with_initialization_issues*=init
-// type:*Namespace::ClassName*=init
-// src:file_with_tricky_code.cc
-// src:ignore-global-initializers-issues.cc=init
-//
-// # Functions with pure functional semantics:
-// fun:cos=functional
-// fun:sin=functional
-// ---
-// Note that the wild card is in fact an llvm::Regex, but * is automatically
-// replaced with .*
-// This is similar to the "ignore" feature of ThreadSanitizer.
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H
-#define LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H
-
-#include "llvm/ADT/StringMap.h"
-
-namespace llvm {
-class Function;
-class GlobalAlias;
-class GlobalVariable;
-class MemoryBuffer;
-class Module;
-class Regex;
-class StringRef;
-
-class SpecialCaseList {
- public:
- /// Parses the special case list from a file. If Path is empty, returns
- /// an empty special case list. On failure, returns 0 and writes an error
- /// message to string.
- static SpecialCaseList *create(const StringRef Path, std::string &Error);
- /// Parses the special case list from a memory buffer. On failure, returns
- /// 0 and writes an error message to string.
- static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error);
- /// Parses the special case list from a file. On failure, reports a fatal
- /// error.
- static SpecialCaseList *createOrDie(const StringRef Path);
-
- ~SpecialCaseList();
-
- /// Returns whether either this function or its source file are listed in the
- /// given category, which may be omitted to search the empty category.
- bool isIn(const Function &F, const StringRef Category = StringRef()) const;
-
- /// Returns whether this global, its type or its source file are listed in the
- /// given category, which may be omitted to search the empty category.
- bool isIn(const GlobalVariable &G,
- const StringRef Category = StringRef()) const;
-
- /// Returns whether this global alias is listed in the given category, which
- /// may be omitted to search the empty category.
- ///
- /// If GA aliases a function, the alias's name is matched as a function name
- /// would be. Similarly, aliases of globals are matched like globals.
- bool isIn(const GlobalAlias &GA,
- const StringRef Category = StringRef()) const;
-
- /// Returns whether this module is listed in the given category, which may be
- /// omitted to search the empty category.
- bool isIn(const Module &M, const StringRef Category = StringRef()) const;
-
- private:
- SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
- SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
-
- struct Entry;
- StringMap<StringMap<Entry> > Entries;
-
- SpecialCaseList();
- /// Parses just-constructed SpecialCaseList entries from a memory buffer.
- bool parse(const MemoryBuffer *MB, std::string &Error);
-
- bool inSectionCategory(const StringRef Section, const StringRef Query,
- const StringRef Category) const;
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_UTILS_SPECIALCASELIST_H
diff --git a/include/llvm/Transforms/Utils/VectorUtils.h b/include/llvm/Transforms/Utils/VectorUtils.h
index e1d6c56..44a7149 100644
--- a/include/llvm/Transforms/Utils/VectorUtils.h
+++ b/include/llvm/Transforms/Utils/VectorUtils.h
@@ -48,12 +48,27 @@ static inline bool isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ case Intrinsic::powi:
return true;
default:
return false;
}
}
+static bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
+ unsigned ScalarOpdIdx) {
+ switch (ID) {
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz:
+ case Intrinsic::powi:
+ return (ScalarOpdIdx == 1);
+ default:
+ return false;
+ }
+}
+
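A sketch of how a vectorizer consumes the new query when widening a call; CI, ID, Args, and getVectorValue stand in for the caller's context:

for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
  Value *Arg = CI->getArgOperand(i);
  if (hasVectorInstrinsicScalarOpd(ID, i))
    Args.push_back(Arg);                 // the i1 flag of ctlz/cttz and the
                                         // i32 exponent of powi stay scalar
  else
    Args.push_back(getVectorValue(Arg)); // widen everything else
}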
static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
Intrinsic::ID ValidIntrinsicID) {
if (I.getNumArgOperands() != 1 ||
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 57237e5..5cde979 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -60,6 +60,13 @@ bool AliasAnalysis::pointsToConstantMemory(const Location &Loc,
return AA->pointsToConstantMemory(Loc, OrLocal);
}
+AliasAnalysis::Location
+AliasAnalysis::getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+ AliasAnalysis::ModRefResult &Mask) {
+ assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+ return AA->getArgLocation(CS, ArgIdx, Mask);
+}
+
void AliasAnalysis::deleteValue(Value *V) {
assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
AA->deleteValue(V);
@@ -91,22 +98,26 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS,
if (onlyAccessesArgPointees(MRB)) {
bool doesAlias = false;
+ ModRefResult AllArgsMask = NoModRef;
if (doesAccessArgPointees(MRB)) {
- MDNode *CSTag = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
AI != AE; ++AI) {
const Value *Arg = *AI;
if (!Arg->getType()->isPointerTy())
continue;
- Location CSLoc(Arg, UnknownSize, CSTag);
+ ModRefResult ArgMask;
+ Location CSLoc =
+ getArgLocation(CS, (unsigned) std::distance(CS.arg_begin(), AI),
+ ArgMask);
if (!isNoAlias(CSLoc, Loc)) {
doesAlias = true;
- break;
+ AllArgsMask = ModRefResult(AllArgsMask | ArgMask);
}
}
}
if (!doesAlias)
return NoModRef;
+ Mask = ModRefResult(Mask & AllArgsMask);
}
// If Loc is a constant memory location, the call definitely could not
@@ -150,14 +161,23 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
if (onlyAccessesArgPointees(CS2B)) {
AliasAnalysis::ModRefResult R = NoModRef;
if (doesAccessArgPointees(CS2B)) {
- MDNode *CS2Tag = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
for (ImmutableCallSite::arg_iterator
I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
const Value *Arg = *I;
if (!Arg->getType()->isPointerTy())
continue;
- Location CS2Loc(Arg, UnknownSize, CS2Tag);
- R = ModRefResult((R | getModRefInfo(CS1, CS2Loc)) & Mask);
+ ModRefResult ArgMask;
+ Location CS2Loc =
+ getArgLocation(CS2, (unsigned) std::distance(CS2.arg_begin(), I),
+ ArgMask);
+ // ArgMask indicates what CS2 might do to CS2Loc, and the dependence of
+ // CS1 on that location is the inverse.
+ if (ArgMask == Mod)
+ ArgMask = ModRef;
+ else if (ArgMask == Ref)
+ ArgMask = Mod;
+
+ R = ModRefResult((R | (getModRefInfo(CS1, CS2Loc) & ArgMask)) & Mask);
if (R == Mask)
break;
}
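A worked instance of the inversion above, with an illustrative call:

// CS2 = call void @llvm.memcpy(i8* %d, i8* %s, i64 %n, i32 1, i1 false)
//
// For ArgIdx == 1 (%s), getArgLocation reports ArgMask == Ref: memcpy only
// reads %s. CS1 can therefore interfere with CS2 through %s only by writing
// it, so the mask is flipped to Mod before being intersected with
// getModRefInfo(CS1, CS2Loc).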
@@ -170,14 +190,16 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
if (onlyAccessesArgPointees(CS1B)) {
AliasAnalysis::ModRefResult R = NoModRef;
if (doesAccessArgPointees(CS1B)) {
- MDNode *CS1Tag = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
for (ImmutableCallSite::arg_iterator
I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
const Value *Arg = *I;
if (!Arg->getType()->isPointerTy())
continue;
- Location CS1Loc(Arg, UnknownSize, CS1Tag);
- if (getModRefInfo(CS2, CS1Loc) != NoModRef) {
+ ModRefResult ArgMask;
+ Location CS1Loc =
+ getArgLocation(CS1, (unsigned) std::distance(CS1.arg_begin(), I),
+ ArgMask);
+ if ((getModRefInfo(CS2, CS1Loc) & ArgMask) != NoModRef) {
R = Mask;
break;
}
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 01c1c7e..ade940a 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -48,6 +48,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeIVUsersPass(Registry);
initializeInstCountPass(Registry);
initializeIntervalPartitionPass(Registry);
+ initializeJumpInstrTableInfoPass(Registry);
initializeLazyValueInfoPass(Registry);
initializeLibCallAliasAnalysisPass(Registry);
initializeLintPass(Registry);
diff --git a/lib/Analysis/Android.mk b/lib/Analysis/Android.mk
index bca673e..4e435a1 100644
--- a/lib/Analysis/Android.mk
+++ b/lib/Analysis/Android.mk
@@ -27,6 +27,7 @@ analysis_SRC_FILES := \
InstructionSimplify.cpp \
Interval.cpp \
IntervalPartition.cpp \
+ JumpInstrTableInfo.cpp \
LazyCallGraph.cpp \
LazyValueInfo.cpp \
LibCallAliasAnalysis.cpp \
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index fe90b84..c50dd4a 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -490,6 +490,10 @@ namespace {
/// global) or not.
bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override;
+ /// Get the location associated with a pointer argument of a callsite.
+ Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+ ModRefResult &Mask) override;
+
/// getModRefBehavior - Return the behavior when calling the given
/// call site.
ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override;
@@ -653,6 +657,21 @@ BasicAliasAnalysis::pointsToConstantMemory(const Location &Loc, bool OrLocal) {
return Worklist.empty();
}
+static bool isMemsetPattern16(const Function *MS,
+ const TargetLibraryInfo &TLI) {
+ if (TLI.has(LibFunc::memset_pattern16) &&
+ MS->getName() == "memset_pattern16") {
+ FunctionType *MemsetType = MS->getFunctionType();
+ if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
+ isa<PointerType>(MemsetType->getParamType(0)) &&
+ isa<PointerType>(MemsetType->getParamType(1)) &&
+ isa<IntegerType>(MemsetType->getParamType(2)))
+ return true;
+ }
+
+ return false;
+}
+
/// getModRefBehavior - Return the behavior when calling the given call site.
AliasAnalysis::ModRefBehavior
BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
@@ -692,10 +711,93 @@ BasicAliasAnalysis::getModRefBehavior(const Function *F) {
if (F->onlyReadsMemory())
Min = OnlyReadsMemory;
+ const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+ if (isMemsetPattern16(F, TLI))
+ Min = OnlyAccessesArgumentPointees;
+
// Otherwise be conservative.
return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
}
+AliasAnalysis::Location
+BasicAliasAnalysis::getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+ ModRefResult &Mask) {
+ Location Loc = AliasAnalysis::getArgLocation(CS, ArgIdx, Mask);
+ const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
+ if (II != nullptr)
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove: {
+ assert((ArgIdx == 0 || ArgIdx == 1) &&
+ "Invalid argument index for memory intrinsic");
+ if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
+ Loc.Size = LenCI->getZExtValue();
+ assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+ "Memory intrinsic location pointer not argument?");
+ Mask = ArgIdx ? Ref : Mod;
+ break;
+ }
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start: {
+ assert(ArgIdx == 1 && "Invalid argument index");
+ assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+ "Intrinsic location pointer not argument?");
+ Loc.Size = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ break;
+ }
+ case Intrinsic::invariant_end: {
+ assert(ArgIdx == 2 && "Invalid argument index");
+ assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+ "Intrinsic location pointer not argument?");
+ Loc.Size = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
+ break;
+ }
+ case Intrinsic::arm_neon_vld1: {
+ assert(ArgIdx == 0 && "Invalid argument index");
+ assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+ "Intrinsic location pointer not argument?");
+ // LLVM's vld1 and vst1 intrinsics currently only support a single
+ // vector register.
+ if (DL)
+ Loc.Size = DL->getTypeStoreSize(II->getType());
+ break;
+ }
+ case Intrinsic::arm_neon_vst1: {
+ assert(ArgIdx == 0 && "Invalid argument index");
+ assert(Loc.Ptr == II->getArgOperand(ArgIdx) &&
+ "Intrinsic location pointer not argument?");
+ if (DL)
+ Loc.Size = DL->getTypeStoreSize(II->getArgOperand(1)->getType());
+ break;
+ }
+ }
+
+ // We can bound the aliasing properties of memset_pattern16 just as we can
+ // for memcpy/memset. This is particularly important because the
+ // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
+ // whenever possible.
+ else if (CS.getCalledFunction() &&
+ isMemsetPattern16(CS.getCalledFunction(), TLI)) {
+ assert((ArgIdx == 0 || ArgIdx == 1) &&
+ "Invalid argument index for memset_pattern16");
+ if (ArgIdx == 1)
+ Loc.Size = 16;
+ else if (const ConstantInt *LenCI =
+ dyn_cast<ConstantInt>(CS.getArgument(2)))
+ Loc.Size = LenCI->getZExtValue();
+ assert(Loc.Ptr == CS.getArgument(ArgIdx) &&
+ "memset_pattern16 location pointer not argument?");
+ Mask = ArgIdx ? Ref : Mod;
+ }
+ // FIXME: Handle memset_pattern4 and memset_pattern8 also.
+
+ return Loc;
+}
+
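The effect of centralizing this knowledge can be seen with a small query; the IR is illustrative:

// %p = alloca [64 x i8]
// call void @llvm.memset.p0i8.i64(i8* %q, i8 0, i64 16, i32 1, i1 false)
//
// getModRefInfo(the memset, Location(%p)) now flows through getArgLocation:
// the location for argument 0 comes back with Size = 16 and Mask = Mod, so a
// no-alias result against %p yields NoModRef with no memset-specific code
// left in getModRefInfo itself (see the deletion further below).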
/// getModRefInfo - Check to see if the specified callsite can clobber the
/// specified memory object. Since we only look at local properties of this
/// function, we really can't say much about this query. We do, however, use
@@ -748,124 +850,8 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
return NoModRef;
}
- const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
- ModRefResult Min = ModRef;
-
- // Finally, handle specific knowledge of intrinsics.
- const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
- if (II != nullptr)
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::memcpy:
- case Intrinsic::memmove: {
- uint64_t Len = UnknownSize;
- if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
- Len = LenCI->getZExtValue();
- Value *Dest = II->getArgOperand(0);
- Value *Src = II->getArgOperand(1);
- // If it can't overlap the source dest, then it doesn't modref the loc.
- if (isNoAlias(Location(Dest, Len), Loc)) {
- if (isNoAlias(Location(Src, Len), Loc))
- return NoModRef;
- // If it can't overlap the dest, then worst case it reads the loc.
- Min = Ref;
- } else if (isNoAlias(Location(Src, Len), Loc)) {
- // If it can't overlap the source, then worst case it mutates the loc.
- Min = Mod;
- }
- break;
- }
- case Intrinsic::memset:
- // Since memset is 'accesses arguments' only, the AliasAnalysis base class
- // will handle it for the variable length case.
- if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
- uint64_t Len = LenCI->getZExtValue();
- Value *Dest = II->getArgOperand(0);
- if (isNoAlias(Location(Dest, Len), Loc))
- return NoModRef;
- }
- // We know that memset doesn't load anything.
- Min = Mod;
- break;
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::invariant_start: {
- uint64_t PtrSize =
- cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- if (isNoAlias(Location(II->getArgOperand(1),
- PtrSize,
- II->getMetadata(LLVMContext::MD_tbaa)),
- Loc))
- return NoModRef;
- break;
- }
- case Intrinsic::invariant_end: {
- uint64_t PtrSize =
- cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
- if (isNoAlias(Location(II->getArgOperand(2),
- PtrSize,
- II->getMetadata(LLVMContext::MD_tbaa)),
- Loc))
- return NoModRef;
- break;
- }
- case Intrinsic::arm_neon_vld1: {
- // LLVM's vld1 and vst1 intrinsics currently only support a single
- // vector register.
- uint64_t Size =
- DL ? DL->getTypeStoreSize(II->getType()) : UnknownSize;
- if (isNoAlias(Location(II->getArgOperand(0), Size,
- II->getMetadata(LLVMContext::MD_tbaa)),
- Loc))
- return NoModRef;
- break;
- }
- case Intrinsic::arm_neon_vst1: {
- uint64_t Size =
- DL ? DL->getTypeStoreSize(II->getArgOperand(1)->getType()) : UnknownSize;
- if (isNoAlias(Location(II->getArgOperand(0), Size,
- II->getMetadata(LLVMContext::MD_tbaa)),
- Loc))
- return NoModRef;
- break;
- }
- }
-
- // We can bound the aliasing properties of memset_pattern16 just as we can
- // for memcpy/memset. This is particularly important because the
- // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
- // whenever possible.
- else if (TLI.has(LibFunc::memset_pattern16) &&
- CS.getCalledFunction() &&
- CS.getCalledFunction()->getName() == "memset_pattern16") {
- const Function *MS = CS.getCalledFunction();
- FunctionType *MemsetType = MS->getFunctionType();
- if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
- isa<PointerType>(MemsetType->getParamType(0)) &&
- isa<PointerType>(MemsetType->getParamType(1)) &&
- isa<IntegerType>(MemsetType->getParamType(2))) {
- uint64_t Len = UnknownSize;
- if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(CS.getArgument(2)))
- Len = LenCI->getZExtValue();
- const Value *Dest = CS.getArgument(0);
- const Value *Src = CS.getArgument(1);
- // If it can't overlap the source dest, then it doesn't modref the loc.
- if (isNoAlias(Location(Dest, Len), Loc)) {
- // Always reads 16 bytes of the source.
- if (isNoAlias(Location(Src, 16), Loc))
- return NoModRef;
- // If it can't overlap the dest, then worst case it reads the loc.
- Min = Ref;
- // Always reads 16 bytes of the source.
- } else if (isNoAlias(Location(Src, 16), Loc)) {
- // If it can't overlap the source, then worst case it mutates the loc.
- Min = Mod;
- }
- }
- }
-
// The AliasAnalysis base class has some smarts, lets use them.
- return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
+ return AliasAnalysis::getModRefInfo(CS, Loc);
}
/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 87d93a4..4fd2c11 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/Support/raw_ostream.h"
#include <deque>
@@ -24,298 +23,13 @@ using namespace llvm::bfi_detail;
//===----------------------------------------------------------------------===//
//
-// UnsignedFloat implementation.
-//
-//===----------------------------------------------------------------------===//
-#ifndef _MSC_VER
-const int32_t UnsignedFloatBase::MaxExponent;
-const int32_t UnsignedFloatBase::MinExponent;
-#endif
-
-static void appendDigit(std::string &Str, unsigned D) {
- assert(D < 10);
- Str += '0' + D % 10;
-}
-
-static void appendNumber(std::string &Str, uint64_t N) {
- while (N) {
- appendDigit(Str, N % 10);
- N /= 10;
- }
-}
-
-static bool doesRoundUp(char Digit) {
- switch (Digit) {
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- return true;
- default:
- return false;
- }
-}
-
-static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
- assert(E >= UnsignedFloatBase::MinExponent);
- assert(E <= UnsignedFloatBase::MaxExponent);
-
- // Find a new E, but don't let it increase past MaxExponent.
- int LeadingZeros = UnsignedFloatBase::countLeadingZeros64(D);
- int NewE = std::min(UnsignedFloatBase::MaxExponent, E + 63 - LeadingZeros);
- int Shift = 63 - (NewE - E);
- assert(Shift <= LeadingZeros);
- assert(Shift == LeadingZeros || NewE == UnsignedFloatBase::MaxExponent);
- D <<= Shift;
- E = NewE;
-
- // Check for a denormal.
- unsigned AdjustedE = E + 16383;
- if (!(D >> 63)) {
- assert(E == UnsignedFloatBase::MaxExponent);
- AdjustedE = 0;
- }
-
- // Build the float and print it.
- uint64_t RawBits[2] = {D, AdjustedE};
- APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
- SmallVector<char, 24> Chars;
- Float.toString(Chars, Precision, 0);
- return std::string(Chars.begin(), Chars.end());
-}
-
-static std::string stripTrailingZeros(const std::string &Float) {
- size_t NonZero = Float.find_last_not_of('0');
- assert(NonZero != std::string::npos && "no . in floating point string");
-
- if (Float[NonZero] == '.')
- ++NonZero;
-
- return Float.substr(0, NonZero + 1);
-}
-
-std::string UnsignedFloatBase::toString(uint64_t D, int16_t E, int Width,
- unsigned Precision) {
- if (!D)
- return "0.0";
-
- // Canonicalize exponent and digits.
- uint64_t Above0 = 0;
- uint64_t Below0 = 0;
- uint64_t Extra = 0;
- int ExtraShift = 0;
- if (E == 0) {
- Above0 = D;
- } else if (E > 0) {
- if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
- D <<= Shift;
- E -= Shift;
-
- if (!E)
- Above0 = D;
- }
- } else if (E > -64) {
- Above0 = D >> -E;
- Below0 = D << (64 + E);
- } else if (E > -120) {
- Below0 = D >> (-E - 64);
- Extra = D << (128 + E);
- ExtraShift = -64 - E;
- }
-
- // Fall back on APFloat for very small and very large numbers.
- if (!Above0 && !Below0)
- return toStringAPFloat(D, E, Precision);
-
- // Append the digits before the decimal.
- std::string Str;
- size_t DigitsOut = 0;
- if (Above0) {
- appendNumber(Str, Above0);
- DigitsOut = Str.size();
- } else
- appendDigit(Str, 0);
- std::reverse(Str.begin(), Str.end());
-
- // Return early if there's nothing after the decimal.
- if (!Below0)
- return Str + ".0";
-
- // Append the decimal and beyond.
- Str += '.';
- uint64_t Error = UINT64_C(1) << (64 - Width);
-
- // We need to shift Below0 to the right to make space for calculating
- // digits. Save the precision we're losing in Extra.
- Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
- Below0 >>= 4;
- size_t SinceDot = 0;
- size_t AfterDot = Str.size();
- do {
- if (ExtraShift) {
- --ExtraShift;
- Error *= 5;
- } else
- Error *= 10;
-
- Below0 *= 10;
- Extra *= 10;
- Below0 += (Extra >> 60);
- Extra = Extra & (UINT64_MAX >> 4);
- appendDigit(Str, Below0 >> 60);
- Below0 = Below0 & (UINT64_MAX >> 4);
- if (DigitsOut || Str.back() != '0')
- ++DigitsOut;
- ++SinceDot;
- } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
- (!Precision || DigitsOut <= Precision || SinceDot < 2));
-
- // Return early for maximum precision.
- if (!Precision || DigitsOut <= Precision)
- return stripTrailingZeros(Str);
-
- // Find where to truncate.
- size_t Truncate =
- std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
-
- // Check if there's anything to truncate.
- if (Truncate >= Str.size())
- return stripTrailingZeros(Str);
-
- bool Carry = doesRoundUp(Str[Truncate]);
- if (!Carry)
- return stripTrailingZeros(Str.substr(0, Truncate));
-
- // Round with the first truncated digit.
- for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
- I != E; ++I) {
- if (*I == '.')
- continue;
- if (*I == '9') {
- *I = '0';
- continue;
- }
-
- ++*I;
- Carry = false;
- break;
- }
-
- // Add "1" in front if we still need to carry.
- return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
-}
-
-raw_ostream &UnsignedFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
- int Width, unsigned Precision) {
- return OS << toString(D, E, Width, Precision);
-}
-
-void UnsignedFloatBase::dump(uint64_t D, int16_t E, int Width) {
- print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
- << "]";
-}
-
-static std::pair<uint64_t, int16_t>
-getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
- if (ShouldRound)
- if (!++N)
- // Rounding caused an overflow.
- return std::make_pair(UINT64_C(1), Shift + 64);
- return std::make_pair(N, Shift);
-}
-
-std::pair<uint64_t, int16_t> UnsignedFloatBase::divide64(uint64_t Dividend,
- uint64_t Divisor) {
- // Input should be sanitized.
- assert(Divisor);
- assert(Dividend);
-
- // Minimize size of divisor.
- int16_t Shift = 0;
- if (int Zeros = countTrailingZeros(Divisor)) {
- Shift -= Zeros;
- Divisor >>= Zeros;
- }
-
- // Check for powers of two.
- if (Divisor == 1)
- return std::make_pair(Dividend, Shift);
-
- // Maximize size of dividend.
- if (int Zeros = countLeadingZeros64(Dividend)) {
- Shift -= Zeros;
- Dividend <<= Zeros;
- }
-
- // Start with the result of a divide.
- uint64_t Quotient = Dividend / Divisor;
- Dividend %= Divisor;
-
- // Continue building the quotient with long division.
- //
- // TODO: continue with largers digits.
- while (!(Quotient >> 63) && Dividend) {
- // Shift Dividend, and check for overflow.
- bool IsOverflow = Dividend >> 63;
- Dividend <<= 1;
- --Shift;
-
- // Divide.
- bool DoesDivide = IsOverflow || Divisor <= Dividend;
- Quotient = (Quotient << 1) | uint64_t(DoesDivide);
- Dividend -= DoesDivide ? Divisor : 0;
- }
-
- // Round.
- if (Dividend >= getHalf(Divisor))
- if (!++Quotient)
- // Rounding caused an overflow in Quotient.
- return std::make_pair(UINT64_C(1), Shift + 64);
-
- return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
-}
-
-std::pair<uint64_t, int16_t> UnsignedFloatBase::multiply64(uint64_t L,
- uint64_t R) {
- // Separate into two 32-bit digits (U.L).
- uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
-
- // Compute cross products.
- uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
-
- // Sum into two 64-bit digits.
- uint64_t Upper = P1, Lower = P4;
- auto addWithCarry = [&](uint64_t N) {
- uint64_t NewLower = Lower + (N << 32);
- Upper += (N >> 32) + (NewLower < Lower);
- Lower = NewLower;
- };
- addWithCarry(P2);
- addWithCarry(P3);
-
- // Check whether the upper digit is empty.
- if (!Upper)
- return std::make_pair(Lower, 0);
-
- // Shift as little as possible to maximize precision.
- unsigned LeadingZeros = countLeadingZeros64(Upper);
- int16_t Shift = 64 - LeadingZeros;
- if (LeadingZeros)
- Upper = Upper << LeadingZeros | Lower >> Shift;
- bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
- return getRoundedFloat(Upper, ShouldRound, Shift);
-}
-
-//===----------------------------------------------------------------------===//
-//
// BlockMass implementation.
//
//===----------------------------------------------------------------------===//
-UnsignedFloat<uint64_t> BlockMass::toFloat() const {
+ScaledNumber<uint64_t> BlockMass::toScaled() const {
if (isFull())
- return UnsignedFloat<uint64_t>(1, 0);
- return UnsignedFloat<uint64_t>(getMass() + 1, -64);
+ return ScaledNumber<uint64_t>(1, 0);
+ return ScaledNumber<uint64_t>(getMass() + 1, -64);
}
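Spelled out, the mapping implemented above (a sanity check, not new behavior):

// BlockMass::getFull() -> ScaledNumber<uint64_t>(1, 0), i.e. exactly 1.0
// any partial mass m   -> ScaledNumber<uint64_t>(m + 1, -64),
//                         i.e. (m + 1) * 2^-64, which lies in (0.0, 1.0]
// so even an empty mass maps to 2^-64 rather than to exactly zero.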
void BlockMass::dump() const { print(dbgs()); }
@@ -342,7 +56,7 @@ namespace {
typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
typedef BlockFrequencyInfoImplBase::Distribution Distribution;
typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
-typedef BlockFrequencyInfoImplBase::Float Float;
+typedef BlockFrequencyInfoImplBase::Scaled64 Scaled64;
typedef BlockFrequencyInfoImplBase::LoopData LoopData;
typedef BlockFrequencyInfoImplBase::Weight Weight;
typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
@@ -622,7 +336,7 @@ bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
///
/// Gives the maximum number of estimated iterations allowed for a loop. Very
/// large numbers cause problems downstream (even within 64-bits).
-static Float getMaxLoopScale() { return Float(1, 12); }
+static Scaled64 getMaxLoopScale() { return Scaled64(1, 12); }
/// \brief Compute the loop scale for a loop.
void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
@@ -634,7 +348,7 @@ void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) {
BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
// Block scale stores the inverse of the scale.
- Loop.Scale = ExitMass.toFloat().inverse();
+ Loop.Scale = ExitMass.toScaled().inverse();
DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
<< " - " << Loop.BackedgeMass << ")\n"
@@ -708,15 +422,16 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
}
static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
- const Float &Min, const Float &Max) {
+ const Scaled64 &Min, const Scaled64 &Max) {
// Scale the Factor to a size that creates integers. Ideally, integers would
// be scaled so that Max == UINT64_MAX so that they can be best
// differentiated. However, the register allocator currently deals poorly
// with large numbers. Instead, push Min up a little from 1 to give some
// room to differentiate small, unequal numbers.
//
- // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
- Float ScalingFactor = Min.inverse();
+ // TODO: fix issues downstream so that ScalingFactor can be
+ // Scaled64(1,64)/Max.
+ Scaled64 ScalingFactor = Min.inverse();
if ((Max / Min).lg() < 60)
ScalingFactor <<= 3;
@@ -724,10 +439,10 @@ static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
<< ", factor = " << ScalingFactor << "\n");
for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
- Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
+ Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
- << BFI.Freqs[Index].Floating << ", scaled = " << Scaled
+ << BFI.Freqs[Index].Scaled << ", scaled = " << Scaled
<< ", int = " << BFI.Freqs[Index].Integer << "\n");
}
}
@@ -740,7 +455,7 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop)
<< ": mass = " << Loop.Mass << ", scale = " << Loop.Scale
<< "\n");
- Loop.Scale *= Loop.Mass.toFloat();
+ Loop.Scale *= Loop.Mass.toScaled();
Loop.IsPackaged = false;
DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n");
@@ -749,9 +464,9 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
// final head scale will be used for updated the rest of the members.
for (const BlockNode &N : Loop.Nodes) {
const auto &Working = BFI.Working[N.Index];
- Float &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
- : BFI.Freqs[N.Index].Floating;
- Float New = Loop.Scale * F;
+ Scaled64 &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
+ : BFI.Freqs[N.Index].Scaled;
+ Scaled64 New = Loop.Scale * F;
DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New
<< "\n");
F = New;
@@ -761,7 +476,7 @@ static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) {
void BlockFrequencyInfoImplBase::unwrapLoops() {
// Set initial frequencies from loop-local masses.
for (size_t Index = 0; Index < Working.size(); ++Index)
- Freqs[Index].Floating = Working[Index].Mass.toFloat();
+ Freqs[Index].Scaled = Working[Index].Mass.toScaled();
for (LoopData &Loop : Loops)
unwrapLoop(*this, Loop);
@@ -770,12 +485,12 @@ void BlockFrequencyInfoImplBase::unwrapLoops() {
void BlockFrequencyInfoImplBase::finalizeMetrics() {
// Unwrap loop packages in reverse post-order, tracking min and max
// frequencies.
- auto Min = Float::getLargest();
- auto Max = Float::getZero();
+ auto Min = Scaled64::getLargest();
+ auto Max = Scaled64::getZero();
for (size_t Index = 0; Index < Working.size(); ++Index) {
// Update min/max scale.
- Min = std::min(Min, Freqs[Index].Floating);
- Max = std::max(Max, Freqs[Index].Floating);
+ Min = std::min(Min, Freqs[Index].Scaled);
+ Max = std::max(Max, Freqs[Index].Scaled);
}
// Convert to integers.
@@ -794,11 +509,11 @@ BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
return 0;
return Freqs[Node.Index].Integer;
}
-Float
+Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
if (!Node.isValid())
- return Float::getZero();
- return Freqs[Node.Index].Floating;
+ return Scaled64::getZero();
+ return Freqs[Node.Index].Scaled;
}
std::string
@@ -819,8 +534,8 @@ BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
raw_ostream &
BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
const BlockFrequency &Freq) const {
- Float Block(Freq.getFrequency(), 0);
- Float Entry(getEntryFreq(), 0);
+ Scaled64 Block(Freq.getFrequency(), 0);
+ Scaled64 Entry(getEntryFreq(), 0);
return OS << Block / Entry;
}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index b546789..d1632fd 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis
InstructionSimplify.cpp
Interval.cpp
IntervalPartition.cpp
+ JumpInstrTableInfo.cpp
LazyCallGraph.cpp
LazyValueInfo.cpp
LibCallAliasAnalysis.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 0ac1cb5..eb3e2c6 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -31,11 +32,15 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FEnv.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include <cerrno>
#include <cmath>
+
+#ifdef HAVE_FENV_H
+#include <fenv.h>
+#endif
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -706,7 +711,7 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
static Constant* StripPtrCastKeepAS(Constant* Ptr) {
assert(Ptr->getType()->isPointerTy() && "Not a pointer type");
PointerType *OldPtrTy = cast<PointerType>(Ptr->getType());
- Ptr = cast<Constant>(Ptr->stripPointerCasts());
+ Ptr = Ptr->stripPointerCasts();
PointerType *NewPtrTy = cast<PointerType>(Ptr->getType());
// Preserve the address space number of the pointer.
@@ -1314,12 +1319,34 @@ static Constant *GetConstantFoldFPValue(double V, Type *Ty) {
}
+namespace {
+/// llvm_fenv_clearexcept - Clear the floating-point exception state.
+static inline void llvm_fenv_clearexcept() {
+#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT
+ feclearexcept(FE_ALL_EXCEPT);
+#endif
+ errno = 0;
+}
+
+/// llvm_fenv_testexcept - Test if a floating-point exception was raised.
+static inline bool llvm_fenv_testexcept() {
+ int errno_val = errno;
+ if (errno_val == ERANGE || errno_val == EDOM)
+ return true;
+#if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT && HAVE_DECL_FE_INEXACT
+ if (fetestexcept(FE_ALL_EXCEPT & ~FE_INEXACT))
+ return true;
+#endif
+ return false;
+}
+} // End namespace
+
static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
Type *Ty) {
- sys::llvm_fenv_clearexcept();
+ llvm_fenv_clearexcept();
V = NativeFP(V);
- if (sys::llvm_fenv_testexcept()) {
- sys::llvm_fenv_clearexcept();
+ if (llvm_fenv_testexcept()) {
+ llvm_fenv_clearexcept();
return nullptr;
}
@@ -1328,10 +1355,10 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
double V, double W, Type *Ty) {
- sys::llvm_fenv_clearexcept();
+ llvm_fenv_clearexcept();
V = NativeFP(V, W);
- if (sys::llvm_fenv_testexcept()) {
- sys::llvm_fenv_clearexcept();
+ if (llvm_fenv_testexcept()) {
+ llvm_fenv_clearexcept();
return nullptr;
}
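The two helpers bracket every host libm call used for folding; the pattern, mirroring ConstantFoldFP above (sqrt chosen arbitrarily):

llvm_fenv_clearexcept();      // reset errno and the FP exception flags
double R = sqrt(V);           // any host libm call
if (llvm_fenv_testexcept()) { // ERANGE/EDOM or a raised FP exception?
  llvm_fenv_clearexcept();
  return nullptr;             // refuse to fold; leave it to run time
}
return GetConstantFoldFPValue(R, Ty);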
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 780b1aa..1b74f8c 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -95,6 +95,31 @@ static bool isReverseVectorMask(SmallVectorImpl<int> &Mask) {
return true;
}
+static bool isAlternateVectorMask(SmallVectorImpl<int> &Mask) {
+ bool isAlternate = true;
+ unsigned MaskSize = Mask.size();
+
+ // Example: shufflevector A, B, <0,5,2,7>
+ for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ isAlternate = Mask[i] == (int)((i & 1) ? MaskSize + i : i);
+ }
+
+ if (isAlternate)
+ return true;
+
+ isAlternate = true;
+ // Example: shufflevector A, B, <4,1,6,3>
+ for (unsigned i = 0; i < MaskSize && isAlternate; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ isAlternate = Mask[i] == (int)((i & 1) ? i : MaskSize + i);
+ }
+
+ return isAlternate;
+}
+
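Two concrete masks the new predicate accepts, for <4 x i32> operands (expanded from the examples in the code above):

// shufflevector %A, %B, <0, 5, 2, 7> ; even lanes from %A, odd from %B
// shufflevector %A, %B, <4, 1, 6, 3> ; even lanes from %B, odd from %A
//
// Both now cost out as TargetTransformInfo::SK_Alternate instead of falling
// through to the unknown cost of -1.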
static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
TargetTransformInfo::OperandValueKind OpInfo =
TargetTransformInfo::OK_AnyValue;
@@ -466,9 +491,15 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
unsigned NumVecElems = VecTypOp0->getVectorNumElements();
SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
- if (NumVecElems == Mask.size() && isReverseVectorMask(Mask))
- return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0,
- nullptr);
+ if (NumVecElems == Mask.size()) {
+ if (isReverseVectorMask(Mask))
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0,
+ 0, nullptr);
+ if (isAlternateVectorMask(Mask))
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate,
+ VecTypOp0, 0, nullptr);
+ }
+
return -1;
}
case Instruction::Call:
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index bfab744..c27edbf 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -602,8 +602,12 @@ namespace {
bool runOnSCC(CallGraphSCC &SCC) override {
Out << Banner;
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
- (*I)->getFunction()->print(Out);
+ for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+ if ((*I)->getFunction())
+ (*I)->getFunction()->print(Out);
+ else
+ Out << "\nPrinting <null> Function\n";
+ }
return false;
}
};
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 66f3f8e..8807529 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -841,10 +841,7 @@ bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
// original function which is extremely undefined behavior.
// FIXME: This logic isn't really right; we can safely inline functions with
// indirectbr's as long as no other function or global references the
- // blockaddress of a block within the current function. And as a QOI issue,
- // if someone is using a blockaddress without an indirectbr, and that
- // reference somehow ends up in another function or global, we probably don't
- // want to inline this function.
+ // blockaddress of a block within the current function.
HasIndirectBr = true;
return false;
}
@@ -1121,6 +1118,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
if (BB->empty())
continue;
+ // Disallow inlining a blockaddress. A blockaddress only has defined
+ // behavior for an indirect branch in the same function, and we do not
+ // currently support inlining indirect branches. But, the inliner may not
+ // see an indirect branch that ends up being dead code at a particular call
+ // site. If the blockaddress escapes the function, e.g., via a global
+ // variable, inlining may lead to an invalid cross-function reference.
+ if (BB->hasAddressTaken())
+ return false;
+
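An example of a function the new check rejects even when no indirectbr is visible at the call site (IR sketch):

// @tbl keeps the blockaddress of %bb alive from outside @f, so inlining @f
// would leave an invalid cross-function blockaddress behind:
//
//   @tbl = global i8* blockaddress(@f, %bb)
//   define i32 @f() {
//   entry:
//     br label %bb
//   bb:
//     ret i32 0
//   }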
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail out.
if (!analyzeBlock(BB)) {
@@ -1303,8 +1309,9 @@ bool InlineCostAnalysis::isInlineViable(Function &F) {
F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::ReturnsTwice);
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- // Disallow inlining of functions which contain an indirect branch.
- if (isa<IndirectBrInst>(BI->getTerminator()))
+ // Disallow inlining of functions which contain indirect branches or
+ // blockaddresses.
+ if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
return false;
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index c819bd3..24655aa 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -287,7 +287,10 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << ")";
}
OS << " in ";
- UI->getUser()->print(OS);
+ if (UI->getUser())
+ UI->getUser()->print(OS);
+ else
+ OS << "Printing <null> User";
OS << '\n';
}
}
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 3684fda..bd42af1 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -39,7 +39,6 @@ using namespace llvm::PatternMatch;
enum { RecursionLimit = 3 };
STATISTIC(NumExpand, "Number of expansions");
-STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc, "Number of reassociations");
struct Query {
@@ -183,78 +182,6 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
return nullptr;
}
-/// FactorizeBinOp - Simplify "LHS Opcode RHS" by factorizing out a common term
-/// using the operation OpCodeToExtract. For example, when Opcode is Add and
-/// OpCodeToExtract is Mul then this tries to turn "(A*B)+(A*C)" into "A*(B+C)".
-/// Returns the simplified value, or null if no simplification was performed.
-static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- unsigned OpcToExtract, const Query &Q,
- unsigned MaxRecurse) {
- Instruction::BinaryOps OpcodeToExtract = (Instruction::BinaryOps)OpcToExtract;
- // Recursion is always used, so bail out at once if we already hit the limit.
- if (!MaxRecurse--)
- return nullptr;
-
- BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
- BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
-
- if (!Op0 || Op0->getOpcode() != OpcodeToExtract ||
- !Op1 || Op1->getOpcode() != OpcodeToExtract)
- return nullptr;
-
- // The expression has the form "(A op' B) op (C op' D)".
- Value *A = Op0->getOperand(0), *B = Op0->getOperand(1);
- Value *C = Op1->getOperand(0), *D = Op1->getOperand(1);
-
- // Use left distributivity, i.e. "X op' (Y op Z) = (X op' Y) op (X op' Z)".
- // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
- // commutative case, "(A op' B) op (C op' A)"?
- if (A == C || (Instruction::isCommutative(OpcodeToExtract) && A == D)) {
- Value *DD = A == C ? D : C;
- // Form "A op' (B op DD)" if it simplifies completely.
- // Does "B op DD" simplify?
- if (Value *V = SimplifyBinOp(Opcode, B, DD, Q, MaxRecurse)) {
- // It does! Return "A op' V" if it simplifies or is already available.
- // If V equals B then "A op' V" is just the LHS. If V equals DD then
- // "A op' V" is just the RHS.
- if (V == B || V == DD) {
- ++NumFactor;
- return V == B ? LHS : RHS;
- }
- // Otherwise return "A op' V" if it simplifies.
- if (Value *W = SimplifyBinOp(OpcodeToExtract, A, V, Q, MaxRecurse)) {
- ++NumFactor;
- return W;
- }
- }
- }
-
- // Use right distributivity, i.e. "(X op Y) op' Z = (X op' Z) op (Y op' Z)".
- // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
- // commutative case, "(A op' B) op (B op' D)"?
- if (B == D || (Instruction::isCommutative(OpcodeToExtract) && B == C)) {
- Value *CC = B == D ? C : D;
- // Form "(A op CC) op' B" if it simplifies completely..
- // Does "A op CC" simplify?
- if (Value *V = SimplifyBinOp(Opcode, A, CC, Q, MaxRecurse)) {
- // It does! Return "V op' B" if it simplifies or is already available.
- // If V equals A then "V op' B" is just the LHS. If V equals CC then
- // "V op' B" is just the RHS.
- if (V == A || V == CC) {
- ++NumFactor;
- return V == A ? LHS : RHS;
- }
- // Otherwise return "V op' B" if it simplifies.
- if (Value *W = SimplifyBinOp(OpcodeToExtract, V, B, Q, MaxRecurse)) {
- ++NumFactor;
- return W;
- }
- }
- }
-
- return nullptr;
-}
-
/// SimplifyAssociativeBinOp - Generic simplifications for associative binary
/// operations. Returns the simpler value, or null if none was found.
static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
@@ -634,11 +561,6 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
MaxRecurse))
return V;
- // Mul distributes over Add. Try some generic simplifications based on this.
- if (Value *V = FactorizeBinOp(Instruction::Add, Op0, Op1, Instruction::Mul,
- Q, MaxRecurse))
- return V;
-
// Threading Add over selects and phi nodes is pointless, so don't bother.
// Threading over the select in "A + select(cond, B, C)" means evaluating
// "A+B" and "A+C" and seeing if they are equal; but they are equal if and
@@ -754,16 +676,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (Op0 == Op1)
return Constant::getNullValue(Op0->getType());
- // (X*2) - X -> X
- // (X<<1) - X -> X
- Value *X = nullptr;
- if (match(Op0, m_Mul(m_Specific(Op1), m_ConstantInt<2>())) ||
- match(Op0, m_Shl(m_Specific(Op1), m_One())))
- return Op1;
-
// (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
// For example, (X + Y) - Y -> X; (Y + X) - Y -> X
- Value *Y = nullptr, *Z = Op1;
+ Value *X = nullptr, *Y = nullptr, *Z = Op1;
if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z
// See if "V === Y - Z" simplifies.
if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1))
@@ -835,11 +750,6 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (Constant *Result = computePointerDifference(Q.DL, X, Y))
return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
- // Mul distributes over Sub. Try some generic simplifications based on this.
- if (Value *V = FactorizeBinOp(Instruction::Sub, Op0, Op1, Instruction::Mul,
- Q, MaxRecurse))
- return V;
-
// i1 sub -> xor.
if (MaxRecurse && Op0->getType()->isIntegerTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
@@ -1518,11 +1428,6 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
Q, MaxRecurse))
return V;
- // Or distributes over And. Try some generic simplifications based on this.
- if (Value *V = FactorizeBinOp(Instruction::And, Op0, Op1, Instruction::Or,
- Q, MaxRecurse))
- return V;
-
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1613,11 +1518,6 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
MaxRecurse))
return V;
- // And distributes over Or. Try some generic simplifications based on this.
- if (Value *V = FactorizeBinOp(Instruction::Or, Op0, Op1, Instruction::And,
- Q, MaxRecurse))
- return V;
-
// If the operation is with the result of a select instruction, check whether
// operating on either branch of the select always yields the same value.
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1625,6 +1525,38 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
MaxRecurse))
return V;
+ // (A & C)|(B & D)
+ Value *C = nullptr, *D = nullptr;
+ if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+ match(Op1, m_And(m_Value(B), m_Value(D)))) {
+ ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+ ConstantInt *C2 = dyn_cast<ConstantInt>(D);
+ if (C1 && C2 && (C1->getValue() == ~C2->getValue())) {
+ // (A & C1)|(B & C2)
+ // If we have: ((V + N) & C1) | (V & C2)
+ // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ // replace with V+N.
+ Value *V1, *V2;
+ if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+
+ match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+ // Add commutes, try both ways.
+ if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+ return A;
+ if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+ return A;
+ }
+ // Or commutes, try both ways.
+ if ((C1->getValue() & (C1->getValue() + 1)) == 0 &&
+ match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+ // Add commutes, try both ways.
+ if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+ return B;
+ if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+ return B;
+ }
+ }
+ }
+
// If the operation is with the result of a phi instruction, check whether
// operating on all incoming values of the phi always yields the same value.
if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
@@ -1677,11 +1609,6 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
MaxRecurse))
return V;
- // And distributes over Xor. Try some generic simplifications based on this.
- if (Value *V = FactorizeBinOp(Instruction::Xor, Op0, Op1, Instruction::And,
- Q, MaxRecurse))
- return V;
-
// Threading Xor over selects and phi nodes is pointless, so don't bother.
// Threading over the select in "A ^ select(cond, B, C)" means evaluating
// "A^B" and "A^C" and seeing if they are equal; but they are equal if and
@@ -2021,9 +1948,15 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (!CI2->isZero())
Upper = NegOne.udiv(CI2->getValue()) + 1;
} else if (match(LHS, m_SDiv(m_ConstantInt(CI2), m_Value()))) {
- // 'sdiv CI2, x' produces [-|CI2|, |CI2|].
- Upper = CI2->getValue().abs() + 1;
- Lower = (-Upper) + 1;
+ if (CI2->isMinSignedValue()) {
+ // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
+ Lower = CI2->getValue();
+ Upper = Lower.lshr(1) + 1;
+ } else {
+ // 'sdiv CI2, x' produces [-|CI2|, |CI2|].
+ Upper = CI2->getValue().abs() + 1;
+ Lower = (-Upper) + 1;
+ }
} else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) {
// 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2].
APInt IntMin = APInt::getSignedMinValue(Width);
@@ -2241,6 +2174,25 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
}
+ // If a bit is known to be zero for A and known to be one for B,
+ // then A and B cannot be equal.
+ if (ICmpInst::isEquality(Pred)) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ uint32_t BitWidth = CI->getBitWidth();
+ APInt LHSKnownZero(BitWidth, 0);
+ APInt LHSKnownOne(BitWidth, 0);
+ computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+ APInt RHSKnownZero(BitWidth, 0);
+ APInt RHSKnownOne(BitWidth, 0);
+ computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+ if (((LHSKnownOne & RHSKnownZero) != 0) ||
+ ((LHSKnownZero & RHSKnownOne) != 0))
+ return (Pred == ICmpInst::ICMP_EQ)
+ ? ConstantInt::getFalse(CI->getContext())
+ : ConstantInt::getTrue(CI->getContext());
+ }
+ }
+
// Special logic for binary operators.
BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
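The new (A & C1) | (B & C2) fold added to SimplifyOrInst above rests on a pure bit-level identity. A standalone check of that identity in plain C++ (the masks and values below are illustrative, not taken from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t C1 = 0xFFFFFF00u, C2 = ~C1; // C2 = 0x000000FF, of the form 0+1+
      assert((C2 & (C2 + 1)) == 0);        // C2 is a contiguous mask of low ones
      uint32_t V = 0x1234u, N = 0x5600u;   // chosen so that N & C2 == 0
      assert((N & C2) == 0);
      // ((V + N) & C1) | (V & C2) reassembles V + N exactly: since N has no
      // bits inside C2, the addition cannot disturb the low C2 bits of V,
      // and C1 supplies the remaining bits.
      assert((((V + N) & C1) | (V & C2)) == V + N);
      return 0;
    }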
diff --git a/lib/Analysis/JumpInstrTableInfo.cpp b/lib/Analysis/JumpInstrTableInfo.cpp
new file mode 100644
index 0000000..b5b4265
--- /dev/null
+++ b/lib/Analysis/JumpInstrTableInfo.cpp
@@ -0,0 +1,40 @@
+//===-- JumpInstrTableInfo.cpp: Info for Jump-Instruction Tables ----------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Information about jump-instruction tables that have been created by
+/// JumpInstrTables pass.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jiti"
+
+#include "llvm/Analysis/JumpInstrTableInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS(JumpInstrTableInfo, "jump-instr-table-info",
+ "Jump-Instruction Table Info", true, true)
+char JumpInstrTableInfo::ID = 0;
+
+ImmutablePass *llvm::createJumpInstrTableInfoPass() {
+ return new JumpInstrTableInfo();
+}
+
+JumpInstrTableInfo::JumpInstrTableInfo() : ImmutablePass(ID), Tables() {
+ initializeJumpInstrTableInfoPass(*PassRegistry::getPassRegistry());
+}
+
+JumpInstrTableInfo::~JumpInstrTableInfo() {}
+
+void JumpInstrTableInfo::insertEntry(FunctionType *TableFunTy, Function *Target,
+ Function *Jump) {
+ Tables[TableFunTy].push_back(JumpPair(Target, Jump));
+}
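A hypothetical client sketch for the new analysis; only insertEntry and the pass itself come from the patch, while the helper name and the way JITI is obtained are assumptions:

    #include "llvm/Analysis/JumpInstrTableInfo.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Record that Jump is the jump-instruction stub standing in for Target
    // in the table keyed by TableFunTy. A transform pass would typically
    // obtain JITI via getAnalysis<JumpInstrTableInfo>().
    static void recordJumpStub(JumpInstrTableInfo &JITI,
                               FunctionType *TableFunTy, Function *Target,
                               Function *Jump) {
      JITI.insertEntry(TableFunTy, Target, Jump);
    }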
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 8df18e7..7bd866e 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -45,7 +45,10 @@ public:
for (Loop::block_iterator b = L->block_begin(), be = L->block_end();
b != be;
++b) {
- (*b)->print(Out);
+ if (*b)
+ (*b)->print(Out);
+ else
+ Out << "Printing <null> block";
}
return false;
}
diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 4e11e50..139fa38 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Pass.h"
using namespace llvm;
@@ -53,6 +54,13 @@ namespace {
bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override {
return false;
}
+ Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
+ ModRefResult &Mask) override {
+ Mask = ModRef;
+ return Location(CS.getArgument(ArgIdx), UnknownSize,
+ CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa));
+ }
+
ModRefResult getModRefInfo(ImmutableCallSite CS,
const Location &Loc) override {
return ModRef;
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 3c7798f..71de144 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -195,8 +195,12 @@ public:
bool runOnRegion(Region *R, RGPassManager &RGM) override {
Out << Banner;
- for (const auto &BB : R->blocks())
- BB->print(Out);
+ for (const auto &BB : R->blocks()) {
+ if (BB)
+ BB->print(Out);
+ else
+ Out << "Printing <null> Block";
+ }
return false;
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 42a7aa2..06dbde5 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -7216,6 +7216,15 @@ public:
cast<SCEVConstant>(Zero)->getValue();
Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+ if (Remainder->isZero()) {
+ // The Quotient is obtained by replacing Denominator by 1 in Numerator.
+ RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
+ cast<SCEVConstant>(One)->getValue();
+ Quotient =
+ SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+ return;
+ }
+
// Quotient is (Numerator - Remainder) divided by Denominator.
const SCEV *Q, *R;
const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
@@ -7356,7 +7365,7 @@ const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
Ty = Store->getValueOperand()->getType();
else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
- Ty = Load->getPointerOperand()->getType();
+ Ty = Load->getType();
else
return nullptr;
@@ -7370,7 +7379,7 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
SmallVectorImpl<const SCEV *> &Sizes,
const SCEV *ElementSize) const {
- if (Terms.size() < 1)
+ if (Terms.size() < 1 || !ElementSize)
return;
// Early return when Terms do not contain parameters: we do not delinearize
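To make the new early exit in the SCEV division helper concrete: with Numerator = 4 * %n and Denominator = %n, rewriting %n to 0 yields Remainder = 0, so the quotient is produced directly by rewriting %n to 1, giving Quotient = 4. With Numerator = 4 * %n + 3 the remainder is 3 and control falls through to the existing path, which divides Numerator - Remainder by the denominator.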
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index b507043..8c75b0d 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -1706,7 +1707,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
// Fold constant phis. They may be congruent to other constant phis and
// would confuse the logic below that expects proper IVs.
- if (Value *V = Phi->hasConstantValue()) {
+ if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT)) {
Phi->replaceAllUsesWith(V);
DeadInsts.push_back(Phi);
++NumElim;
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index e9db295..3ccefb0 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -241,7 +241,7 @@ TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
}
/// Top level driver for transforming an expression DAG into its requested
-/// post-inc form (either "Normalized" or "Denormalized".
+/// post-inc form (either "Normalized" or "Denormalized").
const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
const SCEV *S,
Instruction *User,
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 4f48753..5264745 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -188,7 +188,8 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW,
KnownOne.setBit(BitWidth - 1);
}
-void llvm::computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero) {
+void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
+ APInt &KnownZero) {
unsigned BitWidth = KnownZero.getBitWidth();
unsigned NumRanges = Ranges.getNumOperands() / 2;
assert(NumRanges >= 1);
@@ -338,7 +339,7 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
default: break;
case Instruction::Load:
if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range))
- computeKnownBitsLoad(*MD, KnownZero);
+ computeKnownBitsFromRangeMetadata(*MD, KnownZero);
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
@@ -733,6 +734,12 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
break;
}
case Instruction::Call:
+ case Instruction::Invoke:
+ if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range))
+ computeKnownBitsFromRangeMetadata(*MD, KnownZero);
+ // If range metadata is attached to this IntrinsicInst, intersect the
+ // explicit range specified by the metadata and the implicit range of
+ // the intrinsic.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
@@ -742,16 +749,16 @@ void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
LowBits -= 1;
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
break;
}
case Intrinsic::ctpop: {
unsigned LowBits = Log2_32(BitWidth)+1;
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
break;
}
case Intrinsic::x86_sse42_crc32_64_64:
- KnownZero = APInt::getHighBitsSet(64, 32);
+ KnownZero |= APInt::getHighBitsSet(64, 32);
break;
}
}
@@ -1977,7 +1984,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
return true;
case Instruction::UDiv:
case Instruction::URem:
- // x / y is undefined if y == 0, but calcuations like x / 3 are safe.
+ // x / y is undefined if y == 0, but calculations like x / 3 are safe.
return isKnownNonZero(Inst->getOperand(1), TD);
case Instruction::SDiv:
case Instruction::SRem: {
@@ -2000,12 +2007,12 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
// Speculative load may create a race that did not exist in the source.
LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
return false;
- return LI->getPointerOperand()->isDereferenceablePointer();
+ return LI->getPointerOperand()->isDereferenceablePointer(TD);
}
case Instruction::Call: {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
- // These synthetic intrinsics have no side-effects, and just mark
+ // These synthetic intrinsics have no side-effects and just mark
// information about their operands.
// FIXME: There are other no-op synthetic instructions that potentially
// should be considered at least *safe* to speculate...
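The '=' to '|=' changes above matter because the new Call/Invoke case may already have seeded KnownZero from !range metadata before the intrinsic-specific bounds apply; assigning would discard that information instead of intersecting the two ranges. A plain-integer sketch based on the crc32 case (the metadata bound is illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Suppose !range metadata proves a crc32 call returns a value < 256:
      uint64_t KnownZero = 0xFFFFFFFFFFFFFF00ull; // seeded from the metadata
      uint64_t FromCrc32 = 0xFFFFFFFF00000000ull; // crc32 never sets the high 32
      // Old code: KnownZero = FromCrc32; would drop the tighter metadata bound.
      KnownZero |= FromCrc32; // new code: keep the intersection of both ranges
      assert(KnownZero == 0xFFFFFFFFFFFFFF00ull);
      return 0;
    }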
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 44a3412..1e5bcdd 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -209,6 +209,7 @@ lltok::Kind LLLexer::LexToken() {
return LexToken();
case '+': return LexPositive();
case '@': return LexAt();
+ case '$': return LexDollar();
case '%': return LexPercent();
case '"': return LexQuote();
case '.':
@@ -222,13 +223,6 @@ lltok::Kind LLLexer::LexToken() {
return lltok::dotdotdot;
}
return lltok::Error;
- case '$':
- if (const char *Ptr = isLabelTail(CurPtr)) {
- CurPtr = Ptr;
- StrVal.assign(TokStart, CurPtr-1);
- return lltok::LabelStr;
- }
- return lltok::Error;
case ';':
SkipLineComment();
return LexToken();
@@ -307,6 +301,43 @@ lltok::Kind LLLexer::LexAt() {
return lltok::Error;
}
+lltok::Kind LLLexer::LexDollar() {
+ if (const char *Ptr = isLabelTail(TokStart)) {
+ CurPtr = Ptr;
+ StrVal.assign(TokStart, CurPtr - 1);
+ return lltok::LabelStr;
+ }
+
+ // Handle DollarStringConstant: $\"[^\"]*\"
+ if (CurPtr[0] == '"') {
+ ++CurPtr;
+
+ while (1) {
+ int CurChar = getNextChar();
+
+ if (CurChar == EOF) {
+ Error("end of file in COMDAT variable name");
+ return lltok::Error;
+ }
+ if (CurChar == '"') {
+ StrVal.assign(TokStart + 2, CurPtr - 1);
+ UnEscapeLexed(StrVal);
+ if (StringRef(StrVal).find_first_of(0) != StringRef::npos) {
+ Error("Null bytes are not allowed in names");
+ return lltok::Error;
+ }
+ return lltok::ComdatVar;
+ }
+ }
+ }
+
+ // Handle ComdatVarName: $[-a-zA-Z$._][-a-zA-Z$._0-9]*
+ if (ReadVarName())
+ return lltok::ComdatVar;
+
+ return lltok::Error;
+}
+
/// ReadString - Read a string until the closing quote.
lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
const char *Start = CurPtr;
@@ -490,7 +521,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(available_externally);
KEYWORD(linkonce);
KEYWORD(linkonce_odr);
- KEYWORD(weak);
+ KEYWORD(weak); // Used as a linkage, and a modifier for "cmpxchg".
KEYWORD(weak_odr);
KEYWORD(appending);
KEYWORD(dllimport);
@@ -583,6 +614,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(cold);
KEYWORD(inlinehint);
KEYWORD(inreg);
+ KEYWORD(jumptable);
KEYWORD(minsize);
KEYWORD(naked);
KEYWORD(nest);
@@ -617,6 +649,15 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(type);
KEYWORD(opaque);
+ KEYWORD(comdat);
+
+ // Comdat types
+ KEYWORD(any);
+ KEYWORD(exactmatch);
+ KEYWORD(largest);
+ KEYWORD(noduplicates);
+ KEYWORD(samesize);
+
KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle);
KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge);
KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole);
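Taken together with the parser changes below, the new '$' lexing and comdat keywords accept textual IR like the following. A hedged driver sketch using the ParseAssemblyString entry point that appears later in this patch (the header path is assumed):

    #include "llvm/AsmParser/Parser.h" // assumed location of ParseAssemblyString
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/Support/SourceMgr.h"
    using namespace llvm;

    static bool parsesComdatSyntax() {
      // '$shared' is lexed by LexDollar; 'comdat' and 'any' are the new
      // keywords; the grammar is the one added to LLParser.cpp below.
      const char *IR = "$shared = comdat any\n"
                       "@g = global i32 42, comdat $shared\n";
      LLVMContext Ctx;
      SMDiagnostic Err;
      return ParseAssemblyString(IR, nullptr, Err, Ctx) != nullptr;
    }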
diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
index ad11d49..d42de57 100644
--- a/lib/AsmParser/LLLexer.h
+++ b/lib/AsmParser/LLLexer.h
@@ -81,6 +81,7 @@ namespace llvm {
lltok::Kind LexDigitOrNegative();
lltok::Kind LexPositive();
lltok::Kind LexAt();
+ lltok::Kind LexDollar();
lltok::Kind LexExclaim();
lltok::Kind LexPercent();
lltok::Kind LexQuote();
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 3282e8a..be55ac6 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -163,6 +163,11 @@ bool LLParser::ValidateEndOfModule() {
return Error(I->second.second,
"use of undefined type named '" + I->getKey() + "'");
+ if (!ForwardRefComdats.empty())
+ return Error(ForwardRefComdats.begin()->second,
+ "use of undefined comdat '$" +
+ ForwardRefComdats.begin()->first + "'");
+
if (!ForwardRefVals.empty())
return Error(ForwardRefVals.begin()->second.second,
"use of undefined value '@" + ForwardRefVals.begin()->first +
@@ -238,6 +243,7 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::LocalVar: if (ParseNamedType()) return true; break;
case lltok::GlobalID: if (ParseUnnamedGlobal()) return true; break;
case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break;
+ case lltok::ComdatVar: if (parseComdat()) return true; break;
case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break;
case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break;
@@ -257,33 +263,31 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::kw_appending: // OptionalLinkage
case lltok::kw_common: // OptionalLinkage
case lltok::kw_extern_weak: // OptionalLinkage
- case lltok::kw_external: { // OptionalLinkage
+ case lltok::kw_external: // OptionalLinkage
+ case lltok::kw_default: // OptionalVisibility
+ case lltok::kw_hidden: // OptionalVisibility
+ case lltok::kw_protected: // OptionalVisibility
+ case lltok::kw_dllimport: // OptionalDLLStorageClass
+ case lltok::kw_dllexport: // OptionalDLLStorageClass
+ case lltok::kw_thread_local: // OptionalThreadLocal
+ case lltok::kw_addrspace: // OptionalAddrSpace
+ case lltok::kw_constant: // GlobalType
+ case lltok::kw_global: { // GlobalType
unsigned Linkage, Visibility, DLLStorageClass;
- if (ParseOptionalLinkage(Linkage) ||
+ bool UnnamedAddr;
+ GlobalVariable::ThreadLocalMode TLM;
+ bool HasLinkage;
+ if (ParseOptionalLinkage(Linkage, HasLinkage) ||
ParseOptionalVisibility(Visibility) ||
ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseGlobal("", SMLoc(), Linkage, true, Visibility, DLLStorageClass))
- return true;
- break;
- }
- case lltok::kw_default: // OptionalVisibility
- case lltok::kw_hidden: // OptionalVisibility
- case lltok::kw_protected: { // OptionalVisibility
- unsigned Visibility, DLLStorageClass;
- if (ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass) ||
- ParseGlobal("", SMLoc(), 0, false, Visibility, DLLStorageClass))
+ ParseOptionalThreadLocal(TLM) ||
+ parseOptionalUnnamedAddr(UnnamedAddr) ||
+ ParseGlobal("", SMLoc(), Linkage, HasLinkage, Visibility,
+ DLLStorageClass, TLM, UnnamedAddr))
return true;
break;
}
- case lltok::kw_thread_local: // OptionalThreadLocal
- case lltok::kw_addrspace: // OptionalAddrSpace
- case lltok::kw_constant: // GlobalType
- case lltok::kw_global: // GlobalType
- if (ParseGlobal("", SMLoc(), 0, false, 0, 0)) return true;
- break;
-
case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
}
}
@@ -470,15 +474,20 @@ bool LLParser::ParseUnnamedGlobal() {
bool HasLinkage;
unsigned Linkage, Visibility, DLLStorageClass;
+ GlobalVariable::ThreadLocalMode TLM;
+ bool UnnamedAddr;
if (ParseOptionalLinkage(Linkage, HasLinkage) ||
ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass))
+ ParseOptionalDLLStorageClass(DLLStorageClass) ||
+ ParseOptionalThreadLocal(TLM) ||
+ parseOptionalUnnamedAddr(UnnamedAddr))
return true;
if (HasLinkage || Lex.getKind() != lltok::kw_alias)
return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
- DLLStorageClass);
- return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass);
+ DLLStorageClass, TLM, UnnamedAddr);
+ return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM,
+ UnnamedAddr);
}
/// ParseNamedGlobal:
@@ -493,16 +502,71 @@ bool LLParser::ParseNamedGlobal() {
bool HasLinkage;
unsigned Linkage, Visibility, DLLStorageClass;
+ GlobalVariable::ThreadLocalMode TLM;
+ bool UnnamedAddr;
if (ParseToken(lltok::equal, "expected '=' in global variable") ||
ParseOptionalLinkage(Linkage, HasLinkage) ||
ParseOptionalVisibility(Visibility) ||
- ParseOptionalDLLStorageClass(DLLStorageClass))
+ ParseOptionalDLLStorageClass(DLLStorageClass) ||
+ ParseOptionalThreadLocal(TLM) ||
+ parseOptionalUnnamedAddr(UnnamedAddr))
return true;
if (HasLinkage || Lex.getKind() != lltok::kw_alias)
return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
- DLLStorageClass);
- return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass);
+ DLLStorageClass, TLM, UnnamedAddr);
+ return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM,
+ UnnamedAddr);
+}
+
+bool LLParser::parseComdat() {
+ assert(Lex.getKind() == lltok::ComdatVar);
+ std::string Name = Lex.getStrVal();
+ LocTy NameLoc = Lex.getLoc();
+ Lex.Lex();
+
+ if (ParseToken(lltok::equal, "expected '=' here"))
+ return true;
+
+ if (ParseToken(lltok::kw_comdat, "expected comdat keyword"))
+ return TokError("expected comdat type");
+
+ Comdat::SelectionKind SK;
+ switch (Lex.getKind()) {
+ default:
+ return TokError("unknown selection kind");
+ case lltok::kw_any:
+ SK = Comdat::Any;
+ break;
+ case lltok::kw_exactmatch:
+ SK = Comdat::ExactMatch;
+ break;
+ case lltok::kw_largest:
+ SK = Comdat::Largest;
+ break;
+ case lltok::kw_noduplicates:
+ SK = Comdat::NoDuplicates;
+ break;
+ case lltok::kw_samesize:
+ SK = Comdat::SameSize;
+ break;
+ }
+ Lex.Lex();
+
+ // See if the comdat was forward referenced; if so, use the comdat.
+ Module::ComdatSymTabType &ComdatSymTab = M->getComdatSymbolTable();
+ Module::ComdatSymTabType::iterator I = ComdatSymTab.find(Name);
+ if (I != ComdatSymTab.end() && !ForwardRefComdats.erase(Name))
+ return Error(NameLoc, "redefinition of comdat '$" + Name + "'");
+
+ Comdat *C;
+ if (I != ComdatSymTab.end())
+ C = &I->second;
+ else
+ C = M->getOrInsertComdat(Name);
+ C->setSelectionKind(SK);
+
+ return false;
}
// MDString:
@@ -510,6 +574,7 @@ bool LLParser::ParseNamedGlobal() {
bool LLParser::ParseMDString(MDString *&Result) {
std::string Str;
if (ParseStringConstant(Str)) return true;
+ llvm::UpgradeMDStringConstant(Str);
Result = MDString::get(Context, Str);
return false;
}
@@ -628,18 +693,19 @@ static bool isValidVisibilityForLinkage(unsigned V, unsigned L) {
}
/// ParseAlias:
-/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias'
+/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass
+/// OptionalThreadLocal OptionalUnnamedAddr 'alias'
/// OptionalLinkage Aliasee
-/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias'
-/// OptionalLinkage OptionalAddrSpace Type, Aliasee
///
/// Aliasee
/// ::= TypeAndValue
///
-/// Everything through DLL storage class has already been parsed.
+/// Everything through OptionalUnnamedAddr has already been parsed.
///
bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
- unsigned Visibility, unsigned DLLStorageClass) {
+ unsigned Visibility, unsigned DLLStorageClass,
+ GlobalVariable::ThreadLocalMode TLM,
+ bool UnnamedAddr) {
assert(Lex.getKind() == lltok::kw_alias);
Lex.Lex();
LocTy LinkageLoc = Lex.getLoc();
@@ -656,51 +722,39 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
return Error(LinkageLoc,
"symbol with local linkage must have default visibility");
- bool HasAddrSpace = Lex.getKind() == lltok::kw_addrspace;
- unsigned AddrSpace;
- LocTy AddrSpaceLoc = Lex.getLoc();
- if (ParseOptionalAddrSpace(AddrSpace))
- return true;
-
- LocTy TyLoc = Lex.getLoc();
- Type *Ty = nullptr;
- if (ParseType(Ty))
- return true;
-
- bool DifferentType = EatIfPresent(lltok::comma);
- if (HasAddrSpace && !DifferentType)
- return Error(AddrSpaceLoc, "A type is required if addrspace is given");
-
- Type *AliaseeType = nullptr;
- if (DifferentType) {
- if (ParseType(AliaseeType))
+ Constant *Aliasee;
+ LocTy AliaseeLoc = Lex.getLoc();
+ if (Lex.getKind() != lltok::kw_bitcast &&
+ Lex.getKind() != lltok::kw_getelementptr &&
+ Lex.getKind() != lltok::kw_addrspacecast &&
+ Lex.getKind() != lltok::kw_inttoptr) {
+ if (ParseGlobalTypeAndValue(Aliasee))
return true;
} else {
- AliaseeType = Ty;
- auto *PTy = dyn_cast<PointerType>(Ty);
- if (!PTy)
- return Error(TyLoc, "An alias must have pointer type");
- Ty = PTy->getElementType();
- AddrSpace = PTy->getAddressSpace();
+ // The bitcast dest type is not present, it is implied by the dest type.
+ ValID ID;
+ if (ParseValID(ID))
+ return true;
+ if (ID.Kind != ValID::t_Constant)
+ return Error(AliaseeLoc, "invalid aliasee");
+ Aliasee = ID.ConstantVal;
}
- LocTy AliaseeLoc = Lex.getLoc();
- Constant *C;
- if (ParseGlobalValue(AliaseeType, C))
- return true;
-
- auto *Aliasee = dyn_cast<GlobalObject>(C);
- if (!Aliasee)
- return Error(AliaseeLoc, "Alias must point to function or variable");
-
- assert(Aliasee->getType()->isPointerTy());
+ Type *AliaseeType = Aliasee->getType();
+ auto *PTy = dyn_cast<PointerType>(AliaseeType);
+ if (!PTy)
+ return Error(AliaseeLoc, "An alias must have pointer type");
+ Type *Ty = PTy->getElementType();
+ unsigned AddrSpace = PTy->getAddressSpace();
// Okay, create the alias but do not insert it into the module yet.
std::unique_ptr<GlobalAlias> GA(
GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage,
Name, Aliasee, /*Parent*/ nullptr));
+ GA->setThreadLocalMode(TLM);
GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass);
+ GA->setUnnamedAddr(UnnamedAddr);
// See if this value already exists in the symbol table. If so, it is either
// a redefinition or a definition of a forward reference.
@@ -720,11 +774,6 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
// If they agree, just RAUW the old value with the alias and remove the
// forward ref info.
- for (auto *User : Val->users()) {
- if (auto *GA = dyn_cast<GlobalAlias>(User))
- return Error(NameLoc, "Alias is pointed by alias " + GA->getName());
- }
-
Val->replaceAllUsesWith(GA.get());
Val->eraseFromParent();
ForwardRefVals.erase(I);
@@ -742,34 +791,31 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
/// ParseGlobal
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
-/// OptionalThreadLocal OptionalAddrSpace OptionalUnNammedAddr
+/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
/// OptionalExternallyInitialized GlobalType Type Const
/// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
-/// OptionalThreadLocal OptionalAddrSpace OptionalUnNammedAddr
+/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
/// OptionalExternallyInitialized GlobalType Type Const
///
-/// Everything up to and including OptionalDLLStorageClass has been parsed
+/// Everything up to and including OptionalUnnamedAddr has been parsed
/// already.
///
bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
unsigned Linkage, bool HasLinkage,
- unsigned Visibility, unsigned DLLStorageClass) {
+ unsigned Visibility, unsigned DLLStorageClass,
+ GlobalVariable::ThreadLocalMode TLM,
+ bool UnnamedAddr) {
if (!isValidVisibilityForLinkage(Visibility, Linkage))
return Error(NameLoc,
"symbol with local linkage must have default visibility");
unsigned AddrSpace;
- bool IsConstant, UnnamedAddr, IsExternallyInitialized;
- GlobalVariable::ThreadLocalMode TLM;
- LocTy UnnamedAddrLoc;
+ bool IsConstant, IsExternallyInitialized;
LocTy IsExternallyInitializedLoc;
LocTy TyLoc;
Type *Ty = nullptr;
- if (ParseOptionalThreadLocal(TLM) ||
- ParseOptionalAddrSpace(AddrSpace) ||
- ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
- &UnnamedAddrLoc) ||
+ if (ParseOptionalAddrSpace(AddrSpace) ||
ParseOptionalToken(lltok::kw_externally_initialized,
IsExternallyInitialized,
&IsExternallyInitializedLoc) ||
@@ -848,7 +894,13 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
if (ParseOptionalAlignment(Alignment)) return true;
GV->setAlignment(Alignment);
} else {
- TokError("unknown global variable property!");
+ Comdat *C;
+ if (parseOptionalComdat(C))
+ return true;
+ if (C)
+ GV->setComdat(C);
+ else
+ return TokError("unknown global variable property!");
}
}
@@ -967,6 +1019,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break;
case lltok::kw_cold: B.addAttribute(Attribute::Cold); break;
case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break;
+ case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break;
case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break;
case lltok::kw_naked: B.addAttribute(Attribute::Naked); break;
case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break;
@@ -1106,6 +1159,24 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) {
//===----------------------------------------------------------------------===//
+// Comdat Reference/Resolution Routines.
+//===----------------------------------------------------------------------===//
+
+Comdat *LLParser::getComdat(const std::string &Name, LocTy Loc) {
+ // Look this name up in the comdat symbol table.
+ Module::ComdatSymTabType &ComdatSymTab = M->getComdatSymbolTable();
+ Module::ComdatSymTabType::iterator I = ComdatSymTab.find(Name);
+ if (I != ComdatSymTab.end())
+ return &I->second;
+
+ // Otherwise, create a new forward reference for this value and remember it.
+ Comdat *C = M->getOrInsertComdat(Name);
+ ForwardRefComdats[Name] = Loc;
+ return C;
+}
+
+
+//===----------------------------------------------------------------------===//
// Helper Routines.
//===----------------------------------------------------------------------===//
@@ -1230,6 +1301,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_alwaysinline:
case lltok::kw_builtin:
case lltok::kw_inlinehint:
+ case lltok::kw_jumptable:
case lltok::kw_minsize:
case lltok::kw_naked:
case lltok::kw_nobuiltin:
@@ -1291,6 +1363,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_builtin:
case lltok::kw_cold:
case lltok::kw_inlinehint:
+ case lltok::kw_jumptable:
case lltok::kw_minsize:
case lltok::kw_naked:
case lltok::kw_nobuiltin:
@@ -2797,6 +2870,19 @@ bool LLParser::ParseGlobalTypeAndValue(Constant *&V) {
ParseGlobalValue(Ty, V);
}
+bool LLParser::parseOptionalComdat(Comdat *&C) {
+ C = nullptr;
+ if (!EatIfPresent(lltok::kw_comdat))
+ return false;
+ if (Lex.getKind() != lltok::ComdatVar)
+ return TokError("expected comdat variable");
+ LocTy Loc = Lex.getLoc();
+ StringRef Name = Lex.getStrVal();
+ C = getComdat(Name, Loc);
+ Lex.Lex();
+ return false;
+}
+
/// ParseGlobalValueVector
/// ::= /*empty*/
/// ::= TypeAndValue (',' TypeAndValue)*
@@ -3097,6 +3183,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
bool UnnamedAddr;
LocTy UnnamedAddrLoc;
Constant *Prefix = nullptr;
+ Comdat *C;
if (ParseArgumentList(ArgList, isVarArg) ||
ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
@@ -3105,6 +3192,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
BuiltinLoc) ||
(EatIfPresent(lltok::kw_section) &&
ParseStringConstant(Section)) ||
+ parseOptionalComdat(C) ||
ParseOptionalAlignment(Alignment) ||
(EatIfPresent(lltok::kw_gc) &&
ParseStringConstant(GC)) ||
@@ -3207,6 +3295,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
Fn->setUnnamedAddr(UnnamedAddr);
Fn->setAlignment(Alignment);
Fn->setSection(Section);
+ Fn->setComdat(C);
if (!GC.empty()) Fn->setGC(GC.c_str());
Fn->setPrefixData(Prefix);
ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
@@ -4011,7 +4100,8 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
else
return TokError("expected 'catch' or 'filter' clause type");
- Value *V; LocTy VLoc;
+ Value *V;
+ LocTy VLoc;
if (ParseTypeAndValue(V, VLoc, PFS)) {
delete LP;
return true;
@@ -4027,7 +4117,7 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
Error(VLoc, "'filter' clause has an invalid type");
}
- LP->addClause(V);
+ LP->addClause(cast<Constant>(V));
}
Inst = LP;
@@ -4263,8 +4353,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
}
/// ParseCmpXchg
-/// ::= 'cmpxchg' 'volatile'? TypeAndValue ',' TypeAndValue ',' TypeAndValue
-/// 'singlethread'? AtomicOrdering AtomicOrdering
+/// ::= 'cmpxchg' 'weak'? 'volatile'? TypeAndValue ',' TypeAndValue ','
+/// TypeAndValue 'singlethread'? AtomicOrdering AtomicOrdering
int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc;
bool AteExtraComma = false;
@@ -4272,6 +4362,10 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
AtomicOrdering FailureOrdering = NotAtomic;
SynchronizationScope Scope = CrossThread;
bool isVolatile = false;
+ bool isWeak = false;
+
+ if (EatIfPresent(lltok::kw_weak))
+ isWeak = true;
if (EatIfPresent(lltok::kw_volatile))
isVolatile = true;
@@ -4304,9 +4398,10 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
return Error(NewLoc, "cmpxchg operand must be power-of-two byte-sized"
" integer");
- AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering,
- FailureOrdering, Scope);
+ AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
+ Ptr, Cmp, New, SuccessOrdering, FailureOrdering, Scope);
CXI->setVolatile(isVolatile);
+ CXI->setWeak(isWeak);
Inst = CXI;
return AteExtraComma ? InstExtraComma : InstNormal;
}
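With the grammar change above, the parser now accepts, for example, 'cmpxchg weak i32* %p, i32 %old, i32 %new seq_cst monotonic'. A sketch of the instruction it builds for that form, restricted to the calls visible in ParseCmpXchg (the function name and the fixed orderings are illustrative):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static AtomicCmpXchgInst *buildWeakCmpXchg(Value *Ptr, Value *Cmp,
                                               Value *New) {
      AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
          Ptr, Cmp, New, SequentiallyConsistent, Monotonic, CrossThread);
      CXI->setVolatile(false);
      CXI->setWeak(true); // the new 'weak' marker
      return CXI;
    }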
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index e2bf462..2efb260 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -34,6 +34,7 @@ namespace llvm {
class Instruction;
class Constant;
class GlobalValue;
+ class Comdat;
class MDString;
class MDNode;
class StructType;
@@ -122,6 +123,9 @@ namespace llvm {
std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs;
std::vector<GlobalValue*> NumberedVals;
+ // Comdat forward reference information.
+ std::map<std::string, LocTy> ForwardRefComdats;
+
// References to blockaddress. The key is the function ValID, the value is
// a list of references to blocks in that function.
std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >
@@ -154,6 +158,10 @@ namespace llvm {
GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc);
GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc);
+ /// Get a Comdat with the specified name, creating a forward reference
+ /// record if needed.
+ Comdat *getComdat(const std::string &N, LocTy Loc);
+
// Helper Routines.
bool ParseToken(lltok::Kind T, const char *ErrMsg);
bool EatIfPresent(lltok::Kind T) {
@@ -197,6 +205,9 @@ namespace llvm {
bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
+ bool parseOptionalUnnamedAddr(bool &UnnamedAddr) {
+ return ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr);
+ }
bool ParseOptionalAddrSpace(unsigned &AddrSpace);
bool ParseOptionalParamAttrs(AttrBuilder &B);
bool ParseOptionalReturnAttrs(AttrBuilder &B);
@@ -239,9 +250,12 @@ namespace llvm {
bool ParseNamedGlobal();
bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage,
bool HasLinkage, unsigned Visibility,
- unsigned DLLStorageClass);
+ unsigned DLLStorageClass,
+ GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility,
- unsigned DLLStorageClass);
+ unsigned DLLStorageClass,
+ GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
+ bool parseComdat();
bool ParseStandaloneMetadata();
bool ParseNamedMetadata();
bool ParseMDString(MDString *&Result);
@@ -353,6 +367,7 @@ namespace llvm {
bool ParseGlobalValue(Type *Ty, Constant *&V);
bool ParseGlobalTypeAndValue(Constant *&V);
bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts);
+ bool parseOptionalComdat(Comdat *&C);
bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS);
bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS);
bool ParseMDNodeVector(SmallVectorImpl<Value*> &, PerFunctionState *PFS);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index b6b7d82..534d824 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -42,7 +42,8 @@ namespace lltok {
kw_linker_private, // NOTE: deprecated, for parser compatibility
kw_linker_private_weak, // NOTE: deprecated, for parser compatibility
kw_linkonce, kw_linkonce_odr,
- kw_weak, kw_weak_odr, kw_appending,
+ kw_weak, // Used as a linkage, and a modifier for "cmpxchg".
+ kw_weak_odr, kw_appending,
kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
kw_default, kw_hidden, kw_protected,
kw_unnamed_addr,
@@ -107,6 +108,7 @@ namespace lltok {
kw_cold,
kw_inlinehint,
kw_inreg,
+ kw_jumptable,
kw_minsize,
kw_naked,
kw_nest,
@@ -140,6 +142,15 @@ namespace lltok {
kw_type,
kw_opaque,
+ kw_comdat,
+
+ // Comdat types
+ kw_any,
+ kw_exactmatch,
+ kw_largest,
+ kw_noduplicates,
+ kw_samesize,
+
kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule,
kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno,
kw_ueq, kw_une,
@@ -178,6 +189,7 @@ namespace lltok {
// String valued tokens (StrVal).
LabelStr, // foo:
GlobalVar, // @foo @"foo"
+ ComdatVar, // $foo
LocalVar, // %foo %"foo"
MetadataVar, // !foo
StringConstant, // "foo"
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index 2606bc2..91bb51c 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp
@@ -17,8 +17,8 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <cstring>
+#include <system_error>
using namespace llvm;
Module *llvm::ParseAssembly(MemoryBuffer *F,
@@ -41,21 +41,21 @@ Module *llvm::ParseAssembly(MemoryBuffer *F,
Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err,
LLVMContext &Context) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
- "Could not open input file: " + ec.message());
+ "Could not open input file: " + EC.message());
return nullptr;
}
- return ParseAssembly(File.release(), nullptr, Err, Context);
+ return ParseAssembly(FileOrErr.get().release(), nullptr, Err, Context);
}
Module *llvm::ParseAssemblyString(const char *AsmString, Module *M,
SMDiagnostic &Err, LLVMContext &Context) {
MemoryBuffer *F =
- MemoryBuffer::getMemBuffer(StringRef(AsmString, strlen(AsmString)),
- "<string>");
+ MemoryBuffer::getMemBuffer(StringRef(AsmString), "<string>");
return ParseAssembly(F, M, Err, Context);
}
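The rewrites above move this file from the out-parameter style built on llvm/Support/system_error.h to ErrorOr plus std::error_code. A minimal sketch of the new idiom, using only calls visible in the hunk:

    #include "llvm/Support/MemoryBuffer.h"
    #include <string>
    #include <system_error>
    using namespace llvm;

    static std::error_code checkReadable(const std::string &Filename) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
          MemoryBuffer::getFileOrSTDIN(Filename);
      if (std::error_code EC = FileOrErr.getError())
        return EC; // diagnostics are built from EC.message()
      // FileOrErr.get() owns the buffer; it is freed automatically here.
      return std::error_code();
    }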
diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp
index 716299f..b5886c1 100644
--- a/lib/Bitcode/Reader/BitReader.cpp
+++ b/lib/Bitcode/Reader/BitReader.cpp
@@ -32,7 +32,7 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef,
char **OutMessage) {
ErrorOr<Module *> ModuleOrErr =
parseBitcodeFile(unwrap(MemBuf), *unwrap(ContextRef));
- if (error_code EC = ModuleOrErr.getError()) {
+ if (std::error_code EC = ModuleOrErr.getError()) {
if (OutMessage)
*OutMessage = strdup(EC.message().c_str());
*OutModule = wrap((Module*)nullptr);
@@ -54,7 +54,7 @@ LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef,
ErrorOr<Module *> ModuleOrErr =
getLazyBitcodeModule(unwrap(MemBuf), *unwrap(ContextRef));
- if (error_code EC = ModuleOrErr.getError()) {
+ if (std::error_code EC = ModuleOrErr.getError()) {
*OutM = wrap((Module *)nullptr);
if (OutMessage)
*OutMessage = strdup(EC.message().c_str());
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 4170f98..192f753 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -39,12 +39,11 @@ void BitcodeReader::materializeForwardReferencedFunctions() {
}
void BitcodeReader::FreeState() {
- if (BufferOwned)
- delete Buffer;
Buffer = nullptr;
std::vector<Type*>().swap(TypeList);
ValueList.clear();
MDValueList.clear();
+ std::vector<Comdat *>().swap(ComdatList);
std::vector<AttributeSet>().swap(MAttributes);
std::vector<BasicBlock*>().swap(FunctionBBs);
@@ -205,6 +204,22 @@ static SynchronizationScope GetDecodedSynchScope(unsigned Val) {
}
}
+static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
+ switch (Val) {
+ default: // Map unknown selection kinds to any.
+ case bitc::COMDAT_SELECTION_KIND_ANY:
+ return Comdat::Any;
+ case bitc::COMDAT_SELECTION_KIND_EXACT_MATCH:
+ return Comdat::ExactMatch;
+ case bitc::COMDAT_SELECTION_KIND_LARGEST:
+ return Comdat::Largest;
+ case bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES:
+ return Comdat::NoDuplicates;
+ case bitc::COMDAT_SELECTION_KIND_SAME_SIZE:
+ return Comdat::SameSize;
+ }
+}
+
static void UpgradeDLLImportExportLinkage(llvm::GlobalValue *GV, unsigned Val) {
switch (Val) {
case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break;
@@ -470,7 +485,7 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
(EncodedAttrs & 0xffff));
}
-error_code BitcodeReader::ParseAttributeBlock() {
+std::error_code BitcodeReader::ParseAttributeBlock() {
if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
return Error(InvalidRecord);
@@ -490,7 +505,7 @@ error_code BitcodeReader::ParseAttributeBlock() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -549,6 +564,8 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
return Attribute::InlineHint;
case bitc::ATTR_KIND_IN_REG:
return Attribute::InReg;
+ case bitc::ATTR_KIND_JUMP_TABLE:
+ return Attribute::JumpTable;
case bitc::ATTR_KIND_MIN_SIZE:
return Attribute::MinSize;
case bitc::ATTR_KIND_NAKED:
@@ -614,15 +631,15 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
}
}
-error_code BitcodeReader::ParseAttrKind(uint64_t Code,
- Attribute::AttrKind *Kind) {
+std::error_code BitcodeReader::ParseAttrKind(uint64_t Code,
+ Attribute::AttrKind *Kind) {
*Kind = GetAttrFromCode(Code);
if (*Kind == Attribute::None)
return Error(InvalidValue);
- return error_code::success();
+ return std::error_code();
}
-error_code BitcodeReader::ParseAttributeGroupBlock() {
+std::error_code BitcodeReader::ParseAttributeGroupBlock() {
if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
return Error(InvalidRecord);
@@ -640,7 +657,7 @@ error_code BitcodeReader::ParseAttributeGroupBlock() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -662,13 +679,13 @@ error_code BitcodeReader::ParseAttributeGroupBlock() {
for (unsigned i = 2, e = Record.size(); i != e; ++i) {
if (Record[i] == 0) { // Enum attribute
Attribute::AttrKind Kind;
- if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+ if (std::error_code EC = ParseAttrKind(Record[++i], &Kind))
return EC;
B.addAttribute(Kind);
} else if (Record[i] == 1) { // Align attribute
Attribute::AttrKind Kind;
- if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+ if (std::error_code EC = ParseAttrKind(Record[++i], &Kind))
return EC;
if (Kind == Attribute::Alignment)
B.addAlignmentAttr(Record[++i]);
@@ -704,14 +721,14 @@ error_code BitcodeReader::ParseAttributeGroupBlock() {
}
}
-error_code BitcodeReader::ParseTypeTable() {
+std::error_code BitcodeReader::ParseTypeTable() {
if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
return Error(InvalidRecord);
return ParseTypeTableBody();
}
-error_code BitcodeReader::ParseTypeTableBody() {
+std::error_code BitcodeReader::ParseTypeTableBody() {
if (!TypeList.empty())
return Error(InvalidMultipleBlocks);
@@ -731,7 +748,7 @@ error_code BitcodeReader::ParseTypeTableBody() {
case BitstreamEntry::EndBlock:
if (NumRecords != TypeList.size())
return Error(MalformedBlock);
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -931,7 +948,7 @@ error_code BitcodeReader::ParseTypeTableBody() {
}
}
-error_code BitcodeReader::ParseValueSymbolTable() {
+std::error_code BitcodeReader::ParseValueSymbolTable() {
if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
return Error(InvalidRecord);
@@ -947,7 +964,7 @@ error_code BitcodeReader::ParseValueSymbolTable() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -985,7 +1002,7 @@ error_code BitcodeReader::ParseValueSymbolTable() {
}
}
-error_code BitcodeReader::ParseMetadata() {
+std::error_code BitcodeReader::ParseMetadata() {
unsigned NextMDValueNo = MDValueList.size();
if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
@@ -1002,7 +1019,7 @@ error_code BitcodeReader::ParseMetadata() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1062,7 +1079,8 @@ error_code BitcodeReader::ParseMetadata() {
break;
}
case bitc::METADATA_STRING: {
- SmallString<8> String(Record.begin(), Record.end());
+ std::string String(Record.begin(), Record.end());
+ llvm::UpgradeMDStringConstant(String);
Value *V = MDString::get(Context, String);
MDValueList.AssignValue(V, NextMDValueNo++);
break;
@@ -1094,31 +1112,9 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
return 1ULL << 63;
}
-// FIXME: Delete this in LLVM 4.0 and just assert that the aliasee is a
-// GlobalObject.
-static GlobalObject &
-getGlobalObjectInExpr(const DenseMap<GlobalAlias *, Constant *> &Map,
- Constant &C) {
- auto *GO = dyn_cast<GlobalObject>(&C);
- if (GO)
- return *GO;
-
- auto *GA = dyn_cast<GlobalAlias>(&C);
- if (GA)
- return getGlobalObjectInExpr(Map, *Map.find(GA)->second);
-
- auto &CE = cast<ConstantExpr>(C);
- assert(CE.getOpcode() == Instruction::BitCast ||
- CE.getOpcode() == Instruction::GetElementPtr ||
- CE.getOpcode() == Instruction::AddrSpaceCast);
- if (CE.getOpcode() == Instruction::GetElementPtr)
- assert(cast<GEPOperator>(CE).hasAllZeroIndices());
- return getGlobalObjectInExpr(Map, *CE.getOperand(0));
-}
-
/// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
/// values and aliases that we can.
-error_code BitcodeReader::ResolveGlobalAndAliasInits() {
+std::error_code BitcodeReader::ResolveGlobalAndAliasInits() {
std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
@@ -1141,30 +1137,19 @@ error_code BitcodeReader::ResolveGlobalAndAliasInits() {
GlobalInitWorklist.pop_back();
}
- // FIXME: Delete this in LLVM 4.0
- // Older versions of llvm could write an alias pointing to another. We cannot
- // construct those aliases, so we first collect an alias to aliasee expression
- // and then compute the actual aliasee.
- DenseMap<GlobalAlias *, Constant *> AliasInit;
-
while (!AliasInitWorklist.empty()) {
unsigned ValID = AliasInitWorklist.back().second;
if (ValID >= ValueList.size()) {
AliasInits.push_back(AliasInitWorklist.back());
} else {
if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
- AliasInit.insert(std::make_pair(AliasInitWorklist.back().first, C));
+ AliasInitWorklist.back().first->setAliasee(C);
else
return Error(ExpectedConstant);
}
AliasInitWorklist.pop_back();
}
- for (auto &Pair : AliasInit) {
- auto &GO = getGlobalObjectInExpr(AliasInit, *Pair.second);
- Pair.first->setAliasee(&GO);
- }
-
while (!FunctionPrefixWorklist.empty()) {
unsigned ValID = FunctionPrefixWorklist.back().second;
if (ValID >= ValueList.size()) {
@@ -1178,7 +1163,7 @@ error_code BitcodeReader::ResolveGlobalAndAliasInits() {
FunctionPrefixWorklist.pop_back();
}
- return error_code::success();
+ return std::error_code();
}
static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
@@ -1189,7 +1174,7 @@ static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
return APInt(TypeBits, Words);
}
-error_code BitcodeReader::ParseConstants() {
+std::error_code BitcodeReader::ParseConstants() {
if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
return Error(InvalidRecord);
@@ -1212,7 +1197,7 @@ error_code BitcodeReader::ParseConstants() {
// Once all the constants have been read, go through and resolve forward
// references.
ValueList.ResolveConstantForwardRefs();
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1627,7 +1612,7 @@ error_code BitcodeReader::ParseConstants() {
}
}
-error_code BitcodeReader::ParseUseLists() {
+std::error_code BitcodeReader::ParseUseLists() {
if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
return Error(InvalidRecord);
@@ -1642,7 +1627,7 @@ error_code BitcodeReader::ParseUseLists() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1667,7 +1652,7 @@ error_code BitcodeReader::ParseUseLists() {
/// RememberAndSkipFunctionBody - When we see the block for a function body,
/// remember where it is and then skip it. This lets us lazily deserialize the
/// functions.
-error_code BitcodeReader::RememberAndSkipFunctionBody() {
+std::error_code BitcodeReader::RememberAndSkipFunctionBody() {
// Get the function we are talking about.
if (FunctionsWithBodies.empty())
return Error(InsufficientFunctionProtos);
@@ -1682,10 +1667,10 @@ error_code BitcodeReader::RememberAndSkipFunctionBody() {
// Skip over the function block for now.
if (Stream.SkipBlock())
return Error(InvalidRecord);
- return error_code::success();
+ return std::error_code();
}
-error_code BitcodeReader::GlobalCleanup() {
+std::error_code BitcodeReader::GlobalCleanup() {
// Patch the initializers for globals and aliases up.
ResolveGlobalAndAliasInits();
if (!GlobalInits.empty() || !AliasInits.empty())
@@ -1711,10 +1696,10 @@ error_code BitcodeReader::GlobalCleanup() {
// want lazy deserialization.
std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
- return error_code::success();
+ return std::error_code();
}
-error_code BitcodeReader::ParseModule(bool Resume) {
+std::error_code BitcodeReader::ParseModule(bool Resume) {
if (Resume)
Stream.JumpToBit(NextUnreadBit);
else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
@@ -1745,30 +1730,30 @@ error_code BitcodeReader::ParseModule(bool Resume) {
return Error(MalformedBlock);
break;
case bitc::PARAMATTR_BLOCK_ID:
- if (error_code EC = ParseAttributeBlock())
+ if (std::error_code EC = ParseAttributeBlock())
return EC;
break;
case bitc::PARAMATTR_GROUP_BLOCK_ID:
- if (error_code EC = ParseAttributeGroupBlock())
+ if (std::error_code EC = ParseAttributeGroupBlock())
return EC;
break;
case bitc::TYPE_BLOCK_ID_NEW:
- if (error_code EC = ParseTypeTable())
+ if (std::error_code EC = ParseTypeTable())
return EC;
break;
case bitc::VALUE_SYMTAB_BLOCK_ID:
- if (error_code EC = ParseValueSymbolTable())
+ if (std::error_code EC = ParseValueSymbolTable())
return EC;
SeenValueSymbolTable = true;
break;
case bitc::CONSTANTS_BLOCK_ID:
- if (error_code EC = ParseConstants())
+ if (std::error_code EC = ParseConstants())
return EC;
- if (error_code EC = ResolveGlobalAndAliasInits())
+ if (std::error_code EC = ResolveGlobalAndAliasInits())
return EC;
break;
case bitc::METADATA_BLOCK_ID:
- if (error_code EC = ParseMetadata())
+ if (std::error_code EC = ParseMetadata())
return EC;
break;
case bitc::FUNCTION_BLOCK_ID:
@@ -1776,12 +1761,12 @@ error_code BitcodeReader::ParseModule(bool Resume) {
// FunctionsWithBodies list.
if (!SeenFirstFunctionBody) {
std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
- if (error_code EC = GlobalCleanup())
+ if (std::error_code EC = GlobalCleanup())
return EC;
SeenFirstFunctionBody = true;
}
- if (error_code EC = RememberAndSkipFunctionBody())
+ if (std::error_code EC = RememberAndSkipFunctionBody())
return EC;
// For streaming bitcode, suspend parsing when we reach the function
// bodies. Subsequent materialization calls will resume it when
@@ -1791,11 +1776,11 @@ error_code BitcodeReader::ParseModule(bool Resume) {
// just finish the parse now.
if (LazyStreamer && SeenValueSymbolTable) {
NextUnreadBit = Stream.GetCurrentBitNo();
- return error_code::success();
+ return std::error_code();
}
break;
case bitc::USELIST_BLOCK_ID:
- if (error_code EC = ParseUseLists())
+ if (std::error_code EC = ParseUseLists())
return EC;
break;
}
@@ -1870,6 +1855,20 @@ error_code BitcodeReader::ParseModule(bool Resume) {
GCTable.push_back(S);
break;
}
+ case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name]
+ if (Record.size() < 2)
+ return Error(InvalidRecord);
+ Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
+ unsigned ComdatNameSize = Record[1];
+ std::string ComdatName;
+ ComdatName.reserve(ComdatNameSize);
+ for (unsigned i = 0; i != ComdatNameSize; ++i)
+ ComdatName += (char)Record[2 + i];
+ Comdat *C = TheModule->getOrInsertComdat(ComdatName);
+ C->setSelectionKind(SK);
+ ComdatList.push_back(C);
+ break;
+ }
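Decoded above, a MODULE_CODE_COMDAT record is a flat array of 64-bit values: the selection-kind code, the name length, then one element per name byte. As a worked, purely hypothetical example (assuming the Any kind encodes as 1), a comdat named "foo" would arrive as:

    // COMDAT: [selection_kind, name_size, name_chars...]  (illustrative fragment)
    uint64_t Record[] = {1, 3, 'f', 'o', 'o'};
    // Record[0]     -> SelectionKind (assumed COMDAT_SELECTION_KIND_ANY == 1)
    // Record[1]     -> ComdatNameSize
    // Record[2 + i] -> i-th byte of "foo", rebuilt by the loop above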
// GLOBALVAR: [pointer type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
// unnamed_addr, dllstorageclass]
@@ -1930,6 +1929,12 @@ error_code BitcodeReader::ParseModule(bool Resume) {
// Remember which value to use for the global initializer.
if (unsigned InitID = Record[2])
GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
+
+ if (Record.size() > 11)
+ if (unsigned ComdatID = Record[11]) {
+ assert(ComdatID <= ComdatList.size());
+ NewGV->setComdat(ComdatList[ComdatID - 1]);
+ }
break;
}
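The `Record.size() > 11` guard just above is the usual bitcode convention for optional trailing fields: records written by older producers are simply shorter, and within the field the value 0 is reserved to mean "absent" while real comdat IDs start at 1. A generic sketch of the convention (the names and field index are hypothetical placeholders, not reader API):

    // Reading a field that was appended to the record format later.
    const unsigned FieldIdx = 11;           // hypothetical position of the field
    if (Record.size() > FieldIdx)           // older bitcode: record ends sooner
      if (unsigned ID = Record[FieldIdx])   // 0 is reserved for "absent"
        applyField(Obj, ID - 1);            // stored IDs are 1-based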
// FUNCTION: [type, callingconv, isproto, linkage, paramattr,
@@ -1983,6 +1988,12 @@ error_code BitcodeReader::ParseModule(bool Resume) {
else
UpgradeDLLImportExportLinkage(Func, Record[3]);
+ if (Record.size() > 12)
+ if (unsigned ComdatID = Record[12]) {
+ assert(ComdatID <= ComdatList.size());
+ Func->setComdat(ComdatList[ComdatID - 1]);
+ }
+
ValueList.push_back(Func);
// If this is a function with a body, remember the prototype we are
@@ -2017,6 +2028,10 @@ error_code BitcodeReader::ParseModule(bool Resume) {
NewGA->setDLLStorageClass(GetDecodedDLLStorageClass(Record[4]));
else
UpgradeDLLImportExportLinkage(NewGA, Record[2]);
+ if (Record.size() > 5)
+ NewGA->setThreadLocalMode(GetDecodedThreadLocalMode(Record[5]));
+ if (Record.size() > 6)
+ NewGA->setUnnamedAddr(Record[6]);
ValueList.push_back(NewGA);
AliasInits.push_back(std::make_pair(NewGA, Record[1]));
break;
@@ -2033,10 +2048,10 @@ error_code BitcodeReader::ParseModule(bool Resume) {
}
}
-error_code BitcodeReader::ParseBitcodeInto(Module *M) {
+std::error_code BitcodeReader::ParseBitcodeInto(Module *M) {
TheModule = nullptr;
- if (error_code EC = InitStream())
+ if (std::error_code EC = InitStream())
return EC;
// Sniff for the signature.
@@ -2052,7 +2067,7 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) {
// need to understand them all.
while (1) {
if (Stream.AtEndOfStream())
- return error_code::success();
+ return std::error_code();
BitstreamEntry Entry =
Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
@@ -2061,7 +2076,7 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::SubBlock:
switch (Entry.ID) {
@@ -2074,10 +2089,10 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) {
if (TheModule)
return Error(InvalidMultipleBlocks);
TheModule = M;
- if (error_code EC = ParseModule(false))
+ if (std::error_code EC = ParseModule(false))
return EC;
if (LazyStreamer)
- return error_code::success();
+ return std::error_code();
break;
default:
if (Stream.SkipBlock())
@@ -2094,19 +2109,20 @@ error_code BitcodeReader::ParseBitcodeInto(Module *M) {
if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 &&
Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
Stream.AtEndOfStream())
- return error_code::success();
+ return std::error_code();
return Error(InvalidRecord);
}
}
}
-error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
+ErrorOr<std::string> BitcodeReader::parseModuleTriple() {
if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
+ std::string Triple;
// Read all the records for this module.
while (1) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -2116,7 +2132,7 @@ error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return Triple;
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -2135,10 +2151,11 @@ error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
}
Record.clear();
}
+ llvm_unreachable("Exit infinite loop");
}
-error_code BitcodeReader::ParseTriple(std::string &Triple) {
- if (error_code EC = InitStream())
+ErrorOr<std::string> BitcodeReader::parseTriple() {
+ if (std::error_code EC = InitStream())
return EC;
// Sniff for the signature.
@@ -2159,11 +2176,11 @@ error_code BitcodeReader::ParseTriple(std::string &Triple) {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::SubBlock:
if (Entry.ID == bitc::MODULE_BLOCK_ID)
- return ParseModuleTriple(Triple);
+ return parseModuleTriple();
// Ignore other sub-blocks.
if (Stream.SkipBlock())
@@ -2178,7 +2195,7 @@ error_code BitcodeReader::ParseTriple(std::string &Triple) {
}
/// ParseMetadataAttachment - Parse metadata attachments.
-error_code BitcodeReader::ParseMetadataAttachment() {
+std::error_code BitcodeReader::ParseMetadataAttachment() {
if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
return Error(InvalidRecord);
@@ -2191,7 +2208,7 @@ error_code BitcodeReader::ParseMetadataAttachment() {
case BitstreamEntry::Error:
return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return error_code::success();
+ return std::error_code();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -2225,7 +2242,7 @@ error_code BitcodeReader::ParseMetadataAttachment() {
}
/// ParseFunctionBody - Lazily parse the specified function body block.
-error_code BitcodeReader::ParseFunctionBody(Function *F) {
+std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
return Error(InvalidRecord);
@@ -2261,20 +2278,20 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) {
return Error(InvalidRecord);
break;
case bitc::CONSTANTS_BLOCK_ID:
- if (error_code EC = ParseConstants())
+ if (std::error_code EC = ParseConstants())
return EC;
NextValueNo = ValueList.size();
break;
case bitc::VALUE_SYMTAB_BLOCK_ID:
- if (error_code EC = ParseValueSymbolTable())
+ if (std::error_code EC = ParseValueSymbolTable())
return EC;
break;
case bitc::METADATA_ATTACHMENT_ID:
- if (error_code EC = ParseMetadataAttachment())
+ if (std::error_code EC = ParseMetadataAttachment())
return EC;
break;
case bitc::METADATA_BLOCK_ID:
- if (error_code EC = ParseMetadata())
+ if (std::error_code EC = ParseMetadata())
return EC;
break;
}
@@ -2857,7 +2874,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) {
assert((CT != LandingPadInst::Filter ||
isa<ArrayType>(Val->getType())) &&
"Filter clause has invalid type!");
- LP->addClause(Val);
+ LP->addClause(cast<Constant>(Val));
}
I = LP;
@@ -2950,7 +2967,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_CMPXCHG: {
// CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, synchscope,
- // failureordering]
+ // failureordering?, isweak?]
unsigned OpNum = 0;
Value *Ptr, *Cmp, *New;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -2958,7 +2975,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) {
cast<PointerType>(Ptr->getType())->getElementType(), Cmp) ||
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), New) ||
- (OpNum + 3 != Record.size() && OpNum + 4 != Record.size()))
+ (Record.size() < OpNum + 3 || Record.size() > OpNum + 5))
return Error(InvalidRecord);
AtomicOrdering SuccessOrdering = GetDecodedOrdering(Record[OpNum+1]);
if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered)
@@ -2975,6 +2992,17 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) {
I = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, FailureOrdering,
SynchScope);
cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
+
+ if (Record.size() < 8) {
+ // Before weak cmpxchgs existed, the instruction simply returned the
+ // value loaded from memory, so bitcode files from that era will be
+ // expecting the first component of a modern cmpxchg.
+ CurBB->getInstList().push_back(I);
+ I = ExtractValueInst::Create(I, 0);
+ } else {
+ cast<AtomicCmpXchgInst>(I)->setWeak(Record[OpNum+4]);
+ }
+
InstructionList.push_back(I);
break;
}
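The upgrade block above exists because cmpxchg changed shape: it used to yield only the loaded value, while the weak/strong form yields a (loaded value, success) pair, so old bitcode gets an extractvalue of element 0 spliced in after the instruction. A standalone model of the two result shapes using std::atomic (illustrative only, not the reader's code):

    #include <atomic>
    #include <utility>

    // New-style result: the pair a modern cmpxchg produces.
    std::pair<int, bool> newCmpXchg(std::atomic<int> &A, int Cmp, int New) {
      int Loaded = Cmp;
      bool Success = A.compare_exchange_strong(Loaded, New);
      return {Loaded, Success};
    }

    // Old-style result: just the loaded value -- what the inserted
    // ExtractValueInst::Create(I, 0) recovers for pre-weak bitcode.
    int oldCmpXchg(std::atomic<int> &A, int Cmp, int New) {
      return newCmpXchg(A, Cmp, New).first;
    }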
@@ -3144,27 +3172,29 @@ OutOfRecordLoop:
ValueList.shrinkTo(ModuleValueListSize);
MDValueList.shrinkTo(ModuleMDValueListSize);
std::vector<BasicBlock*>().swap(FunctionBBs);
- return error_code::success();
+ return std::error_code();
}
/// Find the function body in the bitcode stream
-error_code BitcodeReader::FindFunctionInStream(Function *F,
- DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator) {
+std::error_code BitcodeReader::FindFunctionInStream(
+ Function *F,
+ DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator) {
while (DeferredFunctionInfoIterator->second == 0) {
if (Stream.AtEndOfStream())
return Error(CouldNotFindFunctionInStream);
// ParseModule will parse the next body in the stream and set its
// position in the DeferredFunctionInfo map.
- if (error_code EC = ParseModule(true))
+ if (std::error_code EC = ParseModule(true))
return EC;
}
- return error_code::success();
+ return std::error_code();
}
//===----------------------------------------------------------------------===//
// GVMaterializer implementation
//===----------------------------------------------------------------------===//
+void BitcodeReader::releaseBuffer() { Buffer.release(); }
bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
if (const Function *F = dyn_cast<Function>(GV)) {
@@ -3174,24 +3204,24 @@ bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
return false;
}
-error_code BitcodeReader::Materialize(GlobalValue *GV) {
+std::error_code BitcodeReader::Materialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If it's not a function or is already material, ignore the request.
if (!F || !F->isMaterializable())
- return error_code::success();
+ return std::error_code();
DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
// If its position is recorded as 0, its body is somewhere in the stream
// but we haven't seen it yet.
if (DFII->second == 0 && LazyStreamer)
- if (error_code EC = FindFunctionInStream(F, DFII))
+ if (std::error_code EC = FindFunctionInStream(F, DFII))
return EC;
// Move the bit stream to the saved position of the deferred function body.
Stream.JumpToBit(DFII->second);
- if (error_code EC = ParseFunctionBody(F))
+ if (std::error_code EC = ParseFunctionBody(F))
return EC;
// Upgrade any old intrinsic calls in the function.
@@ -3206,7 +3236,7 @@ error_code BitcodeReader::Materialize(GlobalValue *GV) {
}
}
- return error_code::success();
+ return std::error_code();
}
bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
@@ -3228,8 +3258,7 @@ void BitcodeReader::Dematerialize(GlobalValue *GV) {
F->deleteBody();
}
-
-error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::MaterializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
@@ -3237,7 +3266,7 @@ error_code BitcodeReader::MaterializeModule(Module *M) {
for (Module::iterator F = TheModule->begin(), E = TheModule->end();
F != E; ++F) {
if (F->isMaterializable()) {
- if (error_code EC = Materialize(F))
+ if (std::error_code EC = Materialize(F))
return EC;
}
}
@@ -3270,16 +3299,16 @@ error_code BitcodeReader::MaterializeModule(Module *M) {
UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
UpgradeDebugInfo(*M);
- return error_code::success();
+ return std::error_code();
}
-error_code BitcodeReader::InitStream() {
+std::error_code BitcodeReader::InitStream() {
if (LazyStreamer)
return InitLazyStream();
return InitStreamFromBuffer();
}
-error_code BitcodeReader::InitStreamFromBuffer() {
+std::error_code BitcodeReader::InitStreamFromBuffer() {
const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
@@ -3299,10 +3328,10 @@ error_code BitcodeReader::InitStreamFromBuffer() {
StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
Stream.init(*StreamFile);
- return error_code::success();
+ return std::error_code();
}
-error_code BitcodeReader::InitLazyStream() {
+std::error_code BitcodeReader::InitLazyStream() {
// Check and strip off the bitcode wrapper; BitstreamReader expects never to
// see it.
StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
@@ -3323,12 +3352,12 @@ error_code BitcodeReader::InitLazyStream() {
Bytes->dropLeadingBytes(bitcodeStart - buf);
Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
}
- return error_code::success();
+ return std::error_code();
}
namespace {
-class BitcodeErrorCategoryType : public error_category {
- const char *name() const override {
+class BitcodeErrorCategoryType : public std::error_category {
+ const char *name() const LLVM_NOEXCEPT override {
return "llvm.bitcode";
}
std::string message(int IE) const override {
@@ -3378,7 +3407,7 @@ class BitcodeErrorCategoryType : public error_category {
};
}
-const error_category &BitcodeReader::BitcodeErrorCategory() {
+const std::error_category &BitcodeReader::BitcodeErrorCategory() {
static BitcodeErrorCategoryType O;
return O;
}
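This is the standard std::error_code integration: one singleton category supplies the namespace ("llvm.bitcode") and the message strings for the reader's ErrorType enum values. A stripped-down sketch of the same wiring, with hypothetical names:

    #include <string>
    #include <system_error>

    namespace {
    class DemoErrorCategory : public std::error_category {
      const char *name() const noexcept override { return "demo"; }
      std::string message(int IE) const override {
        return IE == 1 ? "invalid record" : "unknown error";
      }
    };
    }

    static const std::error_category &demoCategory() {
      static DemoErrorCategory O;   // one instance per category, as above
      return O;
    }

    std::error_code makeError(int E) { return {E, demoCategory()}; }
    // A default-constructed std::error_code() is falsy, which is why the
    // reader can use it as its success value throughout this patch.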
@@ -3394,12 +3423,11 @@ ErrorOr<Module *> llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
Module *M = new Module(Buffer->getBufferIdentifier(), Context);
BitcodeReader *R = new BitcodeReader(Buffer, Context);
M->setMaterializer(R);
- if (error_code EC = R->ParseBitcodeInto(M)) {
+ if (std::error_code EC = R->ParseBitcodeInto(M)) {
+ R->releaseBuffer(); // Never take ownership on error.
delete M; // Also deletes R.
return EC;
}
- // Have the BitcodeReader dtor delete 'Buffer'.
- R->setBufferOwned(true);
R->materializeForwardReferencedFunctions();
@@ -3414,13 +3442,12 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name,
Module *M = new Module(name, Context);
BitcodeReader *R = new BitcodeReader(streamer, Context);
M->setMaterializer(R);
- if (error_code EC = R->ParseBitcodeInto(M)) {
+ if (std::error_code EC = R->ParseBitcodeInto(M)) {
if (ErrMsg)
*ErrMsg = EC.message();
delete M; // Also deletes R.
return nullptr;
}
- R->setBufferOwned(false); // no buffer to delete
return M;
}
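Because Buffer is now a std::unique_ptr (see the header changes below), the error paths must spell out ownership: releaseBuffer() detaches the buffer so that `delete M` (which also destroys the reader) cannot free memory the caller still owns. A small standalone sketch of that handoff, with hypothetical types:

    #include <memory>

    struct Buf {};
    struct Reader {
      std::unique_ptr<Buf> B;
      explicit Reader(Buf *P) : B(P) {}       // takes ownership by default
      void releaseBuffer() { B.release(); }   // hand it back to the caller
    };

    bool parse(Reader &) { return false; }    // pretend parsing failed

    int main() {
      Buf *B = new Buf;
      Reader R(B);
      if (!parse(R)) {
        R.releaseBuffer();  // never take ownership on error
        delete B;           // the caller cleans up instead
        return 1;
      }
      return 0;             // on success the Reader frees B in its destructor
    }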
@@ -3430,13 +3457,8 @@ ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBuffer *Buffer,
if (!ModuleOrErr)
return ModuleOrErr;
Module *M = ModuleOrErr.get();
-
- // Don't let the BitcodeReader dtor delete 'Buffer', regardless of whether
- // there was an error.
- static_cast<BitcodeReader*>(M->getMaterializer())->setBufferOwned(false);
-
// Read in the entire module, and destroy the BitcodeReader.
- if (error_code EC = M->materializeAllPermanently()) {
+ if (std::error_code EC = M->materializeAllPermanently(true)) {
delete M;
return EC;
}
@@ -3448,17 +3470,12 @@ ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBuffer *Buffer,
}
std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
- LLVMContext& Context,
- std::string *ErrMsg) {
+ LLVMContext &Context) {
BitcodeReader *R = new BitcodeReader(Buffer, Context);
- // Don't let the BitcodeReader dtor delete 'Buffer'.
- R->setBufferOwned(false);
-
- std::string Triple("");
- if (error_code EC = R->ParseTriple(Triple))
- if (ErrMsg)
- *ErrMsg = EC.message();
-
+ ErrorOr<std::string> Triple = R->parseTriple();
+ R->releaseBuffer();
delete R;
- return Triple;
+ if (Triple.getError())
+ return "";
+ return Triple.get();
}
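getBitcodeTargetTriple() shows the consumption side of the new ErrorOr<std::string> API: one return value carries either the triple or a std::error_code, replacing the old out-parameter plus ErrMsg pair. The general pattern, as a sketch rather than a prescribed API:

    // ErrorOr<T> holds either a T or a std::error_code.
    ErrorOr<std::string> TripleOrErr = R->parseTriple();
    if (std::error_code EC = TripleOrErr.getError())
      return "";                // or surface EC.message() to the caller
    return TripleOrErr.get();   // only valid when getError() was falsy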
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index 593d8f9..1d4869a 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -22,10 +22,11 @@
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
#include <vector>
namespace llvm {
+ class Comdat;
class MemoryBuffer;
class LLVMContext;
@@ -125,8 +126,7 @@ public:
class BitcodeReader : public GVMaterializer {
LLVMContext &Context;
Module *TheModule;
- MemoryBuffer *Buffer;
- bool BufferOwned;
+ std::unique_ptr<MemoryBuffer> Buffer;
std::unique_ptr<BitstreamReader> StreamFile;
BitstreamCursor Stream;
DataStreamer *LazyStreamer;
@@ -136,6 +136,7 @@ class BitcodeReader : public GVMaterializer {
std::vector<Type*> TypeList;
BitcodeReaderValueList ValueList;
BitcodeReaderMDValueList MDValueList;
+ std::vector<Comdat *> ComdatList;
SmallVector<Instruction *, 64> InstructionList;
SmallVector<SmallVector<uint64_t, 64>, 64> UseListRecords;
@@ -193,7 +194,7 @@ class BitcodeReader : public GVMaterializer {
/// not need this flag.
bool UseRelativeIDs;
- static const error_category &BitcodeErrorCategory();
+ static const std::error_category &BitcodeErrorCategory();
public:
enum ErrorType {
@@ -219,47 +220,39 @@ public:
InvalidValue // Invalid version, inst number, attr number, etc
};
- error_code Error(ErrorType E) {
- return error_code(E, BitcodeErrorCategory());
+ std::error_code Error(ErrorType E) {
+ return std::error_code(E, BitcodeErrorCategory());
}
explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
- : Context(C), TheModule(nullptr), Buffer(buffer), BufferOwned(false),
- LazyStreamer(nullptr), NextUnreadBit(0), SeenValueSymbolTable(false),
- ValueList(C), MDValueList(C),
- SeenFirstFunctionBody(false), UseRelativeIDs(false) {
- }
+ : Context(C), TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr),
+ NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
+ MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {}
explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
- : Context(C), TheModule(nullptr), Buffer(nullptr), BufferOwned(false),
- LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
- ValueList(C), MDValueList(C),
- SeenFirstFunctionBody(false), UseRelativeIDs(false) {
- }
- ~BitcodeReader() {
- FreeState();
- }
+ : Context(C), TheModule(nullptr), Buffer(nullptr), LazyStreamer(streamer),
+ NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
+ MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {}
+ ~BitcodeReader() { FreeState(); }
void materializeForwardReferencedFunctions();
void FreeState();
- /// setBufferOwned - If this is true, the reader will destroy the MemoryBuffer
- /// when the reader is destroyed.
- void setBufferOwned(bool Owned) { BufferOwned = Owned; }
+ void releaseBuffer() override;
bool isMaterializable(const GlobalValue *GV) const override;
bool isDematerializable(const GlobalValue *GV) const override;
- error_code Materialize(GlobalValue *GV) override;
- error_code MaterializeModule(Module *M) override;
+ std::error_code Materialize(GlobalValue *GV) override;
+ std::error_code MaterializeModule(Module *M) override;
void Dematerialize(GlobalValue *GV) override;
/// @brief Main interface to parsing a bitcode buffer.
/// @returns true if an error occurred.
- error_code ParseBitcodeInto(Module *M);
+ std::error_code ParseBitcodeInto(Module *M);
/// @brief Cheap mechanism to just extract module triple
/// @returns true if an error occurred.
- error_code ParseTriple(std::string &Triple);
+ ErrorOr<std::string> parseTriple();
static uint64_t decodeSignRotatedValue(uint64_t V);
@@ -346,28 +339,29 @@ private:
return getFnValueByID(ValNo, Ty);
}
- error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
- error_code ParseModule(bool Resume);
- error_code ParseAttributeBlock();
- error_code ParseAttributeGroupBlock();
- error_code ParseTypeTable();
- error_code ParseTypeTableBody();
-
- error_code ParseValueSymbolTable();
- error_code ParseConstants();
- error_code RememberAndSkipFunctionBody();
- error_code ParseFunctionBody(Function *F);
- error_code GlobalCleanup();
- error_code ResolveGlobalAndAliasInits();
- error_code ParseMetadata();
- error_code ParseMetadataAttachment();
- error_code ParseModuleTriple(std::string &Triple);
- error_code ParseUseLists();
- error_code InitStream();
- error_code InitStreamFromBuffer();
- error_code InitLazyStream();
- error_code FindFunctionInStream(Function *F,
- DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator);
+ std::error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
+ std::error_code ParseModule(bool Resume);
+ std::error_code ParseAttributeBlock();
+ std::error_code ParseAttributeGroupBlock();
+ std::error_code ParseTypeTable();
+ std::error_code ParseTypeTableBody();
+
+ std::error_code ParseValueSymbolTable();
+ std::error_code ParseConstants();
+ std::error_code RememberAndSkipFunctionBody();
+ std::error_code ParseFunctionBody(Function *F);
+ std::error_code GlobalCleanup();
+ std::error_code ResolveGlobalAndAliasInits();
+ std::error_code ParseMetadata();
+ std::error_code ParseMetadataAttachment();
+ ErrorOr<std::string> parseModuleTriple();
+ std::error_code ParseUseLists();
+ std::error_code InitStream();
+ std::error_code InitStreamFromBuffer();
+ std::error_code InitLazyStream();
+ std::error_code FindFunctionInStream(
+ Function *F,
+ DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator);
};
} // End llvm namespace
diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp
index f31e1fa..72451ec 100644
--- a/lib/Bitcode/Reader/BitstreamReader.cpp
+++ b/lib/Bitcode/Reader/BitstreamReader.cpp
@@ -97,7 +97,7 @@ void BitstreamCursor::readAbbreviatedField(const BitCodeAbbrevOp &Op,
switch (Op.getEncoding()) {
case BitCodeAbbrevOp::Array:
case BitCodeAbbrevOp::Blob:
- assert(0 && "Should not reach here");
+ llvm_unreachable("Should not reach here");
case BitCodeAbbrevOp::Fixed:
Vals.push_back(Read((unsigned)Op.getEncodingData()));
break;
@@ -117,7 +117,7 @@ void BitstreamCursor::skipAbbreviatedField(const BitCodeAbbrevOp &Op) {
switch (Op.getEncoding()) {
case BitCodeAbbrevOp::Array:
case BitCodeAbbrevOp::Blob:
- assert(0 && "Should not reach here");
+ llvm_unreachable("Should not reach here");
case BitCodeAbbrevOp::Fixed:
(void)Read((unsigned)Op.getEncodingData());
break;
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index cc73b84..dd9282a 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -177,6 +177,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_INLINE_HINT;
case Attribute::InReg:
return bitc::ATTR_KIND_IN_REG;
+ case Attribute::JumpTable:
+ return bitc::ATTR_KIND_JUMP_TABLE;
case Attribute::MinSize:
return bitc::ATTR_KIND_MIN_SIZE;
case Attribute::Naked:
@@ -511,7 +513,7 @@ static unsigned getEncodedDLLStorageClass(const GlobalValue &GV) {
llvm_unreachable("Invalid DLL storage class");
}
-static unsigned getEncodedThreadLocalMode(const GlobalVariable &GV) {
+static unsigned getEncodedThreadLocalMode(const GlobalValue &GV) {
switch (GV.getThreadLocalMode()) {
case GlobalVariable::NotThreadLocal: return 0;
case GlobalVariable::GeneralDynamicTLSModel: return 1;
@@ -522,6 +524,35 @@ static unsigned getEncodedThreadLocalMode(const GlobalVariable &GV) {
llvm_unreachable("Invalid TLS model");
}
+static unsigned getEncodedComdatSelectionKind(const Comdat &C) {
+ switch (C.getSelectionKind()) {
+ case Comdat::Any:
+ return bitc::COMDAT_SELECTION_KIND_ANY;
+ case Comdat::ExactMatch:
+ return bitc::COMDAT_SELECTION_KIND_EXACT_MATCH;
+ case Comdat::Largest:
+ return bitc::COMDAT_SELECTION_KIND_LARGEST;
+ case Comdat::NoDuplicates:
+ return bitc::COMDAT_SELECTION_KIND_NO_DUPLICATES;
+ case Comdat::SameSize:
+ return bitc::COMDAT_SELECTION_KIND_SAME_SIZE;
+ }
+ llvm_unreachable("Invalid selection kind");
+}
+
+static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+ SmallVector<uint8_t, 64> Vals;
+ for (const Comdat *C : VE.getComdats()) {
+ // COMDAT: [selection_kind, name]
+ Vals.push_back(getEncodedComdatSelectionKind(*C));
+ Vals.push_back(C->getName().size());
+ for (char Chr : C->getName())
+ Vals.push_back((unsigned char)Chr);
+ Stream.EmitRecord(bitc::MODULE_CODE_COMDAT, Vals, /*AbbrevToUse=*/0);
+ Vals.clear();
+ }
+}
+
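writeComdats() is the writer-side mirror of the MODULE_CODE_COMDAT parsing earlier in this patch: kind code, name length, raw name bytes. A tiny standalone encoder showing the byte layout (the kind codes are illustrative, not the bitcode constants):

    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<uint8_t> encodeComdat(uint8_t Kind, const std::string &Name) {
      std::vector<uint8_t> Vals;
      Vals.push_back(Kind);                  // selection kind
      Vals.push_back((uint8_t)Name.size());  // name length
      for (char C : Name)
        Vals.push_back((unsigned char)C);    // raw name bytes
      return Vals;                           // e.g. {1, 3, 'f', 'o', 'o'}
    }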
// Emit top-level description of module, including target triple, inline asm,
// descriptors for global variables, and function prototype info.
static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
@@ -623,12 +654,14 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
if (GV.isThreadLocal() ||
GV.getVisibility() != GlobalValue::DefaultVisibility ||
GV.hasUnnamedAddr() || GV.isExternallyInitialized() ||
- GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass) {
+ GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
+ GV.hasComdat()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(GV.hasUnnamedAddr());
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
+ Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
} else {
AbbrevToUse = SimpleGVarAbbrev;
}
@@ -654,6 +687,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1)
: 0);
Vals.push_back(getEncodedDLLStorageClass(F));
+ Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0);
unsigned AbbrevToUse = 0;
Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
@@ -668,6 +702,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(getEncodedLinkage(A));
Vals.push_back(getEncodedVisibility(A));
Vals.push_back(getEncodedDLLStorageClass(A));
+ Vals.push_back(getEncodedThreadLocalMode(A));
+ Vals.push_back(A.hasUnnamedAddr());
unsigned AbbrevToUse = 0;
Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse);
Vals.clear();
@@ -1445,6 +1481,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
cast<AtomicCmpXchgInst>(I).getSynchScope()));
Vals.push_back(GetEncodedOrdering(
cast<AtomicCmpXchgInst>(I).getFailureOrdering()));
+ Vals.push_back(cast<AtomicCmpXchgInst>(I).isWeak());
break;
case Instruction::AtomicRMW:
Code = bitc::FUNC_CODE_INST_ATOMICRMW;
@@ -1910,6 +1947,8 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) {
// Emit information describing all of the types in the module.
WriteTypeTable(VE, Stream);
+ writeComdats(VE, Stream);
+
// Emit top-level description of module, including target triple, inline asm,
// descriptors for global variables, and function prototype info.
WriteModuleInfo(M, VE, Stream);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 8531e76..15f8034 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -73,37 +73,34 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
SmallVector<std::pair<unsigned, MDNode*>, 8> MDs;
// Enumerate types used by function bodies and argument lists.
- for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
-
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I)
- EnumerateType(I->getType());
-
- for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;++I){
- for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
- OI != E; ++OI) {
- if (MDNode *MD = dyn_cast<MDNode>(*OI))
+ for (const Function &F : *M) {
+ for (const Argument &A : F.args())
+ EnumerateType(A.getType());
+
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB) {
+ for (const Use &Op : I.operands()) {
+ if (MDNode *MD = dyn_cast<MDNode>(&Op))
if (MD->isFunctionLocal() && MD->getFunction())
// These will get enumerated during function-incorporation.
continue;
- EnumerateOperandType(*OI);
+ EnumerateOperandType(Op);
}
- EnumerateType(I->getType());
- if (const CallInst *CI = dyn_cast<CallInst>(I))
+ EnumerateType(I.getType());
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
EnumerateAttributes(CI->getAttributes());
- else if (const InvokeInst *II = dyn_cast<InvokeInst>(I))
+ else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I))
EnumerateAttributes(II->getAttributes());
// Enumerate metadata attached with this instruction.
MDs.clear();
- I->getAllMetadataOtherThanDebugLoc(MDs);
+ I.getAllMetadataOtherThanDebugLoc(MDs);
for (unsigned i = 0, e = MDs.size(); i != e; ++i)
EnumerateMetadata(MDs[i].second);
- if (!I->getDebugLoc().isUnknown()) {
+ if (!I.getDebugLoc().isUnknown()) {
MDNode *Scope, *IA;
- I->getDebugLoc().getScopeAndInlinedAt(Scope, IA, I->getContext());
+ I.getDebugLoc().getScopeAndInlinedAt(Scope, IA, I.getContext());
if (Scope) EnumerateMetadata(Scope);
if (IA) EnumerateMetadata(IA);
}
@@ -120,6 +117,12 @@ unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const {
return I->second;
}
+unsigned ValueEnumerator::getComdatID(const Comdat *C) const {
+ unsigned ComdatID = Comdats.idFor(C);
+ assert(ComdatID && "Comdat not found!");
+ return ComdatID;
+}
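getComdatID() can assert on a nonzero result because UniqueVector hands out 1-based IDs; 0 therefore stays free to mean "no comdat" in the GLOBALVAR and FUNCTION records above. A quick sketch of that property:

    #include "llvm/ADT/UniqueVector.h"

    void demo() {
      llvm::UniqueVector<int> V;
      unsigned A = V.insert(10);  // 1: IDs start at 1, never 0
      unsigned B = V.insert(20);  // 2
      unsigned C = V.insert(10);  // 1 again: duplicates keep their first ID
      (void)A; (void)B; (void)C;
      // V.idFor(99) == 0 signals "not present", so 0 can encode "absent".
    }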
+
void ValueEnumerator::setInstructionID(const Instruction *I) {
InstructionMap[I] = InstructionCount++;
}
@@ -310,6 +313,10 @@ void ValueEnumerator::EnumerateValue(const Value *V) {
return;
}
+ if (auto *GO = dyn_cast<GlobalObject>(V))
+ if (const Comdat *C = GO->getComdat())
+ Comdats.insert(C);
+
// Enumerate the type of this value.
EnumerateType(V->getType());
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index d1ca15f..1c9f38e 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/UniqueVector.h"
#include "llvm/IR/Attributes.h"
#include <vector>
@@ -25,6 +26,7 @@ class Type;
class Value;
class Instruction;
class BasicBlock;
+class Comdat;
class Function;
class Module;
class MDNode;
@@ -48,6 +50,10 @@ private:
typedef DenseMap<const Value*, unsigned> ValueMapType;
ValueMapType ValueMap;
ValueList Values;
+
+ typedef UniqueVector<const Comdat *> ComdatSetType;
+ ComdatSetType Comdats;
+
ValueList MDValues;
SmallVector<const MDNode *, 8> FunctionLocalMDs;
ValueMapType MDValueMap;
@@ -139,6 +145,9 @@ public:
return AttributeGroups;
}
+ const ComdatSetType &getComdats() const { return Comdats; }
+ unsigned getComdatID(const Comdat *C) const;
+
/// getGlobalBasicBlockID - This returns the function-specific ID for the
/// specified basic block. This is relatively expensive information, so it
/// should only be used by rare constructs such as address-of-label.
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 6fc83a2..1bdf312 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -7,13 +7,14 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines several CodeGen-specific LLVM IR analysis utilties.
+// This file defines several CodeGen-specific LLVM IR analysis utilities.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Analysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -474,8 +475,7 @@ static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
/// between it and the return.
///
/// This function only tests target-independent requirements.
-bool llvm::isInTailCallPosition(ImmutableCallSite CS,
- const TargetLowering &TLI) {
+bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) {
const Instruction *I = CS.getInstruction();
const BasicBlock *ExitBB = I->getParent();
const TerminatorInst *Term = ExitBB->getTerminator();
@@ -490,7 +490,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
// longjmp on x86), it can end up causing miscompilation that has not
// been fully understood.
if (!Ret &&
- (!TLI.getTargetMachine().Options.GuaranteedTailCallOpt ||
+ (!DAG.getTarget().Options.GuaranteedTailCallOpt ||
!isa<UnreachableInst>(Term)))
return false;
@@ -509,7 +509,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
return false;
}
- return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, TLI);
+ return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret,
+ *DAG.getTarget().getTargetLowering());
}
bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 7feb42c..05e5c45 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk
@@ -24,11 +24,13 @@ codegen_SRC_FILES := \
GCMetadata.cpp \
GCMetadataPrinter.cpp \
GCStrategy.cpp \
+ GlobalMerge.cpp \
IfConversion.cpp \
InlineSpiller.cpp \
InterferenceCache.cpp \
IntrinsicLowering.cpp \
JITCodeEmitter.cpp \
+ JumpInstrTables.cpp \
LatencyPriorityQueue.cpp \
LexicalScopes.cpp \
LiveDebugVariables.cpp \
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 1cb0159..251f5ef 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -37,8 +37,7 @@
using namespace llvm;
ARMException::ARMException(AsmPrinter *A)
- : DwarfException(A),
- shouldEmitCFI(false) {}
+ : EHStreamer(A), shouldEmitCFI(false) {}
ARMException::~ARMException() {}
@@ -100,7 +99,7 @@ void ARMException::endFunction(const MachineFunction *) {
ATS.emitHandlerData();
// Emit actual exception table
- EmitExceptionTable();
+ emitExceptionTable();
}
}
@@ -108,7 +107,7 @@ void ARMException::endFunction(const MachineFunction *) {
ATS.emitFnEnd();
}
-void ARMException::EmitTypeInfos(unsigned TTypeEncoding) {
+void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
diff --git a/lib/CodeGen/AsmPrinter/Android.mk b/lib/CodeGen/AsmPrinter/Android.mk
index f56eb6e..083cc0d 100644
--- a/lib/CodeGen/AsmPrinter/Android.mk
+++ b/lib/CodeGen/AsmPrinter/Android.mk
@@ -1,33 +1,33 @@
LOCAL_PATH := $(call my-dir)
codegen_asmprinter_SRC_FILES := \
- AsmPrinter.cpp
+ AddressPool.cpp \
+ ARMException.cpp \
+ AsmPrinter.cpp \
+ AsmPrinterDwarf.cpp \
+ AsmPrinterInlineAsm.cpp \
+ DbgValueHistoryCalculator.cpp \
+ DIE.cpp \
+ DIEHash.cpp \
+ DwarfAccelTable.cpp \
+ DwarfCFIException.cpp \
+ DwarfDebug.cpp \
+ DwarfFile.cpp \
+ DwarfStringPool.cpp \
+ DwarfUnit.cpp \
+ EHStreamer.cpp \
+ ErlangGCPrinter.cpp \
+ OcamlGCPrinter.cpp \
+ Win64Exception.cpp \
+ WinCodeViewLineTables.cpp
+
+
# For the host
# =====================================================
include $(CLEAR_VARS)
-LOCAL_SRC_FILES := \
- AddressPool.cpp \
- AsmPrinter.cpp \
- AsmPrinterDwarf.cpp \
- AsmPrinterInlineAsm.cpp \
- ARMException.cpp \
- DbgValueHistoryCalculator.cpp \
- DIE.cpp \
- DIEHash.cpp \
- DwarfAccelTable.cpp \
- DwarfCFIException.cpp \
- DwarfDebug.cpp \
- DwarfException.cpp \
- DwarfFile.cpp \
- DwarfStringPool.cpp \
- DwarfUnit.cpp \
- ErlangGCPrinter.cpp \
- OcamlGCPrinter.cpp \
- Win64Exception.cpp \
- WinCodeViewLineTables.cpp
-
+LOCAL_SRC_FILES := $(codegen_asmprinter_SRC_FILES)
LOCAL_MODULE:= libLLVMAsmPrinter
LOCAL_MODULE_TAGS := optional
@@ -41,27 +41,7 @@ include $(BUILD_HOST_STATIC_LIBRARY)
ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
include $(CLEAR_VARS)
-LOCAL_SRC_FILES := \
- AddressPool.cpp \
- AsmPrinter.cpp \
- AsmPrinterDwarf.cpp \
- AsmPrinterInlineAsm.cpp \
- ARMException.cpp \
- DbgValueHistoryCalculator.cpp \
- DIE.cpp \
- DIEHash.cpp \
- DwarfAccelTable.cpp \
- DwarfCFIException.cpp \
- DwarfDebug.cpp \
- DwarfException.cpp \
- DwarfFile.cpp \
- DwarfStringPool.cpp \
- DwarfUnit.cpp \
- ErlangGCPrinter.cpp \
- OcamlGCPrinter.cpp \
- Win64Exception.cpp \
- WinCodeViewLineTables.cpp
-
+LOCAL_SRC_FILES := $(codegen_asmprinter_SRC_FILES)
LOCAL_MODULE:= libLLVMAsmPrinter
LOCAL_MODULE_TAGS := optional
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 7de9c6d..f80fdea 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/JumpInstrTableInfo.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -46,7 +47,6 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -232,23 +232,23 @@ bool AsmPrinter::doInitialization(Module &M) {
}
}
- DwarfException *DE = nullptr;
+ EHStreamer *ES = nullptr;
switch (MAI->getExceptionHandlingType()) {
case ExceptionHandling::None:
break;
case ExceptionHandling::SjLj:
case ExceptionHandling::DwarfCFI:
- DE = new DwarfCFIException(this);
+ ES = new DwarfCFIException(this);
break;
case ExceptionHandling::ARM:
- DE = new ARMException(this);
+ ES = new ARMException(this);
break;
- case ExceptionHandling::Win64:
- DE = new Win64Exception(this);
+ case ExceptionHandling::WinEH:
+ ES = new Win64Exception(this);
break;
}
- if (DE)
- Handlers.push_back(HandlerInfo(DE, EHTimerName, DWARFGroupName));
+ if (ES)
+ Handlers.push_back(HandlerInfo(ES, EHTimerName, DWARFGroupName));
return false;
}
@@ -709,13 +709,12 @@ AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
}
bool AsmPrinter::needsSEHMoves() {
- return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 &&
+ return MAI->getExceptionHandlingType() == ExceptionHandling::WinEH &&
MF->getFunction()->needsUnwindTableEntry();
}
void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
- ExceptionHandling::ExceptionsType ExceptionHandlingType =
- MAI->getExceptionHandlingType();
+ ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
if (ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
ExceptionHandlingType != ExceptionHandling::ARM)
return;
@@ -870,6 +869,8 @@ void AsmPrinter::EmitFunctionBody() {
OutStreamer.AddBlankLine();
}
+static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP);
+
bool AsmPrinter::doFinalization(Module &M) {
// Emit global variables.
for (const auto &G : M.globals())
@@ -887,6 +888,54 @@ bool AsmPrinter::doFinalization(Module &M) {
EmitVisibility(Name, V, false);
}
+ // Get information about jump-instruction tables to print.
+ JumpInstrTableInfo *JITI = getAnalysisIfAvailable<JumpInstrTableInfo>();
+
+ if (JITI && !JITI->getTables().empty()) {
+ unsigned Arch = Triple(getTargetTriple()).getArch();
+ bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb);
+ MCInst TrapInst;
+ TM.getInstrInfo()->getTrap(TrapInst);
+ for (const auto &KV : JITI->getTables()) {
+ uint64_t Count = 0;
+ for (const auto &FunPair : KV.second) {
+ // Emit the function labels to make this be a function entry point.
+ MCSymbol *FunSym =
+ OutContext.GetOrCreateSymbol(FunPair.second->getName());
+ OutStreamer.EmitSymbolAttribute(FunSym, MCSA_Global);
+ // FIXME: JumpTableInstrInfo should store information about the required
+ // alignment of table entries and the size of the padding instruction.
+ EmitAlignment(3);
+ if (IsThumb)
+ OutStreamer.EmitThumbFunc(FunSym);
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer.EmitSymbolAttribute(FunSym, MCSA_ELF_TypeFunction);
+ OutStreamer.EmitLabel(FunSym);
+
+ // Emit the jump instruction to transfer control to the original
+ // function.
+ MCInst JumpToFun;
+ MCSymbol *TargetSymbol =
+ OutContext.GetOrCreateSymbol(FunPair.first->getName());
+ const MCSymbolRefExpr *TargetSymRef =
+ MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT,
+ OutContext);
+ TM.getInstrInfo()->getUnconditionalBranch(JumpToFun, TargetSymRef);
+ OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo());
+ ++Count;
+ }
+
+ // Emit enough padding instructions to fill up to the next power of two.
+ // This assumes that the trap instruction takes 8 bytes or fewer.
+ uint64_t Remaining = NextPowerOf2(Count) - Count;
+ for (uint64_t C = 0; C < Remaining; ++C) {
+ EmitAlignment(3);
+ OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo());
+ }
+
+ }
+ }
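The padding loop above rounds each jump table out to a power-of-two number of slots, with the extra slots filled by traps, so that a masked index stays inside the table. Worked through for a hypothetical table of 5 entries, with a local stand-in for llvm::NextPowerOf2 (which returns the power of two strictly greater than its argument):

    #include <cstdint>

    // Stand-in for llvm::NextPowerOf2: smallest power of two strictly > A.
    uint64_t nextPowerOf2(uint64_t A) {
      A |= (A >> 1);  A |= (A >> 2);  A |= (A >> 4);
      A |= (A >> 8);  A |= (A >> 16); A |= (A >> 32);
      return A + 1;
    }

    int main() {
      uint64_t Count = 5;                               // hypothetical entries
      uint64_t Remaining = nextPowerOf2(Count) - Count; // 8 - 5 = 3 trap slots
      return (int)Remaining;
    }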
+
// Emit module flags.
SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
M.getModuleFlagsMetadata(ModuleFlags);
@@ -932,10 +981,6 @@ bool AsmPrinter::doFinalization(Module &M) {
for (const auto &Alias : M.aliases()) {
MCSymbol *Name = getSymbol(&Alias);
- const GlobalValue *GV = Alias.getAliasee();
- assert(!GV->isDeclaration());
- MCSymbol *Target = getSymbol(GV);
-
if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
@@ -947,7 +992,7 @@ bool AsmPrinter::doFinalization(Module &M) {
// Emit the directives as assignments aka .set:
OutStreamer.EmitAssignment(Name,
- MCSymbolRefExpr::Create(Target, OutContext));
+ lowerConstant(Alias.getAliasee(), *this));
}
}
@@ -1248,7 +1293,7 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
}
// Ignore debug and non-emitted data. This handles llvm.compiler.used.
- if (GV->getSection() == "llvm.metadata" ||
+ if (StringRef(GV->getSection()) == "llvm.metadata" ||
GV->hasAvailableExternallyLinkage())
return true;
@@ -1350,14 +1395,17 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
for (Structor &S : Structors) {
const TargetLoweringObjectFile &Obj = getObjFileLowering();
const MCSymbol *KeySym = nullptr;
- const MCSection *KeySec = nullptr;
- if (S.ComdatKey) {
- KeySym = getSymbol(S.ComdatKey);
- KeySec = getObjFileLowering().SectionForGlobal(S.ComdatKey, *Mang, TM);
+ if (GlobalValue *GV = S.ComdatKey) {
+ if (GV->hasAvailableExternallyLinkage())
+ // If the associated variable is available_externally, some other TU
+ // will provide its dynamic initializer.
+ continue;
+
+ KeySym = getSymbol(GV);
}
const MCSection *OutputSection =
- (isCtor ? Obj.getStaticCtorSection(S.Priority, KeySym, KeySec)
- : Obj.getStaticDtorSection(S.Priority, KeySym, KeySec));
+ (isCtor ? Obj.getStaticCtorSection(S.Priority, KeySym)
+ : Obj.getStaticDtorSection(S.Priority, KeySym));
OutStreamer.SwitchSection(OutputSection);
if (OutStreamer.getCurrentSection() != OutStreamer.getPreviousSection())
EmitAlignment(Align);
@@ -1817,7 +1865,10 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
SmallString<8> StrVal;
CFP->getValueAPF().toString(StrVal);
- CFP->getType()->print(AP.OutStreamer.GetCommentOS());
+ if (CFP->getType())
+ CFP->getType()->print(AP.OutStreamer.GetCommentOS());
+ else
+ AP.OutStreamer.GetCommentOS() << "Printing <null> Type";
AP.OutStreamer.GetCommentOS() << ' ' << StrVal << '\n';
}
@@ -1830,7 +1881,8 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
// PPC's long double has odd notions of endianness compared to how LLVM
// handles it: p[0] goes first for *big* endian on PPC.
- if (AP.TM.getDataLayout()->isBigEndian() != CFP->getType()->isPPC_FP128Ty()) {
+ if (AP.TM.getDataLayout()->isBigEndian() &&
+ !CFP->getType()->isPPC_FP128Ty()) {
int Chunk = API.getNumWords() - 1;
if (TrailingBytes)
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index b4ef185..f555f21 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -10,10 +10,10 @@ add_llvm_library(LLVMAsmPrinter
DwarfAccelTable.cpp
DwarfCFIException.cpp
DwarfDebug.cpp
- DwarfException.cpp
DwarfFile.cpp
DwarfStringPool.cpp
DwarfUnit.cpp
+ EHStreamer.cpp
ErlangGCPrinter.cpp
OcamlGCPrinter.cpp
Win64Exception.cpp
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 6103254..a66d08e 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -15,6 +15,7 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <map>
+#include <set>
#define DEBUG_TYPE "dwarfdebug"
@@ -110,45 +111,73 @@ static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
RegVars.erase(I);
}
-// \brief Terminate location ranges for all variables, described by registers
-// clobbered by @MI.
-static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
- const MachineInstr &MI,
- const TargetRegisterInfo *TRI,
- DbgValueHistoryMap &HistMap) {
+// \brief Collect all registers clobbered by @MI and insert them into @Regs.
+static void collectClobberedRegisters(const MachineInstr &MI,
+ const TargetRegisterInfo *TRI,
+ std::set<unsigned> &Regs) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef() || !MO.getReg())
continue;
- for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
- ++AI) {
- unsigned RegNo = *AI;
- clobberRegisterUses(RegVars, RegNo, HistMap, MI);
- }
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+ Regs.insert(*AI);
}
}
-// \brief Terminate the location range for all register-described variables
-// by inserting @ClobberingInstr to their history.
-static void clobberAllRegistersUses(RegDescribedVarsMap &RegVars,
- DbgValueHistoryMap &HistMap,
- const MachineInstr &ClobberingInstr) {
- for (const auto &I : RegVars)
- for (const auto &Var : I.second)
- HistMap.endInstrRange(Var, ClobberingInstr);
- RegVars.clear();
+// \brief Returns the first instruction in @MBB which corresponds to
+// the function epilogue, or nullptr if @MBB doesn't contain an epilogue.
+static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
+ auto LastMI = MBB.getLastNonDebugInstr();
+ if (LastMI == MBB.end() || !LastMI->isReturn())
+ return nullptr;
+ // Assume that the epilogue starts with the instruction that has the same
+ // debug location as the return instruction.
+ DebugLoc LastLoc = LastMI->getDebugLoc();
+ auto Res = LastMI;
+ for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI));
+ I != MBB.rend(); ++I) {
+ if (I->getDebugLoc() != LastLoc)
+ return Res;
+ Res = std::prev(I.base());
+ }
+ // If all instructions have the same debug location, assume the whole MBB
+ // is an epilogue.
+ return MBB.begin();
+}
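getFirstEpilogueInst() leans on a line-table convention: epilogue instructions typically inherit the return instruction's debug location, so scanning backwards while the DebugLoc still matches finds where the epilogue begins. The same scan over plain integers standing in for locations (a model, not the real iterator code):

    #include <vector>

    // Index of the first instruction in the trailing run that shares the
    // return instruction's "location"; 0 when the whole block qualifies.
    size_t firstEpilogueIndex(const std::vector<int> &Locs) {
      if (Locs.empty())
        return 0;
      size_t I = Locs.size();
      while (I > 1 && Locs[I - 2] == Locs.back())
        --I;
      return I - 1;
    }
    // firstEpilogueIndex({7, 7, 9, 9, 9}) == 2: the run of 9s is the epilogue.
    // firstEpilogueIndex({9, 9}) == 0: a uniform block counts as all epilogue.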
+
+// \brief Collect registers that are modified in the function body (their
+// contents are changed outside of the prologue and epilogue).
+static void collectChangingRegs(const MachineFunction *MF,
+ const TargetRegisterInfo *TRI,
+ std::set<unsigned> &Regs) {
+ for (const auto &MBB : *MF) {
+ auto FirstEpilogueInst = getFirstEpilogueInst(MBB);
+ bool IsInEpilogue = false;
+ for (const auto &MI : MBB) {
+ IsInEpilogue |= &MI == FirstEpilogueInst;
+ if (!MI.getFlag(MachineInstr::FrameSetup) && !IsInEpilogue)
+ collectClobberedRegisters(MI, TRI, Regs);
+ }
+ }
}
void calculateDbgValueHistory(const MachineFunction *MF,
const TargetRegisterInfo *TRI,
DbgValueHistoryMap &Result) {
- RegDescribedVarsMap RegVars;
+ std::set<unsigned> ChangingRegs;
+ collectChangingRegs(MF, TRI, ChangingRegs);
+ RegDescribedVarsMap RegVars;
for (const auto &MBB : *MF) {
for (const auto &MI : MBB) {
if (!MI.isDebugValue()) {
// Not a DBG_VALUE instruction. It may clobber registers which describe
// some variables.
- clobberRegisterUses(RegVars, MI, TRI, Result);
+ std::set<unsigned> MIClobberedRegs;
+ collectClobberedRegisters(MI, TRI, MIClobberedRegs);
+ for (unsigned RegNo : MIClobberedRegs) {
+ if (ChangingRegs.count(RegNo))
+ clobberRegisterUses(RegVars, RegNo, Result, MI);
+ }
continue;
}
@@ -167,8 +196,10 @@ void calculateDbgValueHistory(const MachineFunction *MF,
// Make sure locations for register-described variables are valid only
// until the end of the basic block (unless it's the last basic block, in
// which case let their liveness run off to the end of the function).
- if (!MBB.empty() && &MBB != &MF->back())
- clobberAllRegistersUses(RegVars, Result, MBB.back());
+ if (!MBB.empty() && &MBB != &MF->back()) {
+ for (unsigned RegNo : ChangingRegs)
+ clobberRegisterUses(RegVars, RegNo, Result, MBB.back());
+ }
}
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 30312ac..74215aa 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -40,9 +40,8 @@
using namespace llvm;
DwarfCFIException::DwarfCFIException(AsmPrinter *A)
- : DwarfException(A),
- shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false),
- moveTypeModule(AsmPrinter::CFI_M_None) {}
+ : EHStreamer(A), shouldEmitPersonality(false), shouldEmitLSDA(false),
+ shouldEmitMoves(false), moveTypeModule(AsmPrinter::CFI_M_None) {}
DwarfCFIException::~DwarfCFIException() {}
@@ -59,26 +58,16 @@ void DwarfCFIException::endModule() {
unsigned PerEncoding = TLOF.getPersonalityEncoding();
- if ((PerEncoding & 0x70) != dwarf::DW_EH_PE_pcrel)
+ if ((PerEncoding & 0x80) != dwarf::DW_EH_PE_indirect)
return;
// Emit references to all used personality functions
- bool AtLeastOne = false;
const std::vector<const Function*> &Personalities = MMI->getPersonalities();
for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
if (!Personalities[i])
continue;
MCSymbol *Sym = Asm->getSymbol(Personalities[i]);
TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
- AtLeastOne = true;
- }
-
- if (AtLeastOne && !TLOF.isFunctionEHFrameSymbolPrivate()) {
- // This is a temporary hack to keep sections in the same order they
- // were before. This lets us produce bit identical outputs while
- // transitioning to CFI.
- Asm->OutStreamer.SwitchSection(
- const_cast<TargetLoweringObjectFile&>(TLOF).getEHFrameSection());
}
}
@@ -123,9 +112,17 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI);
Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding);
- Asm->OutStreamer.EmitDebugLabel
- (Asm->GetTempSymbol("eh_func_begin",
- Asm->getFunctionNumber()));
+ MCSymbol *EHBegin =
+ Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber());
+ if (Asm->MAI->useAssignmentForEHBegin()) {
+ MCContext &Ctx = Asm->OutContext;
+ MCSymbol *CurPos = Ctx.CreateTempSymbol();
+ Asm->OutStreamer.EmitLabel(CurPos);
+ Asm->OutStreamer.EmitAssignment(EHBegin,
+ MCSymbolRefExpr::Create(CurPos, Ctx));
+ } else {
+ Asm->OutStreamer.EmitLabel(EHBegin);
+ }
// Provide LSDA information.
if (!shouldEmitLSDA)
@@ -153,5 +150,5 @@ void DwarfCFIException::endFunction(const MachineFunction *) {
// Map all labels and get rid of any dead landing pads.
MMI->TidyLandingPads();
- EmitExceptionTable();
+ emitExceptionTable();
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 2a0615d..77860c0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -98,10 +98,6 @@ DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
clEnumVal(Disable, "Disabled"), clEnumValEnd),
cl::init(Default));
-static cl::opt<unsigned>
-DwarfVersionNumber("dwarf-version", cl::Hidden,
- cl::desc("Generate DWARF for dwarf version."), cl::init(0));
-
static const char *const DWARFGroupName = "DWARF Emission";
static const char *const DbgTimerName = "DWARF Debug Writer";
@@ -209,9 +205,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
else
HasDwarfPubSections = DwarfPubSections == Enable;
+ unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion;
DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber
: MMI->getModule()->getDwarfVersion();
+ Asm->OutStreamer.getContext().setDwarfVersion(DwarfVersion);
+
{
NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
beginModule();
@@ -531,8 +530,7 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU,
// shouldn't be found by lookup.
AbsDef = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE,
DIDescriptor());
- SPCU.applySubprogramAttributes(SP, *AbsDef);
- SPCU.addGlobalName(SP.getName(), *AbsDef, resolve(SP.getContext()));
+ SPCU.applySubprogramAttributesToDefinition(SP, *AbsDef);
SPCU.addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
createAndAddScopeChildren(SPCU, Scope, *AbsDef);
@@ -732,6 +730,8 @@ void DwarfDebug::beginModule() {
const Module *M = MMI->getModule();
+ FunctionDIs = makeSubprogramMap(*M);
+
// If module has named metadata anchors then use them, otherwise scan the
// module using debug info finder to collect debug info.
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -784,6 +784,26 @@ void DwarfDebug::beginModule() {
SectionMap[Asm->getObjFileLowering().getTextSection()];
}
+void DwarfDebug::finishVariableDefinitions() {
+ for (const auto &Var : ConcreteVariables) {
+ DIE *VariableDie = Var->getDIE();
+ // FIXME: There shouldn't be any variables without DIEs.
+ if (!VariableDie)
+ continue;
+ // FIXME: Consider the time-space tradeoff of just storing the unit pointer
+ // in the ConcreteVariables list, rather than looking it up again here.
+ // DIE::getUnit isn't simple - it walks parent pointers, etc.
+ DwarfCompileUnit *Unit = lookupUnit(VariableDie->getUnit());
+ assert(Unit);
+ DbgVariable *AbsVar = getExistingAbstractVariable(Var->getVariable());
+ if (AbsVar && AbsVar->getDIE()) {
+ Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ Unit->applyVariableAttributes(*Var, *VariableDie);
+ }
+}
+
void DwarfDebug::finishSubprogramDefinitions() {
const Module *M = MMI->getModule();
@@ -811,8 +831,7 @@ void DwarfDebug::finishSubprogramDefinitions() {
// inlined versions during codegen.
D = SPCU->getOrCreateSubprogramDIE(SP);
// And attach the attributes
- SPCU->applySubprogramAttributes(SP, *D);
- SPCU->addGlobalName(SP.getName(), *D, resolve(SP.getContext()));
+ SPCU->applySubprogramAttributesToDefinition(SP, *D);
}
}
}
@@ -850,8 +869,10 @@ void DwarfDebug::collectDeadVariables() {
for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
DIVariable DV(Variables.getElement(vi));
assert(DV.isVariable());
- DbgVariable NewVar(DV, nullptr, this);
- SPDIE->addChild(SPCU->constructVariableDIE(NewVar));
+ DbgVariable NewVar(DV, this);
+ auto VariableDie = SPCU->constructVariableDIE(NewVar);
+ SPCU->applyVariableAttributes(NewVar, *VariableDie);
+ SPDIE->addChild(std::move(VariableDie));
}
}
}
@@ -861,6 +882,8 @@ void DwarfDebug::collectDeadVariables() {
void DwarfDebug::finalizeModuleInfo() {
finishSubprogramDefinitions();
+ finishVariableDefinitions();
+
// Collect info for variables that were optimized out.
collectDeadVariables();
@@ -1017,9 +1040,9 @@ void DwarfDebug::endModule() {
emitDebugInfoDWO();
emitDebugAbbrevDWO();
emitDebugLineDWO();
+ emitDebugLocDWO();
// Emit DWO addresses.
AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection());
- emitDebugLocDWO();
} else
// Emit info into a debug loc section.
emitDebugLoc();
@@ -1047,27 +1070,51 @@ void DwarfDebug::endModule() {
}
// Find abstract variable, if any, associated with Var.
-DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
- DebugLoc ScopeLoc) {
- return findAbstractVariable(DV, ScopeLoc.getScope(DV->getContext()));
-}
-
-DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
- const MDNode *ScopeNode) {
+DbgVariable *DwarfDebug::getExistingAbstractVariable(const DIVariable &DV,
+ DIVariable &Cleansed) {
LLVMContext &Ctx = DV->getContext();
// More than one inlined variable corresponds to one abstract variable.
- DIVariable Var = cleanseInlinedVariable(DV, Ctx);
- auto I = AbstractVariables.find(Var);
+ // FIXME: This duplication of variables when inlining should probably be
+ // removed. It's done to allow each DIVariable to describe its location
+ // because the DebugLoc on the dbg.value/declare isn't accurate. We should
+ // make it accurate, then remove this duplication/cleansing stuff.
+ Cleansed = cleanseInlinedVariable(DV, Ctx);
+ auto I = AbstractVariables.find(Cleansed);
if (I != AbstractVariables.end())
return I->second.get();
+ return nullptr;
+}
- LexicalScope *Scope = LScopes.findAbstractScope(ScopeNode);
- if (!Scope)
- return nullptr;
+DbgVariable *DwarfDebug::getExistingAbstractVariable(const DIVariable &DV) {
+ DIVariable Cleansed;
+ return getExistingAbstractVariable(DV, Cleansed);
+}
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, nullptr, this);
+void DwarfDebug::createAbstractVariable(const DIVariable &Var,
+ LexicalScope *Scope) {
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, this);
addScopeVariable(Scope, AbsDbgVariable.get());
- return (AbstractVariables[Var] = std::move(AbsDbgVariable)).get();
+ AbstractVariables[Var] = std::move(AbsDbgVariable);
+}
+
+void DwarfDebug::ensureAbstractVariableIsCreated(const DIVariable &DV,
+ const MDNode *ScopeNode) {
+ DIVariable Cleansed = DV;
+ if (getExistingAbstractVariable(DV, Cleansed))
+ return;
+
+ createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(ScopeNode));
+}
+
+void
+DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(const DIVariable &DV,
+ const MDNode *ScopeNode) {
+ DIVariable Cleansed = DV;
+ if (getExistingAbstractVariable(DV, Cleansed))
+ return;
+
+ if (LexicalScope *Scope = LScopes.findAbstractScope(ScopeNode))
+ createAbstractVariable(Cleansed, Scope);
}
// If Var is a current function argument then add it to CurrentFnArguments list.
@@ -1106,11 +1153,11 @@ void DwarfDebug::collectVariableInfoFromMMITable(
if (!Scope)
continue;
- DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VI.Loc);
- DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable, this);
+ ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
+ ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this));
+ DbgVariable *RegVar = ConcreteVariables.back().get();
RegVar->setFrameIndex(VI.Slot);
- if (!addCurrentFnArgument(RegVar, Scope))
- addScopeVariable(Scope, RegVar);
+ addScopeVariable(Scope, RegVar);
}
}
@@ -1175,18 +1222,14 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
Processed.insert(DV);
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
- DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
- DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this);
- if (!addCurrentFnArgument(RegVar, Scope))
- addScopeVariable(Scope, RegVar);
- if (AbsVar)
- AbsVar->setMInsn(MInsn);
+ ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
+ ConcreteVariables.push_back(make_unique<DbgVariable>(MInsn, this));
+ DbgVariable *RegVar = ConcreteVariables.back().get();
+ addScopeVariable(Scope, RegVar);
// Check if the first DBG_VALUE is valid for the rest of the function.
- if (Ranges.size() == 1 && Ranges.front().second == nullptr) {
- RegVar->setMInsn(MInsn);
+ if (Ranges.size() == 1 && Ranges.front().second == nullptr)
continue;
- }
// Handle multiple DBG_VALUE instructions describing one variable.
RegVar->setDotDebugLocOffset(DotDebugLocEntries.size());
@@ -1205,6 +1248,11 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg() &&
!Begin->getOperand(0).getReg())
continue;
+ DEBUG(dbgs() << "DotDebugLoc Pair:\n" << "\t" << *Begin);
+ if (End != nullptr)
+ DEBUG(dbgs() << "\t" << *End);
+ else
+ DEBUG(dbgs() << "\tNULL\n");
const MCSymbol *StartLabel = getLabelBeforeInsn(Begin);
assert(StartLabel && "Forgot label before DBG_VALUE starting a range!");
@@ -1218,8 +1266,6 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
EndLabel = getLabelBeforeInsn(std::next(I)->first);
assert(EndLabel && "Forgot label after instruction ending a range!");
- DEBUG(dbgs() << "DotDebugLoc Pair:\n"
- << "\t" << *Begin << "\t" << *End << "\n");
DebugLocEntry Loc(StartLabel, EndLabel, getDebugLocValue(Begin), TheCU);
if (DebugLoc.empty() || !DebugLoc.back().Merge(Loc))
DebugLoc.push_back(std::move(Loc));
@@ -1233,11 +1279,11 @@ DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
assert(DV.isVariable());
if (!Processed.insert(DV))
continue;
- if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
- addScopeVariable(
- Scope,
- new DbgVariable(DV, findAbstractVariable(DV, Scope->getScopeNode()),
- this));
+ if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) {
+ ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
+ ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this));
+ addScopeVariable(Scope, ConcreteVariables.back().get());
+ }
}
}
@@ -1371,6 +1417,10 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
if (!MMI->hasDebugInfo())
return;
+ auto DI = FunctionDIs.find(MF->getFunction());
+ if (DI == FunctionDIs.end())
+ return;
+
// Grab the lexical scopes for the function, if we don't have any of those
// then we're not going to be able to do anything.
LScopes.initialize(*MF);
@@ -1386,6 +1436,14 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// belongs to so that we add to the correct per-cu line table in the
// non-asm case.
LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ // FnScope->getScopeNode() and DI->second should represent the same function,
+ // though they may not be the same MDNode. Inline functions merged in LTO can
+ // leave debug info metadata that still differs - either because of genuine
+ // source differences (two versions of a linkonce_odr function written/copied
+ // into two separate files), or because of sub-optimal metadata that isn't
+ // structurally identical (see: file path/name info from clang, which
+ // includes the directory of the cpp file being built, even when the file
+ // name is absolute (such as an <> lookup header)).
DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
assert(TheCU && "Unable to find compile unit!");
if (Asm->OutStreamer.hasRawTextSupport())
@@ -1440,6 +1498,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
}
void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+ if (addCurrentFnArgument(Var, LS))
+ return;
SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
DIVariable DV = Var->getVariable();
// Variables with positive arg numbers are parameters.
@@ -1481,7 +1541,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
assert(CurFn == MF);
assert(CurFn != nullptr);
- if (!MMI->hasDebugInfo() || LScopes.empty()) {
+ if (!MMI->hasDebugInfo() || LScopes.empty() ||
+ !FunctionDIs.count(MF->getFunction())) {
// If we don't have a lexical scope for this function then there will
// be a hole in the range information. Keep note of this by setting the
// previously used section to nullptr.
@@ -1517,7 +1578,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
assert(DV && DV.isVariable());
if (!ProcessedVars.insert(DV))
continue;
- findAbstractVariable(DV, DV.getContext());
+ ensureAbstractVariableIsCreated(DV, DV.getContext());
}
constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
@@ -1536,12 +1597,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
// Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the
// DbgVariables except those that are also in AbstractVariables (since they
// can be used cross-function)
- for (const auto &I : ScopeVariables)
- for (const auto *Var : I.second)
- if (!AbstractVariables.count(Var->getVariable()) || Var->getAbstractVariable())
- delete Var;
ScopeVariables.clear();
- DeleteContainerPointers(CurrentFnArguments);
+ CurrentFnArguments.clear();
DbgValues.clear();
LabelsBeforeInsn.clear();
LabelsAfterInsn.clear();
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 2f5abc8..ffe4843 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -27,6 +27,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MachineLocation.h"
@@ -71,16 +72,21 @@ class DbgVariable {
DIVariable Var; // Variable Descriptor.
DIE *TheDIE; // Variable DIE.
unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
- DbgVariable *AbsVar; // Corresponding Abstract variable, if any.
const MachineInstr *MInsn; // DBG_VALUE instruction of the variable.
int FrameIndex;
DwarfDebug *DD;
public:
- // AbsVar may be NULL.
- DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD)
- : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), AbsVar(AV),
- MInsn(nullptr), FrameIndex(~0), DD(DD) {}
+ /// Construct a DbgVariable from a DIVariable.
+ DbgVariable(DIVariable V, DwarfDebug *DD)
+ : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(nullptr),
+ FrameIndex(~0), DD(DD) {}
+
+ /// Construct a DbgVariable from a DEBUG_VALUE.
+ DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD)
+ : Var(DbgValue->getDebugVariable()), TheDIE(nullptr),
+ DotDebugLocOffset(~0U), MInsn(DbgValue), FrameIndex(~0), DD(DD) {}
// Accessors.
DIVariable getVariable() const { return Var; }
@@ -89,9 +95,7 @@ public:
void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; }
unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; }
StringRef getName() const { return Var.getName(); }
- DbgVariable *getAbstractVariable() const { return AbsVar; }
const MachineInstr *getMInsn() const { return MInsn; }
- void setMInsn(const MachineInstr *M) { MInsn = M; }
int getFrameIndex() const { return FrameIndex; }
void setFrameIndex(int FI) { FrameIndex = FI; }
// Translate tag to proper Dwarf tag.
@@ -200,6 +204,7 @@ class DwarfDebug : public AsmPrinterHandler {
// Collection of abstract variables.
DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+ SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
// can refer to them in spite of insertions into this list.
@@ -325,6 +330,8 @@ class DwarfDebug : public AsmPrinterHandler {
DwarfAccelTable AccelNamespace;
DwarfAccelTable AccelTypes;
+ DenseMap<const Function *, DISubprogram> FunctionDIs;
+
MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
@@ -334,8 +341,14 @@ class DwarfDebug : public AsmPrinterHandler {
}
/// \brief Find abstract variable associated with Var.
- DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc);
- DbgVariable *findAbstractVariable(DIVariable &Var, const MDNode *Scope);
+ DbgVariable *getExistingAbstractVariable(const DIVariable &DV,
+ DIVariable &Cleansed);
+ DbgVariable *getExistingAbstractVariable(const DIVariable &DV);
+ void createAbstractVariable(const DIVariable &DV, LexicalScope *Scope);
+ void ensureAbstractVariableIsCreated(const DIVariable &Var,
+ const MDNode *Scope);
+ void ensureAbstractVariableIsCreatedIfScoped(const DIVariable &Var,
+ const MDNode *Scope);
/// \brief Find DIE for the given subprogram and attach appropriate
/// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
@@ -389,6 +402,8 @@ class DwarfDebug : public AsmPrinterHandler {
/// \brief Collect info for variables that were optimized out.
void collectDeadVariables();
+ void finishVariableDefinitions();
+
void finishSubprogramDefinitions();
/// \brief Finish off debug information after all functions have been
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index f792482..0440fce 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -14,138 +14,14 @@
#ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
#define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
-#include "AsmPrinterHandler.h"
-#include "llvm/ADT/DenseMap.h"
+#include "EHStreamer.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include <vector>
namespace llvm {
-
-template <typename T> class SmallVectorImpl;
-struct LandingPadInfo;
-class MachineModuleInfo;
-class MachineInstr;
class MachineFunction;
-class MCAsmInfo;
-class MCExpr;
-class MCSymbol;
-class Function;
class ARMTargetStreamer;
-class AsmPrinter;
-
-//===----------------------------------------------------------------------===//
-/// DwarfException - Emits Dwarf exception handling directives.
-///
-class DwarfException : public AsmPrinterHandler {
-protected:
- /// Asm - Target of Dwarf emission.
- AsmPrinter *Asm;
-
- /// MMI - Collected machine module information.
- MachineModuleInfo *MMI;
-
- /// SharedTypeIds - How many leading type ids two landing pads have in common.
- static unsigned SharedTypeIds(const LandingPadInfo *L,
- const LandingPadInfo *R);
-
- /// PadRange - Structure holding a try-range and the associated landing pad.
- struct PadRange {
- // The index of the landing pad.
- unsigned PadIndex;
- // The index of the begin and end labels in the landing pad's label lists.
- unsigned RangeIndex;
- };
-
- typedef DenseMap<MCSymbol *, PadRange> RangeMapType;
-
- /// ActionEntry - Structure describing an entry in the actions table.
- struct ActionEntry {
- int ValueForTypeID; // The value to write - may not be equal to the type id.
- int NextAction;
- unsigned Previous;
- };
-
- /// CallSiteEntry - Structure describing an entry in the call-site table.
- struct CallSiteEntry {
- // The 'try-range' is BeginLabel .. EndLabel.
- MCSymbol *BeginLabel; // zero indicates the start of the function.
- MCSymbol *EndLabel; // zero indicates the end of the function.
-
- // The landing pad starts at PadLabel.
- MCSymbol *PadLabel; // zero indicates that there is no landing pad.
- unsigned Action;
- };
-
- /// ComputeActionsTable - Compute the actions table and gather the first
- /// action index for each landing pad site.
- unsigned ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
- SmallVectorImpl<ActionEntry> &Actions,
- SmallVectorImpl<unsigned> &FirstActions);
-
- /// CallToNoUnwindFunction - Return `true' if this is a call to a function
- /// marked `nounwind'. Return `false' otherwise.
- bool CallToNoUnwindFunction(const MachineInstr *MI);
-
- /// ComputeCallSiteTable - Compute the call-site table. The entry for an
- /// invoke has a try-range containing the call, a non-zero landing pad and an
- /// appropriate action. The entry for an ordinary call has a try-range
- /// containing the call and zero for the landing pad and the action. Calls
- /// marked 'nounwind' have no entry and must not be contained in the try-range
- /// of any entry - they form gaps in the table. Entries must be ordered by
- /// try-range address.
- void ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
- const RangeMapType &PadMap,
- const SmallVectorImpl<const LandingPadInfo *> &LPs,
- const SmallVectorImpl<unsigned> &FirstActions);
-
- /// EmitExceptionTable - Emit landing pads and actions.
- ///
- /// The general organization of the table is complex, but the basic concepts
- /// are easy. First there is a header which describes the location and
- /// organization of the three components that follow.
- /// 1. The landing pad site information describes the range of code covered
- /// by the try. In our case it's an accumulation of the ranges covered
- /// by the invokes in the try. There is also a reference to the landing
- /// pad that handles the exception once processed. Finally an index into
- /// the actions table.
- /// 2. The action table, in our case, is composed of pairs of type ids
- /// and next action offset. Starting with the action index from the
- /// landing pad site, each type Id is checked for a match to the current
- /// exception. If it matches then the exception and type id are passed
- /// on to the landing pad. Otherwise the next action is looked up. This
- /// chain is terminated with a next action of zero. If no type id is
- /// found the frame is unwound and handling continues.
- /// 3. Type id table contains references to all the C++ typeinfo for all
- /// catches in the function. This tables is reversed indexed base 1.
- void EmitExceptionTable();
-
- virtual void EmitTypeInfos(unsigned TTypeEncoding);
-
-public:
- //===--------------------------------------------------------------------===//
- // Main entry points.
- //
- DwarfException(AsmPrinter *A);
- virtual ~DwarfException();
-
- /// endModule - Emit all exception information that should come after the
- /// content.
- void endModule() override;
-
- /// beginFunction - Gather pre-function exception information. Assumes being
- /// emitted immediately after the function entry point.
- void beginFunction(const MachineFunction *MF) override;
-
- /// endFunction - Gather and emit post-function exception information.
- void endFunction(const MachineFunction *) override;
-
- // We don't need these.
- void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
- void beginInstruction(const MachineInstr *MI) override {}
- void endInstruction() override {}
-};
-class DwarfCFIException : public DwarfException {
+class DwarfCFIException : public EHStreamer {
/// shouldEmitPersonality - Per-function flag to indicate if .cfi_personality
/// should be emitted.
bool shouldEmitPersonality;
@@ -179,8 +55,8 @@ public:
void endFunction(const MachineFunction *) override;
};
-class ARMException : public DwarfException {
- void EmitTypeInfos(unsigned TTypeEncoding) override;
+class ARMException : public EHStreamer {
+ void emitTypeInfos(unsigned TTypeEncoding) override;
ARMTargetStreamer &getTargetStreamer();
/// shouldEmitCFI - Per-function flag to indicate if frame CFI info
@@ -206,7 +82,7 @@ public:
void endFunction(const MachineFunction *) override;
};
-class Win64Exception : public DwarfException {
+class Win64Exception : public EHStreamer {
/// shouldEmitPersonality - Per-function flag to indicate if personality
/// info should be emitted.
bool shouldEmitPersonality;
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index a70c0f7..9538bee 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1071,6 +1071,8 @@ std::string DwarfUnit::getParentContextString(DIScope Context) const {
I != E; ++I) {
DIScope Ctx = *I;
StringRef Name = Ctx.getName();
+ if (Name.empty() && Ctx.isNameSpace())
+ Name = "(anonymous namespace)";
if (!Name.empty()) {
CS += Name;
CS += "::";
@@ -1359,12 +1361,13 @@ DIE *DwarfUnit::getOrCreateNameSpace(DINameSpace NS) {
return NDie;
DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
- if (!NS.getName().empty()) {
+ StringRef Name = NS.getName();
+ if (!Name.empty())
addString(NDie, dwarf::DW_AT_name, NS.getName());
- DD->addAccelNamespace(NS.getName(), NDie);
- addGlobalName(NS.getName(), NDie, NS.getContext());
- } else
- DD->addAccelNamespace("(anonymous namespace)", NDie);
+ else
+ Name = "(anonymous namespace)";
+ DD->addAccelNamespace(Name, NDie);
+ addGlobalName(Name, NDie, NS.getContext());
addSourceLine(NDie, NS);
return &NDie;
}
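Both namespace hunks above funnel empty names through the same "(anonymous namespace)" placeholder, so the accelerator table, the global-name table, and the parent context string all agree. A standalone mimic of the getParentContextString() behavior (illustrative only; the real code substitutes the placeholder just for namespace scopes):

#include <cassert>
#include <string>
#include <vector>

// Each element is a namespace name from the outermost scope inward; empty
// names render as "(anonymous namespace)" instead of being dropped.
std::string parentContextString(const std::vector<std::string> &Namespaces) {
  std::string CS;
  for (const std::string &NS : Namespaces) {
    const std::string Name = NS.empty() ? "(anonymous namespace)" : NS;
    CS += Name;
    CS += "::";
  }
  return CS;
}

int main() {
  assert(parentContextString({"outer", ""}) ==
         "outer::(anonymous namespace)::");
}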
@@ -1382,14 +1385,14 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
// Add subprogram definitions to the CU die directly.
ContextDIE = &getUnitDie();
- // Build the decl now to ensure it preceeds the definition.
+ // Build the decl now to ensure it precedes the definition.
getOrCreateSubprogramDIE(SPDecl);
}
// DW_TAG_inlined_subroutine may refer to this DIE.
DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
- // Abort here and fill this in later, depending on whether or not this
+ // Stop here and fill this in later, depending on whether or not this
// subprogram turns out to have inlined instances or not.
if (SP.isDefinition())
return &SPDie;
@@ -1398,12 +1401,21 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
return &SPDie;
}
+void DwarfUnit::applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie) {
+ DISubprogram SPDecl = SP.getFunctionDeclaration();
+ DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext());
+ applySubprogramAttributes(SP, SPDie);
+ addGlobalName(SP.getName(), SPDie, Context);
+}
+
void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) {
DIE *DeclDie = nullptr;
StringRef DeclLinkageName;
if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
DeclDie = getDIE(SPDecl);
- assert(DeclDie);
+ assert(DeclDie && "This DIE should've already been constructed when the "
+ "definition DIE was created in "
+ "getOrCreateSubprogramDIE");
DeclLinkageName = SPDecl.getLinkageName();
}
@@ -1502,6 +1514,17 @@ void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) {
addFlag(SPDie, dwarf::DW_AT_explicit);
}
+void DwarfUnit::applyVariableAttributes(const DbgVariable &Var,
+ DIE &VariableDie) {
+ StringRef Name = Var.getName();
+ if (!Name.empty())
+ addString(VariableDie, dwarf::DW_AT_name, Name);
+ addSourceLine(VariableDie, Var.getVariable());
+ addType(VariableDie, Var.getType());
+ if (Var.isArtificial())
+ addFlag(VariableDie, dwarf::DW_AT_artificial);
+}
+
// Return const expression if value is a GEP to access merged global
// constant. e.g.
// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
@@ -1665,10 +1688,8 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
DD->addAccelName(GV.getLinkageName(), AddrDIE);
}
- if (!GV.isLocalToUnit())
- addGlobalName(GV.getName(),
- VariableSpecDIE ? *VariableSpecDIE : *VariableDIE,
- GV.getContext());
+ addGlobalName(GV.getName(), VariableSpecDIE ? *VariableSpecDIE : *VariableDIE,
+ GV.getContext());
}
/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
@@ -1777,24 +1798,13 @@ std::unique_ptr<DIE> DwarfUnit::constructVariableDIE(DbgVariable &DV,
std::unique_ptr<DIE> DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV,
bool Abstract) {
- StringRef Name = DV.getName();
-
// Define variable debug information entry.
auto VariableDie = make_unique<DIE>(DV.getTag());
- DbgVariable *AbsVar = DV.getAbstractVariable();
- if (AbsVar && AbsVar->getDIE())
- addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, *AbsVar->getDIE());
- else {
- if (!Name.empty())
- addString(*VariableDie, dwarf::DW_AT_name, Name);
- addSourceLine(*VariableDie, DV.getVariable());
- addType(*VariableDie, DV.getType());
- if (DV.isArtificial())
- addFlag(*VariableDie, dwarf::DW_AT_artificial);
- }
- if (Abstract)
+ if (Abstract) {
+ applyVariableAttributes(DV, *VariableDie);
return VariableDie;
+ }
// Add variable address.
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index acb7528..b7b83b2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -400,6 +400,8 @@ public:
DIE *getOrCreateSubprogramDIE(DISubprogram SP);
void applySubprogramAttributes(DISubprogram SP, DIE &SPDie);
+ void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie);
+ void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie);
/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
/// given DIType.
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 3a12c73..73f62bf 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//===-- CodeGen/AsmPrinter/EHStreamer.cpp - Exception Directive Streamer --===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,45 +7,31 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains support for writing DWARF exception info into asm files.
+// This file contains support for writing exception info into assembly files.
//
//===----------------------------------------------------------------------===//
-#include "DwarfException.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Twine.h"
+#include "EHStreamer.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+
using namespace llvm;
-DwarfException::DwarfException(AsmPrinter *A)
- : Asm(A), MMI(Asm->MMI) {}
+EHStreamer::EHStreamer(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
-DwarfException::~DwarfException() {}
+EHStreamer::~EHStreamer() {}
-/// SharedTypeIds - How many leading type ids two landing pads have in common.
-unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L,
- const LandingPadInfo *R) {
+/// How many leading type ids two landing pads have in common.
+unsigned EHStreamer::sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R) {
const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
unsigned LSize = LIds.size(), RSize = RIds.size();
unsigned MinSize = LSize < RSize ? LSize : RSize;
@@ -58,10 +44,10 @@ unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L,
return Count;
}
-/// ComputeActionsTable - Compute the actions table and gather the first action
-/// index for each landing pad site.
-unsigned DwarfException::
-ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
+/// Compute the actions table and gather the first action index for each landing
+/// pad site.
+unsigned EHStreamer::
+computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
SmallVectorImpl<ActionEntry> &Actions,
SmallVectorImpl<unsigned> &FirstActions) {
@@ -109,7 +95,7 @@ ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
I = LandingPads.begin(), E = LandingPads.end(); I != E; ++I) {
const LandingPadInfo *LPI = *I;
const std::vector<int> &TypeIds = LPI->TypeIds;
- unsigned NumShared = PrevLPI ? SharedTypeIds(LPI, PrevLPI) : 0;
+ unsigned NumShared = PrevLPI ? sharedTypeIDs(LPI, PrevLPI) : 0;
unsigned SizeSiteActions = 0;
if (NumShared < TypeIds.size()) {
@@ -167,9 +153,9 @@ ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
return SizeActions;
}
-/// CallToNoUnwindFunction - Return `true' if this is a call to a function
-/// marked `nounwind'. Return `false' otherwise.
-bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) {
+/// Return `true' if this is a call to a function marked `nounwind'. Return
+/// `false' otherwise.
+bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) {
assert(MI->isCall() && "This should be a call instruction!");
bool MarkedNoUnwind = false;
@@ -201,15 +187,14 @@ bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) {
return MarkedNoUnwind;
}
-/// ComputeCallSiteTable - Compute the call-site table. The entry for an invoke
-/// has a try-range containing the call, a non-zero landing pad, and an
-/// appropriate action. The entry for an ordinary call has a try-range
-/// containing the call and zero for the landing pad and the action. Calls
-/// marked 'nounwind' have no entry and must not be contained in the try-range
-/// of any entry - they form gaps in the table. Entries must be ordered by
-/// try-range address.
-void DwarfException::
-ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+/// Compute the call-site table. The entry for an invoke has a try-range
+/// containing the call, a non-zero landing pad, and an appropriate action. The
+/// entry for an ordinary call has a try-range containing the call and zero for
+/// the landing pad and the action. Calls marked 'nounwind' have no entry and
+/// must not be contained in the try-range of any entry - they form gaps in the
+/// table. Entries must be ordered by try-range address.
+void EHStreamer::
+computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
const RangeMapType &PadMap,
const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
const SmallVectorImpl<unsigned> &FirstActions) {
@@ -228,7 +213,7 @@ ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
for (const auto &MI : MBB) {
if (!MI.isEHLabel()) {
if (MI.isCall())
- SawPotentiallyThrowing |= !CallToNoUnwindFunction(&MI);
+ SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI);
continue;
}
@@ -308,7 +293,7 @@ ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
}
}
-/// EmitExceptionTable - Emit landing pads and actions.
+/// Emit landing pads and actions.
///
/// The general organization of the table is complex, but the basic concepts are
/// easy. First there is a header which describes the location and organization
@@ -328,7 +313,7 @@ ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
/// unwound and handling continues.
/// 3. Type ID table contains references to all the C++ typeinfo for all
/// catches in the function. This table is reverse indexed, base 1.
-void DwarfException::EmitExceptionTable() {
+void EHStreamer::emitExceptionTable() {
const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
@@ -350,7 +335,8 @@ void DwarfException::EmitExceptionTable() {
// landing pad site.
SmallVector<ActionEntry, 32> Actions;
SmallVector<unsigned, 64> FirstActions;
- unsigned SizeActions=ComputeActionsTable(LandingPads, Actions, FirstActions);
+ unsigned SizeActions =
+ computeActionsTable(LandingPads, Actions, FirstActions);
// Invokes and nounwind calls have entries in PadMap (due to being bracketed
// by try-range labels when lowered). Ordinary calls do not, so appropriate
@@ -368,7 +354,7 @@ void DwarfException::EmitExceptionTable() {
// Compute the call-site table.
SmallVector<CallSiteEntry, 64> CallSites;
- ComputeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
+ computeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
// Final tallies.
@@ -657,12 +643,12 @@ void DwarfException::EmitExceptionTable() {
Asm->EmitSLEB128(Action.NextAction);
}
- EmitTypeInfos(TTypeEncoding);
+ emitTypeInfos(TTypeEncoding);
Asm->EmitAlignment(2);
}
-void DwarfException::EmitTypeInfos(unsigned TTypeEncoding) {
+void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
@@ -703,19 +689,18 @@ void DwarfException::EmitTypeInfos(unsigned TTypeEncoding) {
}
}
-/// endModule - Emit all exception information that should come after the
-/// content.
-void DwarfException::endModule() {
+/// Emit all exception information that should come after the content.
+void EHStreamer::endModule() {
llvm_unreachable("Should be implemented");
}
-/// beginFunction - Gather pre-function exception information. Assumes it's
-/// being emitted immediately after the function entry point.
-void DwarfException::beginFunction(const MachineFunction *MF) {
+/// Gather pre-function exception information. Assumes it's being emitted
+/// immediately after the function entry point.
+void EHStreamer::beginFunction(const MachineFunction *MF) {
llvm_unreachable("Should be implemented");
}
-/// endFunction - Gather and emit post-function exception information.
-void DwarfException::endFunction(const MachineFunction *) {
+/// Gather and emit post-function exception information.
+void EHStreamer::endFunction(const MachineFunction *) {
llvm_unreachable("Should be implemented");
}
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
new file mode 100644
index 0000000..2b6ba78
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -0,0 +1,138 @@
+//===-- EHStreamer.h - Exception Handling Directive Streamer ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing exception info into assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H
+#define LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H
+
+#include "AsmPrinterHandler.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+struct LandingPadInfo;
+class MachineModuleInfo;
+class MachineInstr;
+class MachineFunction;
+class AsmPrinter;
+
+template <typename T>
+class SmallVectorImpl;
+
+/// Emits exception handling directives.
+class EHStreamer : public AsmPrinterHandler {
+protected:
+ /// Target of directive emission.
+ AsmPrinter *Asm;
+
+ /// Collected machine module information.
+ MachineModuleInfo *MMI;
+
+ /// How many leading type ids two landing pads have in common.
+ static unsigned sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R);
+
+ /// Structure holding a try-range and the associated landing pad.
+ struct PadRange {
+ // The index of the landing pad.
+ unsigned PadIndex;
+ // The index of the begin and end labels in the landing pad's label lists.
+ unsigned RangeIndex;
+ };
+
+ typedef DenseMap<MCSymbol *, PadRange> RangeMapType;
+
+ /// Structure describing an entry in the actions table.
+ struct ActionEntry {
+ int ValueForTypeID; // The value to write - may not be equal to the type id.
+ int NextAction;
+ unsigned Previous;
+ };
+
+ /// Structure describing an entry in the call-site table.
+ struct CallSiteEntry {
+ // The 'try-range' is BeginLabel .. EndLabel.
+ MCSymbol *BeginLabel; // zero indicates the start of the function.
+ MCSymbol *EndLabel; // zero indicates the end of the function.
+
+ // The landing pad starts at PadLabel.
+ MCSymbol *PadLabel; // zero indicates that there is no landing pad.
+ unsigned Action;
+ };
+
+ /// Compute the actions table and gather the first action index for each
+ /// landing pad site.
+ unsigned computeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions);
+
+ /// Return `true' if this is a call to a function marked `nounwind'. Return
+ /// `false' otherwise.
+ bool callToNoUnwindFunction(const MachineInstr *MI);
+
+ /// Compute the call-site table. The entry for an invoke has a try-range
+ /// containing the call, a non-zero landing pad and an appropriate action.
+ /// The entry for an ordinary call has a try-range containing the call and
+ /// zero for the landing pad and the action. Calls marked 'nounwind' have
+ /// no entry and must not be contained in the try-range of any entry - they
+ /// form gaps in the table. Entries must be ordered by try-range address.
+ void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+ const RangeMapType &PadMap,
+ const SmallVectorImpl<const LandingPadInfo *> &LPs,
+ const SmallVectorImpl<unsigned> &FirstActions);
+
+ /// Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+ /// 1. The landing pad site information describes the range of code covered
+ /// by the try. In our case it's an accumulation of the ranges covered
+ /// by the invokes in the try. There is also a reference to the landing
+ /// pad that handles the exception once processed. Finally an index into
+ /// the actions table.
+ /// 2. The action table, in our case, is composed of pairs of type ids
+ /// and next action offset. Starting with the action index from the
+ /// landing pad site, each type Id is checked for a match to the current
+ /// exception. If it matches then the exception and type id are passed
+ /// on to the landing pad. Otherwise the next action is looked up. This
+ /// chain is terminated with a next action of zero. If no type id is
+ /// found the frame is unwound and handling continues.
+ /// 3. Type id table contains references to all the C++ typeinfo for all
+ /// catches in the function. This table is reverse indexed, base 1.
+ void emitExceptionTable();
+
+ virtual void emitTypeInfos(unsigned TTypeEncoding);
+
+public:
+ EHStreamer(AsmPrinter *A);
+ virtual ~EHStreamer();
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes being emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ // Unused.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+ void beginInstruction(const MachineInstr *MI) override {}
+ void endInstruction() override {}
+};
+}
+
+#endif
+
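The rename from DwarfException to EHStreamer leaves a classic template-method split: the call-site/action/typeinfo table emission is shared, and only the emitTypeInfos() hook varies per target (ARM EHABI vs. DWARF). A stripped-down mimic of the resulting shape (not LLVM code):

// Illustrative only: emitExceptionTable() is the shared algorithm; the
// emitTypeInfos() hook is the one piece ARMException overrides.
class EHStreamerLike {
public:
  virtual ~EHStreamerLike() {}
  void emitExceptionTable() {
    // ... emit the call-site and action tables (shared logic) ...
    emitTypeInfos(/*TTypeEncoding=*/0);
  }

protected:
  virtual void emitTypeInfos(unsigned TTypeEncoding) {
    (void)TTypeEncoding; // default DWARF-style typeinfo emission
  }
};

class ARMExceptionLike : public EHStreamerLike {
  void emitTypeInfos(unsigned TTypeEncoding) override {
    (void)TTypeEncoding; // ARM EHABI-specific typeinfo emission
  }
};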
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index 17d8bff..81285d5 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -38,9 +38,8 @@
using namespace llvm;
Win64Exception::Win64Exception(AsmPrinter *A)
- : DwarfException(A),
- shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
- {}
+ : EHStreamer(A), shouldEmitPersonality(false), shouldEmitLSDA(false),
+ shouldEmitMoves(false) {}
Win64Exception::~Win64Exception() {}
@@ -73,14 +72,14 @@ void Win64Exception::beginFunction(const MachineFunction *MF) {
if (!shouldEmitPersonality && !shouldEmitMoves)
return;
- Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym);
+ Asm->OutStreamer.EmitWinCFIStartProc(Asm->CurrentFnSym);
if (!shouldEmitPersonality)
return;
- MCSymbol *GCCHandlerSym =
- Asm->GetExternalSymbolSymbol("_GCC_specific_handler");
- Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true);
+ const MCSymbol *PersHandlerSym =
+ TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI);
+ Asm->OutStreamer.EmitWinEHHandler(PersHandlerSym, true, true);
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
Asm->getFunctionNumber()));
@@ -99,17 +98,10 @@ void Win64Exception::endFunction(const MachineFunction *) {
MMI->TidyLandingPads();
if (shouldEmitPersonality) {
- const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
- const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
- const MCSymbol *Sym =
- TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI);
-
Asm->OutStreamer.PushSection();
- Asm->OutStreamer.EmitWin64EHHandlerData();
- Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext),
- 4);
- EmitExceptionTable();
+ Asm->OutStreamer.EmitWinEHHandlerData();
+ emitExceptionTable();
Asm->OutStreamer.PopSection();
}
- Asm->OutStreamer.EmitWin64EHEndProc();
+ Asm->OutStreamer.EmitWinCFIEndProc();
}
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
index 2212941..6a5c431 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
@@ -308,7 +308,7 @@ void WinCodeViewLineTables::endFunction(const MachineFunction *MF) {
return;
const Function *GV = MF->getFunction();
- assert(FnDebugInfo.count(GV) == true);
+ assert(FnDebugInfo.count(GV));
assert(CurFn == &FnDebugInfo[GV]);
if (CurFn->Instrs.empty()) {
diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
index d995333..421946d 100644
--- a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
+++ b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
@@ -21,17 +21,19 @@
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
using namespace llvm;
#define DEBUG_TYPE "arm-atomic-expand"
namespace {
class AtomicExpandLoadLinked : public FunctionPass {
- const TargetLowering *TLI;
+ const TargetMachine *TM;
public:
static char ID; // Pass identification, replacement for typeid
explicit AtomicExpandLoadLinked(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TLI(TM ? TM->getTargetLowering() : nullptr) {
+ : FunctionPass(ID), TM(TM) {
initializeAtomicExpandLoadLinkedPass(*PassRegistry::getPassRegistry());
}
@@ -50,29 +52,16 @@ namespace {
char AtomicExpandLoadLinked::ID = 0;
char &llvm::AtomicExpandLoadLinkedID = AtomicExpandLoadLinked::ID;
-
-static void *initializeAtomicExpandLoadLinkedPassOnce(PassRegistry &Registry) {
- PassInfo *PI = new PassInfo(
- "Expand Atomic calls in terms of load-linked & store-conditional",
- "atomic-ll-sc", &AtomicExpandLoadLinked::ID,
- PassInfo::NormalCtor_t(callDefaultCtor<AtomicExpandLoadLinked>), false,
- false, PassInfo::TargetMachineCtor_t(
- callTargetMachineCtor<AtomicExpandLoadLinked>));
- Registry.registerPass(*PI, true);
- return PI;
-}
-
-void llvm::initializeAtomicExpandLoadLinkedPass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializeAtomicExpandLoadLinkedPassOnce)
-}
-
+INITIALIZE_TM_PASS(AtomicExpandLoadLinked, "atomic-ll-sc",
+ "Expand Atomic calls in terms of load-linked & store-conditional",
+ false, false)
FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) {
return new AtomicExpandLoadLinked(TM);
}
bool AtomicExpandLoadLinked::runOnFunction(Function &F) {
- if (!TLI)
+ if (!TM || !TM->getSubtargetImpl()->enableAtomicExpandLoadLinked())
return false;
SmallVector<Instruction *, 1> AtomicInsts;
@@ -89,7 +78,7 @@ bool AtomicExpandLoadLinked::runOnFunction(Function &F) {
bool MadeChange = false;
for (Instruction *Inst : AtomicInsts) {
- if (!TLI->shouldExpandAtomicInIR(Inst))
+ if (!TM->getTargetLowering()->shouldExpandAtomicInIR(Inst))
continue;
if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
@@ -111,13 +100,14 @@ bool AtomicExpandLoadLinked::expandAtomicLoad(LoadInst *LI) {
// Load instructions don't actually need a leading fence, even in the
// SequentiallyConsistent case.
AtomicOrdering MemOpOrder =
- TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering();
+ TM->getTargetLowering()->getInsertFencesForAtomic() ? Monotonic
+ : LI->getOrdering();
// The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is
// an ldrexd (A3.5.3).
IRBuilder<> Builder(LI);
- Value *Val =
- TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder);
+ Value *Val = TM->getTargetLowering()->emitLoadLinked(
+ Builder, LI->getPointerOperand(), MemOpOrder);
insertTrailingFence(Builder, LI->getOrdering());
@@ -178,7 +168,8 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) {
// Start the main loop block now that we've taken care of the preliminaries.
Builder.SetInsertPoint(LoopBB);
- Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *Loaded =
+ TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder);
Value *NewVal;
switch (AI->getOperation()) {
@@ -195,7 +186,7 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) {
NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new");
break;
case AtomicRMWInst::Nand:
- NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()),
+ NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()),
"new");
break;
case AtomicRMWInst::Or:
@@ -224,8 +215,8 @@ bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) {
llvm_unreachable("Unknown atomic op");
}
- Value *StoreSuccess =
- TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
+ Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional(
+ Builder, NewVal, Addr, MemOpOrder);
Value *TryAgain = Builder.CreateICmpNE(
StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
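The Nand change above is a semantic bug fix riding along with the refactor: atomicrmw nand is defined as ~(old & val), but the previous lowering computed old & ~val. A standalone check of the difference (not LLVM code):

#include <cassert>
#include <cstdint>

// Old (incorrect) lowering: And(Loaded, Not(Val)) == Loaded & ~Val.
uint32_t nandOld(uint32_t Loaded, uint32_t Val) { return Loaded & ~Val; }

// New lowering: Not(And(Loaded, Val)) == ~(Loaded & Val), a true NAND.
uint32_t nandNew(uint32_t Loaded, uint32_t Val) { return ~(Loaded & Val); }

int main() {
  assert(nandNew(0xFFu, 0x0Fu) == 0xFFFFFFF0u); // ~(0xFF & 0x0F)
  assert(nandOld(0xFFu, 0x0Fu) == 0x000000F0u); // differs whenever bits overlap
}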
@@ -256,19 +247,26 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// %loaded = @load.linked(%addr)
// %should_store = icmp eq %loaded, %desired
// br i1 %should_store, label %cmpxchg.trystore,
- // label %cmpxchg.end/%cmpxchg.barrier
+ // label %cmpxchg.failure
// cmpxchg.trystore:
// %stored = @store_conditional(%new, %addr)
- // %try_again = icmp i32 ne %stored, 0
- // br i1 %try_again, label %loop, label %cmpxchg.end
- // cmpxchg.barrier:
+ // %success = icmp eq i32 %stored, 0
+ // br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure
+ // cmpxchg.success:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.failure:
// fence?
// br label %cmpxchg.end
// cmpxchg.end:
+ // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
+ // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
// [...]
BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
- auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, ExitBB);
- auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, BarrierBB);
+ auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
+ auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB);
+ auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
// This grabs the DebugLoc from CI
@@ -284,37 +282,82 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// Start the main loop block now that we've taken care of the preliminaries.
Builder.SetInsertPoint(LoopBB);
- Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *Loaded =
+ TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder);
Value *ShouldStore =
Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
- BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB;
Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
Builder.SetInsertPoint(TryStoreBB);
- Value *StoreSuccess = TLI->emitStoreConditional(
+ Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional(
Builder, CI->getNewValOperand(), Addr, MemOpOrder);
- Value *TryAgain = Builder.CreateICmpNE(
+ StoreSuccess = Builder.CreateICmpEQ(
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
- Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);
+ Builder.CreateCondBr(StoreSuccess, SuccessBB,
+ CI->isWeak() ? FailureBB : LoopBB);
- // Finally, make sure later instructions don't get reordered with a fence if
- // necessary.
- Builder.SetInsertPoint(BarrierBB);
+ // Make sure later instructions don't get reordered with a fence if necessary.
+ Builder.SetInsertPoint(SuccessBB);
insertTrailingFence(Builder, SuccessOrder);
Builder.CreateBr(ExitBB);
- CI->replaceAllUsesWith(Loaded);
- CI->eraseFromParent();
+ Builder.SetInsertPoint(FailureBB);
+ insertTrailingFence(Builder, FailureOrder);
+ Builder.CreateBr(ExitBB);
+
+ // Finally, we have control-flow based knowledge of whether the cmpxchg
+ // succeeded or not. We expose this to later passes by converting any
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.
+ // Set up the builder so we can create any PHIs we need.
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+ Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+
+ // Look for any users of the cmpxchg that are just comparing the loaded value
+ // against the desired one, and replace them with the CFG-derived version.
+ SmallVector<ExtractValueInst *, 2> PrunedInsts;
+ for (auto User : CI->users()) {
+ ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
+ if (!EV)
+ continue;
+
+ assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
+ "weird extraction from { iN, i1 }");
+
+ if (EV->getIndices()[0] == 0)
+ EV->replaceAllUsesWith(Loaded);
+ else
+ EV->replaceAllUsesWith(Success);
+
+ PrunedInsts.push_back(EV);
+ }
+
+ // We can remove the instructions now that we're no longer iterating through them.
+ for (auto EV : PrunedInsts)
+ EV->eraseFromParent();
+
+ if (!CI->use_empty()) {
+ // Some use of the full struct return that we don't understand has happened,
+ // so we've got to reconstruct it properly.
+ Value *Res;
+ Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ }
+
+ CI->eraseFromParent();
return true;
}
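With the success/failure blocks and the PHI in place, the expanded cmpxchg exposes both the loaded value and a success bit, matching the { iN, i1 } struct that cmpxchg now returns. Loosely, this is the same pair std::atomic already exposes at the C++ level (a hedged analogy, not what this pass emits):

#include <atomic>
#include <cassert>

// compare_exchange_strong returns the success flag (the i1 half) and writes
// the value it observed back through 'Expected' (the iN half).
bool tryBump(std::atomic<int> &A, int &Expected) {
  return A.compare_exchange_strong(Expected, Expected + 1);
}

int main() {
  std::atomic<int> A(5);
  int Expected = 5;
  assert(tryBump(A, Expected) && A.load() == 6);  // succeeds: 5 -> 6
  Expected = 0;                                   // stale expectation
  assert(!tryBump(A, Expected) && Expected == 6); // fails, observes 6
}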
AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder,
AtomicOrdering Ord) {
- if (!TLI->getInsertFencesForAtomic())
+ if (!TM->getTargetLowering()->getInsertFencesForAtomic())
return Ord;
if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
@@ -327,7 +370,7 @@ AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder,
void AtomicExpandLoadLinked::insertTrailingFence(IRBuilder<> &Builder,
AtomicOrdering Ord) {
- if (!TLI->getInsertFencesForAtomic())
+ if (!TM->getTargetLowering()->getInsertFencesForAtomic())
return;
if (Ord == Acquire || Ord == AcquireRelease)
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 7f31b1a..b2737bf 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -39,6 +39,9 @@ class BasicTTI final : public ImmutablePass, public TargetTransformInfo {
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+ /// Estimate the cost overhead of SK_Alternate shuffle.
+ unsigned getAltShuffleOverhead(Type *Ty) const;
+
const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
public:
@@ -327,8 +330,28 @@ unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return OpCost;
}
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+ assert(Ty->isVectorTy() && "Can only shuffle vectors");
+ unsigned Cost = 0;
+ // Shuffle cost is equal to the cost of extracting each element from its
+ // source vector plus the cost of inserting it into the result vector.
+
+ // e.g. a <4 x float> shuffle with mask <0,5,2,7>: we extract index 0 of the
+ // first vector, index 1 of the second, index 2 of the first, and finally
+ // index 3 of the second, then insert them at indices <0,1,2,3> of the
+ // result vector.
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+ return Cost;
+}
+
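Under this model an alternate shuffle costs one extract plus one insert per result lane. A standalone arithmetic check (illustrative; the real per-lane costs come from getVectorInstrCost):

#include <cassert>

// Cost-model sketch: each result lane pays one extract from a source vector
// and one insert into the result vector.
unsigned altShuffleOverhead(unsigned NumElts, unsigned ExtractCost,
                            unsigned InsertCost) {
  unsigned Cost = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    Cost += ExtractCost + InsertCost;
  return Cost;
}

int main() {
  // <4 x float> with unit per-lane costs: 4 * (1 + 1) == 8.
  assert(altShuffleOverhead(4, 1, 1) == 8);
}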
unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
+ if (Kind == SK_Alternate) {
+ return getAltShuffleOverhead(Tp);
+ }
return 1;
}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index f623a48..7503e57 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1505,10 +1505,17 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
if (MO.isUse()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Uses.insert(*AI);
- } else if (!MO.isDead())
- // Don't try to hoist code in the rare case the terminator defines a
- // register that is later used.
- return MBB->end();
+ } else {
+ if (!MO.isDead())
+ // Don't try to hoist code in the rare case the terminator defines a
+ // register that is later used.
+ return MBB->end();
+
+ // If the terminator defines a register, make sure we don't hoist
+ // the instruction whose def might be clobbered by the terminator.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Defs.insert(*AI);
+ }
}
if (Uses.empty())
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 0b492a9..57c24e8 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -22,11 +22,13 @@ add_llvm_library(LLVMCodeGen
GCMetadata.cpp
GCMetadataPrinter.cpp
GCStrategy.cpp
+ GlobalMerge.cpp
IfConversion.cpp
InlineSpiller.cpp
InterferenceCache.cpp
IntrinsicLowering.cpp
JITCodeEmitter.cpp
+ JumpInstrTables.cpp
LLVMTargetMachine.cpp
LatencyPriorityQueue.cpp
LexicalScopes.cpp
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 6aa60c6..ccac40c 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -151,19 +151,8 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
}
char CodeGenPrepare::ID = 0;
-static void *initializeCodeGenPreparePassOnce(PassRegistry &Registry) {
- initializeTargetLibraryInfoPass(Registry);
- PassInfo *PI = new PassInfo(
- "Optimize for code generation", "codegenprepare", &CodeGenPrepare::ID,
- PassInfo::NormalCtor_t(callDefaultCtor<CodeGenPrepare>), false, false,
- PassInfo::TargetMachineCtor_t(callTargetMachineCtor<CodeGenPrepare>));
- Registry.registerPass(*PI, true);
- return PI;
-}
-
-void llvm::initializeCodeGenPreparePass(PassRegistry &Registry) {
- CALL_ONCE_INITIALIZATION(initializeCodeGenPreparePassOnce)
-}
+INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare",
+ "Optimize for code generation", false, false)
FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
return new CodeGenPrepare(TM);
@@ -1078,8 +1067,11 @@ void ExtAddrMode::print(raw_ostream &OS) const {
NeedPlus = true;
}
- if (BaseOffs)
- OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
+ if (BaseOffs) {
+ OS << (NeedPlus ? " + " : "")
+ << BaseOffs;
+ NeedPlus = true;
+ }
if (BaseReg) {
OS << (NeedPlus ? " + " : "")
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 822636f..d3ffcc7 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -146,8 +146,8 @@ static const SDep *CriticalPathStep(const SUnit *SU) {
void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
// It's not safe to change register allocation for source operands of
- // that have special allocation requirements. Also assume all registers
- // used in a call must not be changed (ABI).
+ // instructions that have special allocation requirements. Also assume all
+ // registers used in a call must not be changed (ABI).
// FIXME: The issue with predicated instruction is more complex. We are being
// conservative here because the kill markers cannot be trusted after
// if-conversion:
@@ -200,6 +200,28 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
RegRefs.insert(std::make_pair(Reg, &MO));
+ // If this reg is tied and live (Classes[Reg] is set to -1), we can't change
+ // it or any of its sub or super regs. We need to use KeepRegs to mark the
+ // reg because not all uses of the same reg within an instruction are
+ // necessarily tagged as tied.
+ // Example: an x86 "xor %eax, %eax" will have one source operand tied to the
+ // def register but not the second (see PR20020 for details).
+ // FIXME: can this check be relaxed to account for undef uses
+ // of a register? In the above 'xor' example, the uses of %eax are undef, so
+ // earlier instructions could still replace %eax even though the 'xor'
+ // itself can't be changed.
+ if (MI->isRegTiedToUseOperand(i) &&
+ Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs) {
+ KeepRegs.set(*SubRegs);
+ }
+ for (MCSuperRegIterator SuperRegs(Reg, TRI);
+ SuperRegs.isValid(); ++SuperRegs) {
+ KeepRegs.set(*SuperRegs);
+ }
+ }
+
if (MO.isUse() && Special) {
if (!KeepRegs.test(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
@@ -236,9 +258,15 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
unsigned Reg = MO.getReg();
if (Reg == 0) continue;
if (!MO.isDef()) continue;
+
+ // If we've already marked this reg as unchangeable, carry on.
+ if (KeepRegs.test(Reg)) continue;
+
// Ignore two-addr defs.
if (MI->isRegTiedToUseOperand(i)) continue;
+ // FIXME: we should use a SubRegIterator that includes self (as above), so
+ // we don't have to repeat all this code for the reg itself.
DefIndices[Reg] = Count;
KillIndices[Reg] = ~0u;
assert(((KillIndices[Reg] == ~0u) !=
@@ -281,6 +309,9 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
RegRefs.insert(std::make_pair(Reg, &MO));
+ // FIXME: we should use an MCRegAliasIterator that includes self so we don't
+ // have to repeat all this code for the reg itself.
+
// It wasn't previously live but now it is, this is a kill.
if (KillIndices[Reg] == ~0u) {
KillIndices[Reg] = Count;
@@ -309,7 +340,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
// the two-address instruction also defines NewReg, as may happen with
// pre/postincrement loads. In this case, both the use and def operands are in
// RegRefs because the def is inserted by PrescanInstruction and not erased
-// during ScanInstruction. So checking for an instructions with definitions of
+// during ScanInstruction. So checking for an instruction with definitions of
// both NewReg and AntiDepReg covers it.
bool
CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
@@ -325,7 +356,7 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
if (RefOper->isDef() && RefOper->isEarlyClobber())
return true;
- // Handle cases in which this instructions defines NewReg.
+ // Handle cases in which this instruction defines NewReg.
MachineInstr *MI = RefOper->getParent();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &CheckOper = MI->getOperand(i);
@@ -343,11 +374,11 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
return true;
// Don't allow an instruction using AntiDepReg to be earlyclobbered by
- // NewReg
+ // NewReg.
if (CheckOper.isEarlyClobber())
return true;
- // Don't allow inline asm to define NewReg at all. Who know what it's
+ // Don't allow inline asm to define NewReg at all. Who knows what it's
// doing with it.
if (MI->isInlineAsm())
return true;
@@ -494,8 +525,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// as we go to help determine which registers are available.
unsigned Broken = 0;
unsigned Count = InsertPosIndex - 1;
- for (MachineBasicBlock::iterator I = End, E = Begin;
- I != E; --Count) {
+ for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) {
MachineInstr *MI = --I;
if (MI->isDebugValue())
continue;
@@ -526,7 +556,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// Don't break anti-dependencies on non-allocatable registers.
AntiDepReg = 0;
else if (KeepRegs.test(AntiDepReg))
- // Don't break anti-dependencies if an use down below requires
+ // Don't break anti-dependencies if a use down below requires
// this exact register.
AntiDepReg = 0;
else {
@@ -564,8 +594,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// If MI's defs have a special allocation requirement, don't allow
// any def registers to be changed. Also assume all registers
// defined in a call must not be changed (ABI).
- if (MI->isCall() || MI->hasExtraDefRegAllocReq() ||
- TII->isPredicated(MI))
+ if (MI->isCall() || MI->hasExtraDefRegAllocReq() || TII->isPredicated(MI))
// If this instruction's defs have special allocation requirement, don't
// break this anti-dependency.
AntiDepReg = 0;
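
The two FIXMEs in this file ask for the repeated sub/super-register walks to be factored out. A hedged sketch of that consolidation (markRegAndAliases is a hypothetical helper, not part of this patch; the iterators are the ones used in the hunks above):

    #include "llvm/ADT/BitVector.h"
    #include "llvm/MC/MCRegisterInfo.h"

    // Mark Reg plus all of its sub- and super-registers as unchangeable.
    static void markRegAndAliases(unsigned Reg, const llvm::MCRegisterInfo &MRI,
                                  llvm::BitVector &KeepRegs) {
      // Sub-registers, including Reg itself.
      for (llvm::MCSubRegIterator SubRegs(Reg, &MRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        KeepRegs.set(*SubRegs);
      // Super-registers (Reg itself was already handled above).
      for (llvm::MCSuperRegIterator SuperRegs(Reg, &MRI);
           SuperRegs.isValid(); ++SuperRegs)
        KeepRegs.set(*SuperRegs);
    }
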
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 1949a48..45e4ff5 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -55,12 +55,12 @@ class TargetRegisterInfo;
typedef std::multimap<unsigned, MachineOperand *>::const_iterator
RegRefIter;
- /// KillIndices - The index of the most recent kill (proceding bottom-up),
+ /// KillIndices - The index of the most recent kill (proceeding bottom-up),
/// or ~0u if the register is not live.
std::vector<unsigned> KillIndices;
- /// DefIndices - The index of the most recent complete def (proceding bottom
- /// up), or ~0u if the register is live.
+ /// DefIndices - The index of the most recent complete def (proceeding
+ /// bottom up), or ~0u if the register is live.
std::vector<unsigned> DefIndices;
/// KeepRegs - A set of registers which are live and cannot be changed to
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 990d067..027ee38 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -64,6 +64,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -72,7 +73,7 @@ using namespace llvm;
#define DEBUG_TYPE "global-merge"
cl::opt<bool>
-EnableGlobalMerge("global-merge", cl::Hidden,
+EnableGlobalMerge("enable-global-merge", cl::Hidden,
cl::desc("Enable global merge pass"),
cl::init(true));
@@ -81,6 +82,13 @@ EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
cl::desc("Enable global merge pass on constants"),
cl::init(false));
+// FIXME: this could be a transitional option, and we should probably remove
+// it once we are sure this optimization always benefits all targets.
+static cl::opt<bool>
+EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
+ cl::desc("Enable global merge pass on external linkage"),
+ cl::init(false));
+
STATISTIC(NumMerged , "Number of globals merged");
namespace {
class GlobalMerge : public FunctionPass {
@@ -129,9 +137,8 @@ namespace {
} // end anonymous namespace
char GlobalMerge::ID = 0;
-INITIALIZE_PASS(GlobalMerge, "global-merge",
- "Global Merge", false, false)
-
+INITIALIZE_TM_PASS(GlobalMerge, "global-merge", "Merge global variables",
+ false, false)
bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst, unsigned AddrSpace) const {
@@ -154,11 +161,23 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ assert(Globals.size() > 1);
+
+ // FIXME: This simple solution merges as many globals together as possible.
+ // However, with this solution it would be hard to remove dead global symbols
+ // at link time. An alternative would be to check global symbol references
+ // function by function, and merge only the symbols referenced within the
+ // same function; that would probably require a heuristic to resolve merge
+ // conflicts between different functions.
for (size_t i = 0, e = Globals.size(); i != e; ) {
size_t j = 0;
uint64_t MergedSize = 0;
std::vector<Type*> Tys;
std::vector<Constant*> Inits;
+
+ bool HasExternal = false;
+ GlobalVariable *TheFirstExternal = 0;
for (j = i; j != e; ++j) {
Type *Ty = Globals[j]->getType()->getElementType();
MergedSize += DL->getTypeAllocSize(Ty);
@@ -167,17 +186,35 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
}
Tys.push_back(Ty);
Inits.push_back(Globals[j]->getInitializer());
+
+ if (Globals[j]->hasExternalLinkage() && !HasExternal) {
+ HasExternal = true;
+ TheFirstExternal = Globals[j];
+ }
}
+ // If the merged variables don't have external linkage, we don't need to
+ // expose the symbol after merging.
+ GlobalValue::LinkageTypes Linkage = HasExternal
+ ? GlobalValue::ExternalLinkage
+ : GlobalValue::InternalLinkage;
+
StructType *MergedTy = StructType::get(M.getContext(), Tys);
Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
- GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
- GlobalValue::InternalLinkage,
- MergedInit, "_MergedGlobals",
- nullptr,
- GlobalVariable::NotThreadLocal,
- AddrSpace);
+
+ // If the merged variables have external linkage, we use the symbol name of
+ // the first merged variable as the suffix of the global symbol name. This
+ // avoids link-time naming conflicts for the merged globals.
+ GlobalVariable *MergedGV = new GlobalVariable(
+ M, MergedTy, isConst, Linkage, MergedInit,
+ HasExternal ? "_MergedGlobals_" + TheFirstExternal->getName()
+ : "_MergedGlobals",
+ nullptr, GlobalVariable::NotThreadLocal, AddrSpace);
+
for (size_t k = i; k < j; ++k) {
+ GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
+ std::string Name = Globals[k]->getName();
+
Constant *Idx[2] = {
ConstantInt::get(Int32Ty, 0),
ConstantInt::get(Int32Ty, k-i)
@@ -185,6 +222,14 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
Globals[k]->replaceAllUsesWith(GEP);
Globals[k]->eraseFromParent();
+
+ if (Linkage != GlobalValue::InternalLinkage) {
+ // Generate a new alias...
+ auto *PTy = cast<PointerType>(GEP->getType());
+ GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+ Linkage, Name, GEP, &M);
+ }
+
NumMerged++;
}
i = j;
@@ -245,8 +290,12 @@ bool GlobalMerge::doInitialization(Module &M) {
// Grab all non-const globals.
for (Module::global_iterator I = M.global_begin(),
E = M.global_end(); I != E; ++I) {
- // Merge is safe for "normal" internal globals only
- if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection())
+ // Merge is safe for "normal" internal or external globals only
+ if (I->isDeclaration() || I->isThreadLocal() || I->hasSection())
+ continue;
+
+ if (!(EnableGlobalMergeOnExternal && I->hasExternalLinkage()) &&
+ !I->hasInternalLinkage())
continue;
PointerType *PT = dyn_cast<PointerType>(I->getType());
@@ -270,8 +319,7 @@ bool GlobalMerge::doInitialization(Module &M) {
continue;
if (DL->getTypeAllocSize(Ty) < MaxOffset) {
- if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine())
- .isBSSLocal())
+ if (TargetLoweringObjectFile::getKindForGlobal(I, *TM).isBSSLocal())
BSSGlobals[AddressSpace].push_back(I);
else if (I->isConstant())
ConstGlobals[AddressSpace].push_back(I);
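
For readers following the external-linkage path above: the patch keeps external symbols alive by pointing them at their slot in the merged struct. A hedged, self-contained restatement of that step for one global (the function name is illustrative; the pass only creates the alias for non-internal linkage, and the API calls mirror the ones in the hunk):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Replace G with slot Slot of MergedGV and re-expose its name as an alias.
    static void aliasIntoMergedGlobal(Module &M, GlobalVariable *MergedGV,
                                      GlobalVariable *G, unsigned Slot) {
      Type *Int32Ty = Type::getInt32Ty(M.getContext());
      // Remember linkage and name before the original global disappears.
      GlobalValue::LinkageTypes Linkage = G->getLinkage();
      std::string Name = G->getName();

      Constant *Idx[2] = {ConstantInt::get(Int32Ty, 0),
                          ConstantInt::get(Int32Ty, Slot)};
      Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
      G->replaceAllUsesWith(GEP);
      G->eraseFromParent();

      // Keep the original symbol visible to other TUs by aliasing its slot
      // (the pass skips this for internal linkage).
      auto *PTy = cast<PointerType>(GEP->getType());
      GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
                          Linkage, Name, GEP, &M);
    }
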
diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp
new file mode 100644
index 0000000..61ef722
--- /dev/null
+++ b/lib/CodeGen/JumpInstrTables.cpp
@@ -0,0 +1,301 @@
+//===-- JumpInstrTables.cpp: Jump-Instruction Tables ----------------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief An implementation of jump-instruction tables.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jt"
+
+#include "llvm/CodeGen/JumpInstrTables.h"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/JumpInstrTableInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <vector>
+
+using namespace llvm;
+
+char JumpInstrTables::ID = 0;
+
+INITIALIZE_PASS_BEGIN(JumpInstrTables, "jump-instr-tables",
+ "Jump-Instruction Tables", true, true)
+INITIALIZE_PASS_DEPENDENCY(JumpInstrTableInfo);
+INITIALIZE_PASS_END(JumpInstrTables, "jump-instr-tables",
+ "Jump-Instruction Tables", true, true)
+
+STATISTIC(NumJumpTables, "Number of indirect call tables generated");
+STATISTIC(NumFuncsInJumpTables, "Number of functions in the jump tables");
+
+ModulePass *llvm::createJumpInstrTablesPass() {
+ // The default implementation uses a single table for all functions.
+ return new JumpInstrTables(JumpTable::Single);
+}
+
+ModulePass *llvm::createJumpInstrTablesPass(JumpTable::JumpTableType JTT) {
+ return new JumpInstrTables(JTT);
+}
+
+namespace {
+static const char jump_func_prefix[] = "__llvm_jump_instr_table_";
+static const char jump_section_prefix[] = ".jump.instr.table.text.";
+
+// Checks to see if a given CallSite is making an indirect call, including
+// cases where the indirect call is made through a bitcast.
+bool isIndirectCall(CallSite &CS) {
+ if (CS.getCalledFunction())
+ return false;
+
+ // Check the value to see if it is merely a bitcast of a function. In
+ // this case, it will translate to a direct function call in the resulting
+ // assembly, so we won't treat it as an indirect call here.
+ const Value *V = CS.getCalledValue();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ return !(CE->isCast() && isa<Function>(CE->getOperand(0)));
+ }
+
+ // Otherwise, since we know it's a call, it must be an indirect call.
+ return true;
+}
+
+// Replaces Functions and GlobalAliases with a different Value.
+bool replaceGlobalValueIndirectUse(GlobalValue *GV, Value *V, Use *U) {
+ User *Us = U->getUser();
+ if (!Us)
+ return false;
+ if (Instruction *I = dyn_cast<Instruction>(Us)) {
+ CallSite CS(I);
+
+ // Don't do the replacement if this use is a direct call to this function.
+ // If the use is not the called value, then replace it.
+ if (CS && (isIndirectCall(CS) || CS.isCallee(U))) {
+ return false;
+ }
+
+ U->set(V);
+ } else if (Constant *C = dyn_cast<Constant>(Us)) {
+ // Don't replace calls to bitcasts of function symbols, since they get
+ // translated to direct calls.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Us)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ // This bitcast must have exactly one user.
+ if (CE->user_begin() != CE->user_end()) {
+ User *ParentUs = *CE->user_begin();
+ if (CallInst *CI = dyn_cast<CallInst>(ParentUs)) {
+ CallSite CS(CI);
+ Use &CEU = *CE->use_begin();
+ if (CS.isCallee(&CEU)) {
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ // GlobalAlias doesn't support replaceUsesOfWithOnConstant, and the verifier
+ // requires an alias to point to a defined function. So, GlobalAlias is
+ // handled as a separate case in runOnModule.
+ if (!isa<GlobalAlias>(C))
+ C->replaceUsesOfWithOnConstant(GV, V, U);
+ } else {
+ assert(false && "The Use of a Function symbol is neither an instruction nor"
+ " a constant");
+ }
+
+ return true;
+}
+
+// Replaces all replaceable address-taken uses of GV with a pointer to a
+// jump-instruction table entry.
+void replaceValueWithFunction(GlobalValue *GV, Function *F) {
+ // Go through all uses of this function and replace the uses of GV with the
+ // jump-table version of the function. Advance the use iterator before
+ // replacing each use, since replacement changes the use list and would
+ // otherwise invalidate the iterator.
+ for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E;) {
+ Use &U = *I++;
+
+ // Replacement of constants replaces all instances in the constant. So, some
+ // uses might have already been handled by the time we reach them here.
+ if (U.get() == GV)
+ replaceGlobalValueIndirectUse(GV, F, &U);
+ }
+
+ return;
+}
+} // end anonymous namespace
+
+JumpInstrTables::JumpInstrTables()
+ : ModulePass(ID), Metadata(), JITI(nullptr), TableCount(0),
+ JTType(JumpTable::Single) {
+ initializeJumpInstrTablesPass(*PassRegistry::getPassRegistry());
+}
+
+JumpInstrTables::JumpInstrTables(JumpTable::JumpTableType JTT)
+ : ModulePass(ID), Metadata(), JITI(nullptr), TableCount(0), JTType(JTT) {
+ initializeJumpInstrTablesPass(*PassRegistry::getPassRegistry());
+}
+
+JumpInstrTables::~JumpInstrTables() {}
+
+void JumpInstrTables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<JumpInstrTableInfo>();
+}
+
+Function *JumpInstrTables::insertEntry(Module &M, Function *Target) {
+ FunctionType *OrigFunTy = Target->getFunctionType();
+ FunctionType *FunTy = transformType(OrigFunTy);
+
+ JumpMap::iterator it = Metadata.find(FunTy);
+ if (Metadata.end() == it) {
+ struct TableMeta Meta;
+ Meta.TableNum = TableCount;
+ Meta.Count = 0;
+ Metadata[FunTy] = Meta;
+ it = Metadata.find(FunTy);
+ ++NumJumpTables;
+ ++TableCount;
+ }
+
+ it->second.Count++;
+
+ std::string NewName(jump_func_prefix);
+ NewName += (Twine(it->second.TableNum) + "_" + Twine(it->second.Count)).str();
+ Function *JumpFun =
+ Function::Create(OrigFunTy, GlobalValue::ExternalLinkage, NewName, &M);
+ // The section for this table
+ JumpFun->setSection((jump_section_prefix + Twine(it->second.TableNum)).str());
+ JITI->insertEntry(FunTy, Target, JumpFun);
+
+ ++NumFuncsInJumpTables;
+ return JumpFun;
+}
+
+bool JumpInstrTables::hasTable(FunctionType *FunTy) {
+ FunctionType *TransTy = transformType(FunTy);
+ return Metadata.end() != Metadata.find(TransTy);
+}
+
+FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) {
+ // Returning nullptr forces all types into the same table, since all types map
+ // to the same type.
+ Type *VoidPtrTy = Type::getInt8PtrTy(FunTy->getContext());
+
+ // Ignore the return type.
+ Type *RetTy = VoidPtrTy;
+ bool IsVarArg = FunTy->isVarArg();
+ std::vector<Type *> ParamTys(FunTy->getNumParams());
+ FunctionType::param_iterator PI, PE;
+ int i = 0;
+
+ std::vector<Type *> EmptyParams;
+ Type *Int32Ty = Type::getInt32Ty(FunTy->getContext());
+ FunctionType *VoidFnTy = FunctionType::get(
+ Type::getVoidTy(FunTy->getContext()), EmptyParams, false);
+ switch (JTType) {
+ case JumpTable::Single:
+ return FunctionType::get(RetTy, EmptyParams, false);
+ case JumpTable::Arity:
+ // Transform all types to void* so that all functions with the same arity
+ // end up in the same table.
+ for (PI = FunTy->param_begin(), PE = FunTy->param_end(); PI != PE;
+ PI++, i++) {
+ ParamTys[i] = VoidPtrTy;
+ }
+
+ return FunctionType::get(RetTy, ParamTys, IsVarArg);
+ case JumpTable::Simplified:
+ // Project all parameter types to one of three types: composite, integer,
+ // and function, matching the three subclasses of Type.
+ for (PI = FunTy->param_begin(), PE = FunTy->param_end(); PI != PE;
+ ++PI, ++i) {
+ assert((isa<IntegerType>(*PI) || isa<FunctionType>(*PI) ||
+ isa<CompositeType>(*PI)) &&
+ "This type is not an Integer or a Composite or a Function");
+ if (isa<CompositeType>(*PI)) {
+ ParamTys[i] = VoidPtrTy;
+ } else if (isa<FunctionType>(*PI)) {
+ ParamTys[i] = VoidFnTy;
+ } else if (isa<IntegerType>(*PI)) {
+ ParamTys[i] = Int32Ty;
+ }
+ }
+
+ return FunctionType::get(RetTy, ParamTys, IsVarArg);
+ case JumpTable::Full:
+ // Don't transform this type at all.
+ return FunTy;
+ }
+
+ return nullptr;
+}
+
+bool JumpInstrTables::runOnModule(Module &M) {
+ // Make sure the module is well-formed, especially with respect to the
+ // jumptable attribute.
+ if (verifyModule(M))
+ return false;
+
+ JITI = &getAnalysis<JumpInstrTableInfo>();
+
+ // Get the set of jumptable-annotated functions.
+ DenseMap<Function *, Function *> Functions;
+ for (Function &F : M) {
+ if (F.hasFnAttribute(Attribute::JumpTable)) {
+ assert(F.hasUnnamedAddr() &&
+ "Attribute 'jumptable' requires 'unnamed_addr'");
+ Functions[&F] = nullptr;
+ }
+ }
+
+ // Create the jump-table functions.
+ for (auto &KV : Functions) {
+ Function *F = KV.first;
+ KV.second = insertEntry(M, F);
+ }
+
+ // GlobalAlias is a special case, because the target of an alias statement
+ // must be a defined function. So, instead of replacing a given function in
+ // the alias, we replace all uses of aliases that target jumptable functions.
+ // Note that there's no need to create these functions, since only aliases
+ // that target known jumptable functions are replaced, and there's no way to
+ // put the jumptable annotation on a global alias.
+ DenseMap<GlobalAlias *, Function *> Aliases;
+ for (GlobalAlias &GA : M.aliases()) {
+ Constant *Aliasee = GA.getAliasee();
+ if (Function *F = dyn_cast<Function>(Aliasee)) {
+ auto it = Functions.find(F);
+ if (it != Functions.end()) {
+ Aliases[&GA] = it->second;
+ }
+ }
+ }
+
+ // Replace each address-taken function with its jump-instruction table entry.
+ for (auto &KV : Functions)
+ replaceValueWithFunction(KV.first, KV.second);
+
+ for (auto &KV : Aliases)
+ replaceValueWithFunction(KV.first, KV.second);
+
+ return !Functions.empty();
+}
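
transformType above is the heart of the table partitioning. A worked sketch of the bucket each JumpTableType produces, for a hypothetical signature i32 (i8*, i64) (i8* is a CompositeType and i64 an IntegerType, so the Simplified case's assert holds; the function is illustrative only):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    #include <vector>

    using namespace llvm;

    static void sketchTableBuckets(LLVMContext &Ctx) {
      Type *I8Ptr = Type::getInt8PtrTy(Ctx);
      Type *I32 = Type::getInt32Ty(Ctx);
      Type *I64 = Type::getInt64Ty(Ctx);

      // Original signature: i32 (i8*, i64).
      std::vector<Type *> Orig = {I8Ptr, I64};
      FunctionType *FunTy = FunctionType::get(I32, Orig, /*isVarArg=*/false);

      // JumpTable::Single     -> i8* ():         every function, one table.
      FunctionType *SingleTy = FunctionType::get(I8Ptr, false);
      // JumpTable::Arity      -> i8* (i8*, i8*): grouped by parameter count.
      std::vector<Type *> ArityParams = {I8Ptr, I8Ptr};
      FunctionType *ArityTy = FunctionType::get(I8Ptr, ArityParams, false);
      // JumpTable::Simplified -> i8* (i8*, i32): composite->i8*, integer->i32.
      std::vector<Type *> SimplParams = {I8Ptr, I32};
      FunctionType *SimplTy = FunctionType::get(I8Ptr, SimplParams, false);
      // JumpTable::Full       -> i32 (i8*, i64): unchanged, table per signature.
      FunctionType *FullTy = FunTy;

      (void)SingleTy; (void)ArityTy; (void)SimplTy; (void)FullTy;
    }
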
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index a5ac057..df96b94 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -12,11 +12,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/JumpInstrTables.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -82,6 +86,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
bool DisableVerify,
AnalysisID StartAfter,
AnalysisID StopAfter) {
+
// Add internal analysis passes from the target machine.
TM->addAnalysisPasses(PM);
@@ -136,6 +141,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
bool DisableVerify,
AnalysisID StartAfter,
AnalysisID StopAfter) {
+ // Passes to handle jumptable function annotations. These can't be handled at
+ // JIT time, so we don't add them directly to addPassesToGenerateCode.
+ PM.add(createJumpInstrTableInfoPass());
+ PM.add(createJumpInstrTablesPass(Options.JTType));
+
// Add common CodeGen passes.
MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
StartAfter, StopAfter);
@@ -199,7 +209,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
case CGFT_Null:
// The Null output is intended for use for performance analysis and testing,
// not real users.
- AsmStreamer.reset(createNullStreamer(*Context));
+ AsmStreamer.reset(getTarget().createNullStreamer(*Context));
break;
}
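
A hedged sketch of the knob that feeds the new pass insertions above, assuming the JumpTable::JumpTableType enum introduced alongside this series in llvm/Target/TargetOptions.h:

    #include "llvm/Target/TargetOptions.h"

    // Opt a compilation in to jump-instruction tables before constructing the
    // TargetMachine; addPassesToEmitFile then schedules JumpInstrTableInfo and
    // JumpInstrTables with this table type.
    void enableSingleJumpTable(llvm::TargetOptions &Options) {
      Options.JTType = llvm::JumpTable::Single;
    }
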
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 388f58f..7d5646b 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -329,12 +329,13 @@ class LDVImpl {
void computeIntervals();
public:
- LDVImpl(LiveDebugVariables *ps) : pass(*ps), EmitDone(false),
- ModifiedMF(false) {}
+ LDVImpl(LiveDebugVariables *ps)
+ : pass(*ps), MF(nullptr), EmitDone(false), ModifiedMF(false) {}
bool runOnMachineFunction(MachineFunction &mf);
/// clear - Release all memory.
void clear() {
+ MF = nullptr;
userValues.clear();
virtRegToEqClass.clear();
userVarMap.clear();
@@ -693,11 +694,11 @@ void LDVImpl::computeIntervals() {
}
bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
+ clear();
MF = &mf;
LIS = &pass.getAnalysis<LiveIntervals>();
MDT = &pass.getAnalysis<MachineDominatorTree>();
TRI = mf.getTarget().getRegisterInfo();
- clear();
LS.initialize(mf);
DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
<< mf.getName() << " **********\n");
@@ -712,6 +713,8 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
if (!EnableLDV)
return false;
+ if (!FunctionDIs.count(mf.getFunction()))
+ return false;
if (!pImpl)
pImpl = new LDVImpl(this);
return static_cast<LDVImpl*>(pImpl)->runOnMachineFunction(mf);
@@ -974,6 +977,8 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
+ if (!MF)
+ return;
const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
@@ -988,6 +993,10 @@ void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) {
static_cast<LDVImpl*>(pImpl)->emitDebugValues(VRM);
}
+bool LiveDebugVariables::doInitialization(Module &M) {
+ FunctionDIs = makeSubprogramMap(M);
+ return Pass::doInitialization(M);
+}
#ifndef NDEBUG
void LiveDebugVariables::dump() {
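
The gating introduced above, building the subprogram map once per module and then skipping functions without debug info, is a reusable pattern; a minimal sketch (DebugInfoGate is a hypothetical wrapper, makeSubprogramMap is the helper used in the hunk):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/DebugInfo.h"
    #include "llvm/IR/Module.h"

    struct DebugInfoGate {
      llvm::DenseMap<const llvm::Function *, llvm::DISubprogram> FunctionDIs;

      // Call once per module, as LiveDebugVariables::doInitialization does.
      void init(llvm::Module &M) { FunctionDIs = llvm::makeSubprogramMap(M); }

      // True if F was compiled with debug info and is worth processing.
      bool shouldProcess(const llvm::Function *F) const {
        return FunctionDIs.count(F) != 0;
      }
    };
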
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index bb67435..7ec0d17 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -22,6 +22,7 @@
#define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
namespace llvm {
@@ -32,6 +33,7 @@ class VirtRegMap;
class LiveDebugVariables : public MachineFunctionPass {
void *pImpl;
+ DenseMap<const Function*, DISubprogram> FunctionDIs;
public:
static char ID; // Pass identification, replacement for typeid
@@ -64,6 +66,7 @@ private:
bool runOnMachineFunction(MachineFunction &) override;
void releaseMemory() override;
void getAnalysisUsage(AnalysisUsage &) const override;
+ bool doInitialization(Module &) override;
};
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 3563f8e..1559560 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -186,6 +186,7 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
LRCalc->createDeadDefs(LI);
LRCalc->extendToUses(LI);
+ computeDeadValues(&LI, LI, nullptr, nullptr);
}
void LiveIntervals::computeVirtRegs() {
@@ -412,21 +413,34 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// Handle dead values.
bool CanSeparate = false;
+ computeDeadValues(li, NewLR, &CanSeparate, dead);
+
+ // Move the trimmed segments back.
+ li->segments.swap(NewLR.segments);
+ DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+ return CanSeparate;
+}
+
+void LiveIntervals::computeDeadValues(LiveInterval *li,
+ LiveRange &LR,
+ bool *CanSeparate,
+ SmallVectorImpl<MachineInstr*> *dead) {
for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
I != E; ++I) {
VNInfo *VNI = *I;
if (VNI->isUnused())
continue;
- LiveRange::iterator LRI = NewLR.FindSegmentContaining(VNI->def);
- assert(LRI != NewLR.end() && "Missing segment for PHI");
+ LiveRange::iterator LRI = LR.FindSegmentContaining(VNI->def);
+ assert(LRI != LR.end() && "Missing segment for PHI");
if (LRI->end != VNI->def.getDeadSlot())
continue;
if (VNI->isPHIDef()) {
// This is a dead PHI. Remove it.
VNI->markUnused();
- NewLR.removeSegment(LRI->start, LRI->end);
+ LR.removeSegment(LRI->start, LRI->end);
DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
- CanSeparate = true;
+ if (CanSeparate)
+ *CanSeparate = true;
} else {
// This is a dead def. Make sure the instruction knows.
MachineInstr *MI = getInstructionFromIndex(VNI->def);
@@ -438,11 +452,6 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
}
}
}
-
- // Move the trimmed segments back.
- li->segments.swap(NewLR.segments);
- DEBUG(dbgs() << "Shrunk: " << *li << '\n');
- return CanSeparate;
}
void LiveIntervals::extendToIndices(LiveRange &LR,
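
computeDeadValues is now shared between interval construction and shrinkToUses, with optional out-parameters; a tiny sketch of that convention (illustrative names):

    // Callers that don't care pass nullptr, as computeVirtRegInterval does
    // above; shrinkToUses passes &CanSeparate and the dead-instruction list.
    static void reportDeadPHI(bool FoundDeadPHI, bool *CanSeparate) {
      if (FoundDeadPHI && CanSeparate)
        *CanSeparate = true; // Write through only when the caller asked.
    }
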
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 0ec5c33..08fef5f 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -332,7 +332,7 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
}
}
-void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) {
+void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const {
OS << "BB#" << getNumber();
}
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index eb3d71f..6138aef 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -457,7 +457,7 @@ unsigned MachineFunction::addLiveIn(unsigned PReg,
/// getJTISymbol - Return the MCSymbol for the specified non-empty jump table.
/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
/// normal 'L' label is returned.
-MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
+MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
bool isLinkerPrivate) const {
const DataLayout *DL = getTarget().getDataLayout();
assert(JumpTableInfo && "No jump tables");
@@ -530,10 +530,9 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
///
int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
unsigned Alignment) {
- Alignment =
- clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
- !RealignOption,
- Alignment, getFrameLowering()->getStackAlignment());
+ Alignment = clampStackAlignment(
+ !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment,
+ getFrameLowering()->getStackAlignment());
CreateStackObject(Size, Alignment, true);
int Index = (int)Objects.size() - NumFixedObjects - 1;
ensureMaxAlignment(Alignment);
@@ -548,10 +547,9 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
const AllocaInst *Alloca) {
HasVarSizedObjects = true;
- Alignment =
- clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
- !RealignOption,
- Alignment, getFrameLowering()->getStackAlignment());
+ Alignment = clampStackAlignment(
+ !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment,
+ getFrameLowering()->getStackAlignment());
Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca));
ensureMaxAlignment(Alignment);
return (int)Objects.size()-NumFixedObjects-1;
@@ -571,16 +569,30 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
// object is 16-byte aligned.
unsigned StackAlign = getFrameLowering()->getStackAlignment();
unsigned Align = MinAlign(SPOffset, StackAlign);
- Align =
- clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
- !RealignOption,
- Align, getFrameLowering()->getStackAlignment());
+ Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+ !RealignOption,
+ Align, getFrameLowering()->getStackAlignment());
Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
/*isSS*/ false,
/*Alloca*/ nullptr));
return -++NumFixedObjects;
}
+/// CreateFixedSpillStackObject - Create a spill slot at a fixed location
+/// on the stack. Returns an index with a negative value.
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
+ int64_t SPOffset) {
+ unsigned StackAlign = getFrameLowering()->getStackAlignment();
+ unsigned Align = MinAlign(SPOffset, StackAlign);
+ Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+ !RealignOption,
+ Align, getFrameLowering()->getStackAlignment());
+ Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset,
+ /*Immutable*/ true,
+ /*isSS*/ true,
+ /*Alloca*/ nullptr));
+ return -++NumFixedObjects;
+}
BitVector
MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const {
@@ -849,11 +861,10 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
if (isa<StructType>(A->getType()) || isa<ArrayType>(A->getType()) ||
isa<StructType>(B->getType()) || isa<ArrayType>(B->getType()))
return false;
-
+
// For now, only support constants with the same size.
uint64_t StoreSize = TD->getTypeStoreSize(A->getType());
- if (StoreSize != TD->getTypeStoreSize(B->getType()) ||
- StoreSize > 128)
+ if (StoreSize != TD->getTypeStoreSize(B->getType()) || StoreSize > 128)
return false;
Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
@@ -882,7 +893,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
/// an existing one. User must specify the log2 of the minimum required
/// alignment for the object.
///
-unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
unsigned Alignment) {
assert(Alignment && "Alignment must be specified!");
if (Alignment > PoolAlignment) PoolAlignment = Alignment;
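
Alongside the clamping cleanups, MachineFrameInfo gains CreateFixedSpillStackObject; a hedged usage sketch (size and offset are illustrative):

    #include "llvm/CodeGen/MachineFrameInfo.h"

    // Reserve an immutable 8-byte spill slot at SP+16; like other fixed
    // objects, the returned frame index is negative.
    int reserveFixedSpillSlot(llvm::MachineFrameInfo &MFI) {
      return MFI.CreateFixedSpillStackObject(/*Size=*/8, /*SPOffset=*/16);
    }
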
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 23847d6..44191f7 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -333,6 +333,12 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
if (skipOptnoneFunction(*mf.getFunction()))
return false;
+ const TargetSubtargetInfo &ST =
+ mf.getTarget().getSubtarget<TargetSubtargetInfo>();
+ if (!ST.enablePostMachineScheduler()) {
+ DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
+ return false;
+ }
DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
// Initialize the context of the pass.
@@ -472,14 +478,13 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void ReadyQueue::dump() {
dbgs() << Name << ": ";
for (unsigned i = 0, e = Queue.size(); i < e; ++i)
dbgs() << Queue[i]->NodeNum << " ";
dbgs() << "\n";
}
-#endif
//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Basic machine instruction scheduling. This is
@@ -529,6 +534,11 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
llvm_unreachable(nullptr);
}
#endif
+ // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
+ SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
+
--SuccSU->NumPredsLeft;
if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
SchedImpl->releaseTopNode(SuccSU);
@@ -563,6 +573,11 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
llvm_unreachable(nullptr);
}
#endif
+ // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
+ PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
+
--PredSU->NumSuccsLeft;
if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
SchedImpl->releaseBottomNode(PredSU);
@@ -674,10 +689,13 @@ void ScheduleDAGMI::schedule() {
CurrentBottom = MI;
}
}
- updateQueues(SU, IsTopNode);
-
- // Notify the scheduling strategy after updating the DAG.
+ // Notify the scheduling strategy before updating the DAG.
+ // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
+ // runs, it can then use the accurate ReadyCycle time to determine whether
+ // newly released nodes can move to the readyQ.
SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
}
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
@@ -1568,7 +1586,7 @@ void SchedBoundary::reset() {
// Track the maximum number of stall cycles that could arise either from the
// latency of a DAG edge or the number of cycles that a processor resource is
// reserved (SchedBoundary::ReservedCycles).
- MaxObservedLatency = 0;
+ MaxObservedStall = 0;
#endif
// Reserve a zero-count for invalid CritResIdx.
ExecutedResCounts.resize(1);
@@ -1668,8 +1686,16 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
for (TargetSchedModel::ProcResIter
PI = SchedModel->getWriteProcResBegin(SC),
PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
- if (getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles) > CurrCycle)
+ unsigned NRCycle = getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles);
+ if (NRCycle > CurrCycle) {
+#ifndef NDEBUG
+ MaxObservedStall = std::max(PI->Cycles, MaxObservedStall);
+#endif
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
+ << SchedModel->getResourceName(PI->ProcResourceIdx)
+ << "=" << NRCycle << "c\n");
return true;
+ }
}
}
return false;
@@ -1725,6 +1751,16 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
}
void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+#ifndef NDEBUG
+ // ReadyCycle was bumped up to CurrCycle when this node was
+ // scheduled, but CurrCycle may have been eagerly advanced immediately after
+ // scheduling, so may now be greater than ReadyCycle.
+ if (ReadyCycle > CurrCycle)
+ MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
+#endif
+
if (ReadyCycle < MinReadyCycle)
MinReadyCycle = ReadyCycle;
@@ -1744,18 +1780,6 @@ void SchedBoundary::releaseTopNode(SUnit *SU) {
if (SU->isScheduled)
return;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isWeak())
- continue;
- unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
- unsigned Latency = I->getLatency();
-#ifndef NDEBUG
- MaxObservedLatency = std::max(Latency, MaxObservedLatency);
-#endif
- if (SU->TopReadyCycle < PredReadyCycle + Latency)
- SU->TopReadyCycle = PredReadyCycle + Latency;
- }
releaseNode(SU, SU->TopReadyCycle);
}
@@ -1763,20 +1787,6 @@ void SchedBoundary::releaseBottomNode(SUnit *SU) {
if (SU->isScheduled)
return;
- assert(SU->getInstr() && "Scheduled SUnit must have instr");
-
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isWeak())
- continue;
- unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
- unsigned Latency = I->getLatency();
-#ifndef NDEBUG
- MaxObservedLatency = std::max(Latency, MaxObservedLatency);
-#endif
- if (SU->BotReadyCycle < SuccReadyCycle + Latency)
- SU->BotReadyCycle = SuccReadyCycle + Latency;
- }
releaseNode(SU, SU->BotReadyCycle);
}
@@ -1943,10 +1953,12 @@ void SchedBoundary::bumpNode(SUnit *SU) {
PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
unsigned PIdx = PI->ProcResourceIdx;
if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
- ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle;
-#ifndef NDEBUG
- MaxObservedLatency = std::max(PI->Cycles, MaxObservedLatency);
-#endif
+ if (isTop()) {
+ ReservedCycles[PIdx] =
+ std::max(getNextResourceCycle(PIdx, 0), NextCycle + PI->Cycles);
+ }
+ else
+ ReservedCycles[PIdx] = NextCycle;
}
}
}
@@ -2049,8 +2061,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {
}
}
for (unsigned i = 0; Available.empty(); ++i) {
- assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) &&
- "permanent hazard"); (void)i;
+// FIXME: Re-enable assert once PR20057 is resolved.
+// assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
+// "permanent hazard");
+ (void)i;
bumpCycle(CurrCycle + 1);
releasePending();
}
@@ -2090,111 +2104,6 @@ void SchedBoundary::dumpScheduledState() {
// GenericScheduler - Generic implementation of MachineSchedStrategy.
//===----------------------------------------------------------------------===//
-namespace {
-/// Base class for GenericScheduler. This class maintains information about
-/// scheduling candidates based on TargetSchedModel making it easy to implement
-/// heuristics for either preRA or postRA scheduling.
-class GenericSchedulerBase : public MachineSchedStrategy {
-public:
- /// Represent the type of SchedCandidate found within a single queue.
- /// pickNodeBidirectional depends on these listed by decreasing priority.
- enum CandReason {
- NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
- ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
- TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
-
-#ifndef NDEBUG
- static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
-#endif
-
- /// Policy for scheduling the next instruction in the candidate's zone.
- struct CandPolicy {
- bool ReduceLatency;
- unsigned ReduceResIdx;
- unsigned DemandResIdx;
-
- CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
- };
-
- /// Status of an instruction's critical resource consumption.
- struct SchedResourceDelta {
- // Count critical resources in the scheduled region required by SU.
- unsigned CritResources;
-
- // Count critical resources from another region consumed by SU.
- unsigned DemandedResources;
-
- SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
-
- bool operator==(const SchedResourceDelta &RHS) const {
- return CritResources == RHS.CritResources
- && DemandedResources == RHS.DemandedResources;
- }
- bool operator!=(const SchedResourceDelta &RHS) const {
- return !operator==(RHS);
- }
- };
-
- /// Store the state used by GenericScheduler heuristics, required for the
- /// lifetime of one invocation of pickNode().
- struct SchedCandidate {
- CandPolicy Policy;
-
- // The best SUnit candidate.
- SUnit *SU;
-
- // The reason for this candidate.
- CandReason Reason;
-
- // Set of reasons that apply to multiple candidates.
- uint32_t RepeatReasonSet;
-
- // Register pressure values for the best candidate.
- RegPressureDelta RPDelta;
-
- // Critical resource consumption of the best candidate.
- SchedResourceDelta ResDelta;
-
- SchedCandidate(const CandPolicy &policy)
- : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {}
-
- bool isValid() const { return SU; }
-
- // Copy the status of another candidate without changing policy.
- void setBest(SchedCandidate &Best) {
- assert(Best.Reason != NoCand && "uninitialized Sched candidate");
- SU = Best.SU;
- Reason = Best.Reason;
- RPDelta = Best.RPDelta;
- ResDelta = Best.ResDelta;
- }
-
- bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
- void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
-
- void initResourceDelta(const ScheduleDAGMI *DAG,
- const TargetSchedModel *SchedModel);
- };
-
-protected:
- const MachineSchedContext *Context;
- const TargetSchedModel *SchedModel;
- const TargetRegisterInfo *TRI;
-
- SchedRemainder Rem;
-protected:
- GenericSchedulerBase(const MachineSchedContext *C):
- Context(C), SchedModel(nullptr), TRI(nullptr) {}
-
- void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone,
- SchedBoundary *OtherZone);
-
-#ifndef NDEBUG
- void traceCandidate(const SchedCandidate &Cand);
-#endif
-};
-} // namespace
-
void GenericSchedulerBase::SchedCandidate::
initResourceDelta(const ScheduleDAGMI *DAG,
const TargetSchedModel *SchedModel) {
@@ -2430,65 +2339,6 @@ static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
<< GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n');
}
-namespace {
-/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
-/// the schedule.
-class GenericScheduler : public GenericSchedulerBase {
- ScheduleDAGMILive *DAG;
-
- // State of the top and bottom scheduled instruction boundaries.
- SchedBoundary Top;
- SchedBoundary Bot;
-
- MachineSchedPolicy RegionPolicy;
-public:
- GenericScheduler(const MachineSchedContext *C):
- GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"),
- Bot(SchedBoundary::BotQID, "BotQ") {}
-
- void initPolicy(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) override;
-
- bool shouldTrackPressure() const override {
- return RegionPolicy.ShouldTrackPressure;
- }
-
- void initialize(ScheduleDAGMI *dag) override;
-
- SUnit *pickNode(bool &IsTopNode) override;
-
- void schedNode(SUnit *SU, bool IsTopNode) override;
-
- void releaseTopNode(SUnit *SU) override {
- Top.releaseTopNode(SU);
- }
-
- void releaseBottomNode(SUnit *SU) override {
- Bot.releaseBottomNode(SU);
- }
-
- void registerRoots() override;
-
-protected:
- void checkAcyclicLatency();
-
- void tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- RegPressureTracker &TempTracker);
-
- SUnit *pickNodeBidirectional(bool &IsTopNode);
-
- void pickNodeFromQueue(SchedBoundary &Zone,
- const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate);
-
- void reschedulePhysRegCopies(SUnit *SU, bool isTop);
-};
-} // namespace
-
void GenericScheduler::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() &&
"(PreRA)GenericScheduler needs vreg liveness");
@@ -3023,75 +2873,25 @@ GenericSchedRegistry("converge", "Standard converging scheduler.",
// PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
//===----------------------------------------------------------------------===//
-namespace {
-/// PostGenericScheduler - Interface to the scheduling algorithm used by
-/// ScheduleDAGMI.
-///
-/// Callbacks from ScheduleDAGMI:
-/// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ...
-class PostGenericScheduler : public GenericSchedulerBase {
- ScheduleDAGMI *DAG;
- SchedBoundary Top;
- SmallVector<SUnit*, 8> BotRoots;
-public:
- PostGenericScheduler(const MachineSchedContext *C):
- GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {}
-
- virtual ~PostGenericScheduler() {}
-
- void initPolicy(MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) override {
- /* no configurable policy */
- };
-
- /// PostRA scheduling does not track pressure.
- bool shouldTrackPressure() const override { return false; }
-
- void initialize(ScheduleDAGMI *Dag) override {
- DAG = Dag;
- SchedModel = DAG->getSchedModel();
- TRI = DAG->TRI;
-
- Rem.init(DAG, SchedModel);
- Top.init(DAG, SchedModel, &Rem);
- BotRoots.clear();
-
- // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
- // or are disabled, then these HazardRecs will be disabled.
- const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
- const TargetMachine &TM = DAG->MF.getTarget();
- if (!Top.HazardRec) {
- Top.HazardRec =
- TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
- }
- }
-
- void registerRoots() override;
-
- SUnit *pickNode(bool &IsTopNode) override;
-
- void scheduleTree(unsigned SubtreeID) override {
- llvm_unreachable("PostRA scheduler does not support subtree analysis.");
- }
-
- void schedNode(SUnit *SU, bool IsTopNode) override;
+void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
+ DAG = Dag;
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
- void releaseTopNode(SUnit *SU) override {
- Top.releaseTopNode(SU);
- }
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ BotRoots.clear();
- // Only called for roots.
- void releaseBottomNode(SUnit *SU) override {
- BotRoots.push_back(SU);
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
+ // or are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ const TargetMachine &TM = DAG->MF.getTarget();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
}
+}
-protected:
- void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
-
- void pickNodeFromQueue(SchedCandidate &Cand);
-};
-} // namespace
void PostGenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
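
The scheduler changes above move latency propagation from releaseTopNode/releaseBottomNode into releaseSucc/releasePred, so a node's ReadyCycle is already accurate by the time updateQueues runs. A simplified sketch of that update (stand-in types, not the real SUnit/SDep):

    struct SketchUnit { unsigned TopReadyCycle; };

    // On releasing a top-down edge, raise the successor's ready cycle to
    // "predecessor ready + edge latency"; the bottom-up case is symmetric.
    void releaseSuccSketch(const SketchUnit &SU, SketchUnit &SuccSU,
                           unsigned EdgeLatency) {
      if (SuccSU.TopReadyCycle < SU.TopReadyCycle + EdgeLatency)
        SuccSU.TopReadyCycle = SU.TopReadyCycle + EdgeLatency;
    }
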
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index b3f7198..249b2d0 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -30,11 +30,6 @@
using namespace llvm;
-namespace llvm {
-extern cl::opt<bool> EnableStackMapLiveness;
-extern cl::opt<bool> EnablePatchPointLiveness;
-}
-
static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
cl::desc("Disable Post Regalloc"));
static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
@@ -92,9 +87,9 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
// Temporary option to allow experimenting with MachineScheduler as a post-RA
// scheduler. Targets can "properly" enable this with
-// substitutePass(&PostRASchedulerID, &MachineSchedulerID); Ideally it wouldn't
-// be part of the standard pass pipeline, and the target would just add a PostRA
-// scheduling pass wherever it wants.
+// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); Ideally it
+// wouldn't be part of the standard pass pipeline, and the target would just add
+// a PostRA scheduling pass wherever it wants.
static cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)"));
@@ -421,7 +416,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
// FALLTHROUGH
case ExceptionHandling::DwarfCFI:
case ExceptionHandling::ARM:
- case ExceptionHandling::Win64:
+ case ExceptionHandling::WinEH:
addPass(createDwarfEHPass(TM));
break;
case ExceptionHandling::None:
@@ -566,8 +561,7 @@ void TargetPassConfig::addMachinePasses() {
if (addPreEmitPass())
printAndVerify("After PreEmit passes");
- if (EnableStackMapLiveness || EnablePatchPointLiveness)
- addPass(&StackMapLivenessID);
+ addPass(&StackMapLivenessID);
}
/// Add passes that optimize machine instructions in SSA form.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index eeee93a..716cb1f 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -91,6 +91,10 @@ static cl::opt<bool>
DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
cl::desc("Disable the peephole optimizer"));
+static cl::opt<bool>
+DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(true),
+ cl::desc("Disable advanced copy optimization"));
+
STATISTIC(NumReuse, "Number of extension results reused");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -137,6 +141,105 @@ namespace {
bool isLoadFoldable(MachineInstr *MI,
SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
};
+
+ /// \brief Helper class to track the possible sources of a value defined by
+ /// a (chain of) copy related instructions.
+ /// Given a definition (instruction and definition index), this class
+ /// follows the use-def chain to find successive suitable sources.
+ /// Each source found can be used to rewrite the definition into
+ /// def = COPY src.
+ ///
+ /// For instance, let us consider the following snippet:
+ /// v0 =
+ /// v2 = INSERT_SUBREG v1, v0, sub0
+ /// def = COPY v2.sub0
+ ///
+ /// Using a ValueTracker for def = COPY v2.sub0 will give the following
+ /// suitable sources:
+ /// v2.sub0 and v0.
+ /// Then, def can be rewritten into def = COPY v0.
+ class ValueTracker {
+ private:
+ /// The current point into the use-def chain.
+ const MachineInstr *Def;
+ /// The index of the definition in Def.
+ unsigned DefIdx;
+ /// The sub register index of the definition.
+ unsigned DefSubReg;
+ /// The register where the value can be found.
+ unsigned Reg;
+ /// Specify whether or not the value tracking looks through
+ /// complex instructions. When this is false, the value tracker
+ /// bails on everything that is not a copy or a bitcast.
+ ///
+ /// Note: This could have been implemented as a specialized version of
+ /// the ValueTracker class but that would have complicated the code of
+ /// the users of this class.
+ bool UseAdvancedTracking;
+ /// Optional MachineRegisterInfo used to perform some complex
+ /// tracking.
+ const MachineRegisterInfo *MRI;
+
+ /// \brief Dispatcher to the right underlying implementation of
+ /// getNextSource.
+ bool getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for Copy instructions.
+ bool getNextSourceFromCopy(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for Bitcast instructions.
+ bool getNextSourceFromBitcast(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for RegSequence
+ /// instructions.
+ bool getNextSourceFromRegSequence(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for InsertSubreg
+ /// instructions.
+ bool getNextSourceFromInsertSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for ExtractSubreg
+ /// instructions.
+ bool getNextSourceFromExtractSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+ /// \brief Specialized version of getNextSource for SubregToReg
+ /// instructions.
+ bool getNextSourceFromSubregToReg(unsigned &SrcIdx, unsigned &SrcSubReg);
+
+ public:
+ /// \brief Create a ValueTracker instance for the value defined by \p MI
+ /// at the operand index \p DefIdx.
+ /// \p DefSubReg represents the sub register index the value tracker will
+ /// track. It does not need to match the sub register index used in \p MI.
+ /// \p UseAdvancedTracking specifies whether or not the value tracker looks
+ /// through complex instructions. By default (false), it handles only copy
+ /// and bitcast instructions.
+ /// \p MRI is useful to perform some complex checks.
+ ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
+ bool UseAdvancedTracking = false,
+ const MachineRegisterInfo *MRI = nullptr)
+ : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg),
+ UseAdvancedTracking(UseAdvancedTracking), MRI(MRI) {
+ assert(Def->getOperand(DefIdx).isDef() &&
+ Def->getOperand(DefIdx).isReg() &&
+ "Definition does not match machine instruction");
+ // Initially the value is in the defined register.
+ Reg = Def->getOperand(DefIdx).getReg();
+ }
+
+ /// \brief Following the use-def chain, get the next available source
+ /// for the tracked value.
+ /// When the returned value is not nullptr, getReg() gives the register
+ /// that contains the tracked value.
+ /// \note The sub register index returned in \p SrcSubReg must be applied
+ /// to the register returned by getReg() to access the actual value.
+ /// \return Unless the returned value is nullptr (i.e., no source found),
+ /// \p SrcIdx gives the index of the next source in the returned
+ /// instruction and \p SrcSubReg the index to be used on that source to
+ /// get the tracked value. When nullptr is returned, no alternative source
+ /// has been found.
+ const MachineInstr *getNextSource(unsigned &SrcIdx, unsigned &SrcSubReg);
+
+ /// \brief Get the last register where the initial value can be found.
+ /// Initially this is the register of the definition.
+ /// Then, after each successful call to getNextSource, this is the
+ /// register of the last source.
+ unsigned getReg() const { return Reg; }
+ };
}
char PeepholeOptimizer::ID = 0;
@@ -443,31 +546,32 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
unsigned Src;
unsigned SrcSubReg;
bool ShouldRewrite = false;
- MachineInstr *Copy = MI;
const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
- // Follow the chain of copies until we reach the top or find a
- // more suitable source.
+ // Follow the chain of copies until we reach the top of the use-def chain
+ // or find a more suitable source.
+ ValueTracker ValTracker(*MI, DefIdx, DefSubReg, !DisableAdvCopyOpt, MRI);
do {
- unsigned CopyDefIdx, CopySrcIdx;
- if (!getCopyOrBitcastDefUseIdx(*Copy, CopyDefIdx, CopySrcIdx))
+ unsigned CopySrcIdx, CopySrcSubReg;
+ if (!ValTracker.getNextSource(CopySrcIdx, CopySrcSubReg))
break;
- const MachineOperand &MO = Copy->getOperand(CopySrcIdx);
- assert(MO.isReg() && "Copies must be between registers.");
- Src = MO.getReg();
-
+ Src = ValTracker.getReg();
+ SrcSubReg = CopySrcSubReg;
+
+ // Do not extend the live-ranges of physical registers as they add
+ // constraints to the register allocator.
+ // Moreover, to extend the live-range of a physical register, unlike an
+ // SSA virtual register, we would have to check that it is not redefined
+ // before the related use.
if (TargetRegisterInfo::isPhysicalRegister(Src))
break;
const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
- SrcSubReg = MO.getSubReg();
// If this source does not incur a cross register bank copy, use it.
ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC,
SrcSubReg);
- // Follow the chain of copies: get the definition of Src.
- Copy = MRI->getVRegDef(Src);
- } while (!ShouldRewrite && Copy && (Copy->isCopy() || Copy->isBitcast()));
+ } while (!ShouldRewrite);
// If we did not find a more suitable source, there is nothing to optimize.
if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg())
@@ -483,6 +587,9 @@ bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
MRI->replaceRegWith(Def, NewVR);
MRI->clearKillFlags(NewVR);
+ // We extended the lifetime of Src.
+ // Clear the kill flags to account for that.
+ MRI->clearKillFlags(Src);
MI->eraseFromParent();
++NumCopiesBitcasts;
return true;
@@ -673,3 +780,251 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+
+bool ValueTracker::getNextSourceFromCopy(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isCopy() && "Invalid definition");
+ // Copy instructions are supposed to be: Def = Src.
+ // If someone breaks this assumption, bad things will happen everywhere.
+ assert(Def->getDesc().getNumOperands() == 2 && "Invalid number of operands");
+
+ if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+ // If we look for a different subreg, it means we want a subreg of src.
+ // Bail as we do not support composing subregs yet.
+ return false;
+ // Otherwise, we want the whole source.
+ SrcIdx = 1;
+ SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+ return true;
+}
+
+bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isBitcast() && "Invalid definition");
+
+ // Bail if there are effects that a plain copy will not expose.
+ if (Def->hasUnmodeledSideEffects())
+ return false;
+
+ // Bitcasts with more than one def are not supported.
+ if (Def->getDesc().getNumDefs() != 1)
+ return false;
+ if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+ // If we look for a different subreg, it means we want a subreg of the src.
+ // Bail as we do not support composing subregs yet.
+ return false;
+
+ SrcIdx = Def->getDesc().getNumOperands();
+ for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = Def->getOperand(OpIdx);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ assert(!MO.isDef() && "We should have skipped all the definitions by now");
+ if (SrcIdx != EndOpIdx)
+ // Multiple sources?
+ return false;
+ SrcIdx = OpIdx;
+ }
+ SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+ return true;
+}
+
+bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isRegSequence() && "Invalid definition");
+
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subregs, bail out.
+ // The case we are checking is Def.<subreg> = REG_SEQUENCE.
+ // This should almost never happen as the SSA property is tracked at
+ // the register level (as opposed to the subreg level).
+ // I.e.,
+ // Def.sub0 =
+ // Def.sub1 =
+ // is a valid SSA representation for Def.sub0 and Def.sub1, but not for
+ // Def. Thus, it must not be generated.
+ // However, some code could theoretically generate a single
+ // Def.sub0 (i.e., not defining the other subregs) and we would
+ // have this case.
+ // If we can ascertain (or force) that this never happens, we could
+ // turn that into an assertion.
+ return false;
+
+ // We are looking at:
+ // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
+ // Check if one of the operands defines the subreg we are interested in.
+ for (unsigned OpIdx = DefIdx + 1, EndOpIdx = Def->getNumOperands();
+ OpIdx != EndOpIdx; OpIdx += 2) {
+ const MachineOperand &MOSubIdx = Def->getOperand(OpIdx + 1);
+ assert(MOSubIdx.isImm() &&
+ "One of the subindex of the reg_sequence is not an immediate");
+ if (MOSubIdx.getImm() == DefSubReg) {
+ assert(Def->getOperand(OpIdx).isReg() &&
+ "One of the source of the reg_sequence is not a register");
+ SrcIdx = OpIdx;
+ SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+ return true;
+ }
+ }
+
+ // If the subreg we are tracking is super-defined by another subreg,
+ // we could follow this value. However, this would require composing
+ // the subregs, and we do not do that for now.
+ return false;
+}
+
+bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isInsertSubreg() && "Invalid definition");
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subregs, bail out.
+ // Same remark as getNextSourceFromRegSequence.
+ // I.e., this may be turned into an assert.
+ return false;
+
+ // We are looking at:
+ // Def = INSERT_SUBREG v0, v1, sub1
+ // There are two cases:
+ // 1. DefSubReg == sub1, get v1.
+ // 2. DefSubReg != sub1, the value may be available through v0.
+
+ // #1 Check if the inserted register matches the required subreg index.
+ unsigned InsertedSubReg = Def->getOperand(3).getImm();
+ if (InsertedSubReg == DefSubReg) {
+ SrcIdx = 2;
+ SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+ return true;
+ }
+ // #2 Otherwise, if the subregister we are looking for is not partially
+ // defined by the inserted element, we can look through the main
+ // register (v0).
+ // To check for overlap we need an MRI and a TRI.
+ if (!MRI)
+ return false;
+
+ const MachineOperand &MODef = Def->getOperand(DefIdx);
+ const MachineOperand &MOBase = Def->getOperand(1);
+ // If the result register (Def) and the base register (v0) do not
+ // have the same register class or if we have to compose
+ // subregisters, bail out.
+ if (MRI->getRegClass(MODef.getReg()) != MRI->getRegClass(MOBase.getReg()) ||
+ MOBase.getSubReg())
+ return false;
+
+ // Get the TRI and check if the inserted subregister overlaps with the
+ // subregister we are tracking.
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ if (!TRI ||
+ (TRI->getSubRegIndexLaneMask(DefSubReg) &
+ TRI->getSubRegIndexLaneMask(InsertedSubReg)) != 0)
+ return false;
+ // At this point, the value is available in v0 via the same subreg
+ // we used for Def.
+ SrcIdx = 1;
+ SrcSubReg = DefSubReg;
+ return true;
+}
+
+bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isExtractSubreg() && "Invalid definition");
+ // We are looking at:
+ // Def = EXTRACT_SUBREG v0, sub0
+
+ // Bail if we have to compose subregisters.
+ // Indeed, if DefSubReg != 0, we would have to compose it with sub0.
+ if (DefSubReg)
+ return false;
+
+ // Bail if we have to compose subregisters.
+ // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0.
+ if (Def->getOperand(1).getSubReg())
+ return false;
+ // Otherwise, the value is available in v0.sub0.
+ SrcIdx = 1;
+ SrcSubReg = Def->getOperand(2).getImm();
+ return true;
+}
+
+bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ assert(Def->isSubregToReg() && "Invalid definition");
+ // We are looking at:
+ // Def = SUBREG_TO_REG Imm, v0, sub0
+
+ // Bail if we have to compose subregisters.
+ // If DefSubReg != sub0, we would have to check that all the bits
+ // we track are included in sub0 and if yes, we would have to
+ // determine the right subreg in v0.
+ if (DefSubReg != Def->getOperand(3).getImm())
+ return false;
+ // Bail if we have to compose subregisters.
+ // Likewise, if v0.subreg != 0, we would have to compose it with sub0.
+ if (Def->getOperand(2).getSubReg())
+ return false;
+
+ SrcIdx = 2;
+ SrcSubReg = Def->getOperand(3).getImm();
+ return true;
+}
+
+bool ValueTracker::getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg) {
+ assert(Def && "This method needs a valid definition");
+
+ assert(
+ (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) &&
+ Def->getOperand(DefIdx).isDef() && "Invalid DefIdx");
+ if (Def->isCopy())
+ return getNextSourceFromCopy(SrcIdx, SrcSubReg);
+ if (Def->isBitcast())
+ return getNextSourceFromBitcast(SrcIdx, SrcSubReg);
+ // All the remaining cases involve "complex" instructions.
+ // Bail if advanced tracking was not requested.
+ if (!UseAdvancedTracking)
+ return false;
+ if (Def->isRegSequence())
+ return getNextSourceFromRegSequence(SrcIdx, SrcSubReg);
+ if (Def->isInsertSubreg())
+ return getNextSourceFromInsertSubreg(SrcIdx, SrcSubReg);
+ if (Def->isExtractSubreg())
+ return getNextSourceFromExtractSubreg(SrcIdx, SrcSubReg);
+ if (Def->isSubregToReg())
+ return getNextSourceFromSubregToReg(SrcIdx, SrcSubReg);
+ return false;
+}
+
+const MachineInstr *ValueTracker::getNextSource(unsigned &SrcIdx,
+ unsigned &SrcSubReg) {
+ // If we reach a point where we cannot move up in the use-def chain,
+ // there is nothing we can get.
+ if (!Def)
+ return nullptr;
+
+ const MachineInstr *PrevDef = nullptr;
+ // Try to find the next source.
+ if (getNextSourceImpl(SrcIdx, SrcSubReg)) {
+ // Update definition, definition index, and subregister for the
+ // next call of getNextSource.
+ const MachineOperand &MO = Def->getOperand(SrcIdx);
+ assert(MO.isReg() && !MO.isDef() && "Source is invalid");
+ // Update the current register.
+ Reg = MO.getReg();
+ // Update the return value before moving up in the use-def chain.
+ PrevDef = Def;
+ // If we can still move up in the use-def chain, move to the next
+ // definition.
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Def = MRI->getVRegDef(Reg);
+ DefIdx = MRI->def_begin(Reg).getOperandNo();
+ DefSubReg = SrcSubReg;
+ return PrevDef;
+ }
+ }
+ // If we end up here, this means we will not be able to find another source
+ // for the next iteration.
+ // Make sure any new call to getNextSource bails out early by cutting the
+ // use-def chain.
+ Def = nullptr;
+ return PrevDef;
+}
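The lane-mask test in getNextSourceFromInsertSubreg above deserves a concrete instance. With made-up masks (real values come from the target's subregister descriptions), the check reads:

    // Suppose DefSubReg covers lanes 0x3 and the inserted subreg lanes 0xC.
    unsigned TrackedLanes = 0x3;  // TRI->getSubRegIndexLaneMask(DefSubReg)
    unsigned InsertedLanes = 0xC; // TRI->getSubRegIndexLaneMask(InsertedSubReg)
    bool Overlap = (TrackedLanes & InsertedLanes) != 0;
    // No overlap here, so the tracked value is untouched by the insert and
    // can be looked for in the base register (v0) with the same subreg index.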
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index c74a42f..b98d210 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -160,7 +160,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
replaceFrameIndices(Fn);
// If register scavenging is needed, as we've enabled doing it as a
- // post-pass, scavenge the virtual registers that frame index elimiation
+ // post-pass, scavenge the virtual registers that frame index elimination
// inserted.
if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
scavengeFrameVirtualRegs(Fn);
@@ -268,51 +268,56 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
}
}
- if (CSI.empty())
- return; // Early exit if no callee saved registers are modified!
+ if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
+ // If target doesn't implement this, use generic code.
- unsigned NumFixedSpillSlots;
- const TargetFrameLowering::SpillSlot *FixedSpillSlots =
- TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+ if (CSI.empty())
+ return; // Early exit if no callee saved registers are modified!
- // Now that we know which registers need to be saved and restored, allocate
- // stack slots for them.
- for (std::vector<CalleeSavedInfo>::iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I) {
- unsigned Reg = I->getReg();
- const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+ unsigned NumFixedSpillSlots;
+ const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+ TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
- int FrameIdx;
- if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
- I->setFrameIdx(FrameIdx);
- continue;
- }
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
+ I != E; ++I) {
+ unsigned Reg = I->getReg();
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
- // Check to see if this physreg must be spilled to a particular stack slot
- // on this target.
- const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
- while (FixedSlot != FixedSpillSlots+NumFixedSpillSlots &&
- FixedSlot->Reg != Reg)
- ++FixedSlot;
-
- if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
- // Nope, just spill it anywhere convenient.
- unsigned Align = RC->getAlignment();
- unsigned StackAlign = TFI->getStackAlignment();
-
- // We may not be able to satisfy the desired alignment specification of
- // the TargetRegisterClass if the stack alignment is smaller. Use the
- // min.
- Align = std::min(Align, StackAlign);
- FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
- if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
- if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
- } else {
- // Spill it to the stack where we must.
- FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true);
- }
+ int FrameIdx;
+ if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+ I->setFrameIdx(FrameIdx);
+ continue;
+ }
+
+ // Check to see if this physreg must be spilled to a particular stack slot
+ // on this target.
+ const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+ while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+ FixedSlot->Reg != Reg)
+ ++FixedSlot;
+
+ if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+ // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
+
+ // We may not be able to satisfy the desired alignment specification of
+ // the TargetRegisterClass if the stack alignment is smaller. Use the
+ // min.
+ Align = std::min(Align, StackAlign);
+ FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ } else {
+ // Spill it to the stack where we must.
+ FrameIdx =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
+ }
- I->setFrameIdx(FrameIdx);
+ I->setFrameIdx(FrameIdx);
+ }
}
MFI->setCalleeSavedInfo(CSI);
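The alignment clamp in the generic path is easiest to see with numbers (values chosen for illustration, not from any particular target):

    // A register class asking for 32-byte spill slots on a target whose
    // stack is only 16-byte aligned gets the weaker alignment.
    unsigned RCAlign = 32, StackAlign = 16;
    unsigned Align = std::min(RCAlign, StackAlign); // 16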
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index aa7c178..901b993 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -44,6 +44,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <queue>
using namespace llvm;
@@ -79,6 +80,12 @@ ExhaustiveSearch("exhaustive-register-search", cl::NotHidden,
cl::desc("Exhaustive Search for registers bypassing the depth "
"and interference cutoffs of last chance recoloring"));
+static cl::opt<bool> EnableLocalReassignment(
+ "enable-local-reassign", cl::Hidden,
+ cl::desc("Local reassignment can yield better allocation decisions, but "
+ "may be compile time intensive"),
+ cl::init(false));
+
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned>
CSRFirstTimeCost("regalloc-csr-first-time-cost",
@@ -285,6 +292,10 @@ class RAGreedy : public MachineFunctionPass,
/// Callee-save register cost, calculated once per machine function.
BlockFrequency CSRCost;
+ /// Whether to run the local reassignment heuristic. This information is
+ /// obtained from the TargetSubtargetInfo.
+ bool EnableLocalReassign;
+
public:
RAGreedy();
@@ -731,7 +742,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
// Evicting another local live range in this case could lead to suboptimal
// coloring.
if (!MaxCost.isMax() && IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
- !canReassign(*Intf, PhysReg)) {
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg))) {
return false;
}
}
@@ -2308,9 +2319,14 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
<< "********** Function: " << mf.getName() << '\n');
MF = &mf;
- TRI = MF->getTarget().getRegisterInfo();
- TII = MF->getTarget().getInstrInfo();
+ const TargetMachine &TM = MF->getTarget();
+ TRI = TM.getRegisterInfo();
+ TII = TM.getInstrInfo();
RCI.runOnMachineFunction(mf);
+
+ EnableLocalReassign = EnableLocalReassignment ||
+ TM.getSubtargetImpl()->enableRALocalReassignment(TM.getOptLevel());
+
if (VerifyEnabled)
MF->verify(this, "Before greedy register allocator");
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index b2909e0..617e459 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -41,7 +41,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
}
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
const TargetRegisterInfo *TRI) {
bool Empty = true;
@@ -55,6 +55,7 @@ void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
dbgs() << "\n";
}
+LLVM_DUMP_METHOD
void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
dbgs() << "Max Pressure: ";
dumpRegSetPressure(MaxSetPressure, TRI);
@@ -68,6 +69,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
dbgs() << '\n';
}
+LLVM_DUMP_METHOD
void RegPressureTracker::dump() const {
if (!isTopClosed() || !isBottomClosed()) {
dbgs() << "Curr Pressure: ";
@@ -75,7 +77,6 @@ void RegPressureTracker::dump() const {
}
P.dump(TRI);
}
-#endif
/// Increase the current pressure as impacted by these registers and bump
/// the high water mark if needed.
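LLVM_DUMP_METHOD (from llvm/Support/Compiler.h) replaces the #if !defined(NDEBUG) guards here: the dump routines are now compiled unconditionally, and the macro keeps them from being inlined or stripped in asserts builds so they stay callable from a debugger. A rough sketch of the idea (not the actual definition):

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    #define DUMP_METHOD __attribute__((noinline)) __attribute__((used))
    #else
    #define DUMP_METHOD
    #endif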
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 92a9a30..0f8b21c 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1508,7 +1508,7 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
}
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void ILPValue::print(raw_ostream &OS) const {
OS << InstrCount << " / " << Length << " = ";
if (!Length)
@@ -1517,16 +1517,17 @@ void ILPValue::print(raw_ostream &OS) const {
OS << format("%g", ((double)InstrCount / Length));
}
+LLVM_DUMP_METHOD
void ILPValue::dump() const {
dbgs() << *this << '\n';
}
namespace llvm {
+LLVM_DUMP_METHOD
raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
Val.print(OS);
return OS;
}
} // namespace llvm
-#endif // !NDEBUG || LLVM_ENABLE_DUMP
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2d2fd53..7c42e4d 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -167,9 +167,18 @@ namespace {
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
- SDValue SplitIndexingFromLoad(LoadSDNode *LD);
bool SliceUpLoad(SDNode *N);
+ /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
+ /// load.
+ ///
+ /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
+ /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
+ /// \param EltNo index of the vector element to load.
+ /// \param OriginalLoad the load that EVE came from, to be replaced.
+ /// \returns EVE on success, SDValue() on failure.
+ SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
+ SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad);
void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
@@ -646,10 +655,14 @@ static ConstantSDNode *isConstOrConstSplat(SDValue N) {
return CN;
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
- ConstantSDNode *CN = BV->getConstantSplatValue();
+ BitVector UndefElements;
+ ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements);
// BuildVectors can truncate their operands. Ignore that case here.
- if (CN && CN->getValueType(0) == N.getValueType().getScalarType())
+ // FIXME: We blindly reject splats that include undef elements, which is
+ // overly pessimistic.
+ if (CN && UndefElements.none() &&
+ CN->getValueType(0) == N.getValueType().getScalarType())
return CN;
}
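A concrete case for the FIXME above, reusing the names from the surrounding code:

    // For a build_vector <i32 42, i32 undef, i32 42, i32 42>,
    // getConstantSplatNode() returns the 42 node but sets bit 1 of
    // UndefElements, so the combine conservatively rejects the splat.
    BitVector UndefElements;
    ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements);
    bool IsUsableSplat = CN && UndefElements.none(); // false here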
@@ -762,14 +775,10 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
// If the operands of this node are only used by the node, they will now
// be dead. Make sure to visit them first to delete dead nodes early.
- for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) {
- SDNode *Op = TLO.Old.getNode()->getOperand(i).getNode();
- // For an operand generating multiple values, one of the values may
- // become dead allowing further simplification (e.g. split index
- // arithmetic from an indexed load).
- if (Op->hasOneUse() || Op->getNumValues() > 1)
- AddToWorkList(Op);
- }
+ for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i)
+ if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse())
+ AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode());
+
DAG.DeleteNode(TLO.Old.getNode());
}
}
@@ -1320,9 +1329,16 @@ SDValue DAGCombiner::combine(SDNode *N) {
// Constant operands are canonicalized to RHS.
if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
- SDValue Ops[] = { N1, N0 };
- SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(),
- Ops);
+ SDValue Ops[] = {N1, N0};
+ SDNode *CSENode;
+ if (const BinaryWithFlagsSDNode *BinNode =
+ dyn_cast<BinaryWithFlagsSDNode>(N)) {
+ CSENode = DAG.getNodeIfExists(
+ N->getOpcode(), N->getVTList(), Ops, BinNode->hasNoUnsignedWrap(),
+ BinNode->hasNoSignedWrap(), BinNode->isExact());
+ } else {
+ CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops);
+ }
if (CSENode)
return SDValue(CSENode, 0);
}
@@ -3942,14 +3958,14 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// If setcc produces all-one true value then:
// (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
if (N1CV && N1CV->isConstant()) {
- if (N0.getOpcode() == ISD::AND &&
- TLI.getBooleanContents(true) ==
- TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ if (N0.getOpcode() == ISD::AND) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
- if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC) {
+ if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
+ TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV);
if (C.getNode())
return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
@@ -4508,11 +4524,20 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
// fold (select C, 0, 1) -> (xor C, 1)
+ // We can't do this reliably if integer-based booleans have different contents
+ // from floating-point-based booleans. This is because we can't tell whether we
+ // have an integer-based boolean or a floating-point-based boolean unless we
+ // can find the SETCC that produced it and inspect its operands. This is
+ // fairly easy if C is the SETCC node, but it can potentially be
+ // undiscoverable (or not reasonably discoverable). For example, it could be
+ // in another basic block or it could require searching a complicated
+ // expression.
if (VT.isInteger() &&
- (VT0 == MVT::i1 ||
- (VT0.isInteger() &&
- TLI.getBooleanContents(false) ==
- TargetLowering::ZeroOrOneBooleanContent)) &&
+ (VT0 == MVT::i1 || (VT0.isInteger() &&
+ TLI.getBooleanContents(false, false) ==
+ TLI.getBooleanContents(false, true) &&
+ TLI.getBooleanContents(false, false) ==
+ TargetLowering::ZeroOrOneBooleanContent)) &&
N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
SDValue XORNode;
if (VT == VT0)
@@ -4555,12 +4580,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
// fold selects based on a setcc into other things, such as min/max/abs
if (N0.getOpcode() == ISD::SETCC) {
- // FIXME:
- // Check against MVT::Other for SELECT_CC, which is a workaround for targets
- // having to say they don't support SELECT_CC on every type the DAG knows
- // about, since there is no way to mark an opcode illegal at all value types
- if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other) &&
- TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))
+ if ((!LegalOperations &&
+ TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
+ TLI.isOperationLegal(ISD::SELECT_CC, VT))
return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
N0.getOperand(0), N0.getOperand(1),
N1, N2, N0.getOperand(2));
@@ -4587,6 +4609,56 @@ std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+// This function assumes all the vselect's arguments are CONCAT_VECTORS
+// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
+static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ MVT VT = N->getSimpleValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ RHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR);
+
+ // We're sure we have an even number of elements due to the
+ // concat_vectors we have as arguments to vselect.
+ // Skip BV elements until we find one that's not an UNDEF.
+ // After we find a non-UNDEF element, keep looping until we get to half the
+ // length of the BV and check that all the non-undef nodes are the same.
+ ConstantSDNode *BottomHalf = nullptr;
+ for (int i = 0; i < NumElems / 2; ++i) {
+ if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF)
+ continue;
+
+ if (BottomHalf == nullptr)
+ BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != BottomHalf)
+ return SDValue();
+ }
+
+ // Do the same for the second half of the BuildVector
+ ConstantSDNode *TopHalf = nullptr;
+ for (int i = NumElems / 2; i < NumElems; ++i) {
+ if (Cond->getOperand(i)->getOpcode() == ISD::UNDEF)
+ continue;
+
+ if (TopHalf == nullptr)
+ TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != TopHalf)
+ return SDValue();
+ }
+
+ assert(TopHalf && BottomHalf &&
+ "One half of the selector was all UNDEFs and the other was all the "
+ "same value. This should have been addressed before this function.");
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
+ TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
+}
+
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
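A worked instance of the fold ConvertSelectToConcatVector implements (condition values illustrative; dl, VT, LHS and RHS as in the function above):

    // Condition build_vector, NumElems = 8 (u = undef):
    //   <0, 0, u, 0, -1, -1, u, -1>
    // The bottom half is uniformly false and the top half uniformly true, so
    // the vselect collapses to a concat of the matching halves:
    SDValue Folded = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                                 RHS->getOperand(0),  // false -> RHS half
                                 LHS->getOperand(1)); // true  -> LHS half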
@@ -4659,6 +4731,17 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N2;
+ // The ConvertSelectToConcatVector function is assuming both the above
+ // checks for (vselect (build_vector all{ones,zeros}) ...) have been made
+ // and addressed.
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N2.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ SDValue CV = ConvertSelectToConcatVector(N, DAG);
+ if (CV.getNode())
+ return CV;
+ }
+
return SDValue();
}
@@ -5003,12 +5086,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
}
if (N0.getOpcode() == ISD::SETCC) {
+ EVT N0VT = N0.getOperand(0).getValueType();
// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations &&
- TLI.getBooleanContents(true) ==
- TargetLowering::ZeroOrNegativeOneBooleanContent) {
- EVT N0VT = N0.getOperand(0).getValueType();
+ TLI.getBooleanContents(N0VT) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// of the same size as the compared operands. Only optimize sext(setcc())
// if this is the case.
@@ -6140,6 +6223,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile() &&
+ // Do not remove the cast if the types differ in endian layout.
+ TLI.hasBigEndianPartOrdering(N0.getValueType()) ==
+ TLI.hasBigEndianPartOrdering(VT) &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -6955,11 +7041,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
}
// The next optimizations are desirable only if SELECT_CC can be lowered.
- // Check against MVT::Other for SELECT_CC, which is a workaround for targets
- // having to say they don't support SELECT_CC on every type the DAG knows
- // about, since there is no way to mark an opcode illegal at all value types
- // (See also visitSELECT)
- if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
// fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
!VT.isVector() &&
@@ -7012,11 +7094,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
}
// The next optimizations are desirable only if SELECT_CC can be lowered.
- // Check against MVT::Other for SELECT_CC, which is a workaround for targets
- // having to say they don't support SELECT_CC on every type the DAG knows
- // about, since there is no way to mark an opcode illegal at all value types
- // (See also visitSELECT)
- if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
// fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
@@ -7849,17 +7927,6 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
return false;
}
-/// \brief Return the base-pointer arithmetic from an indexed \p LD.
-SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
- ISD::MemIndexedMode AM = LD->getAddressingMode();
- assert(AM != ISD::UNINDEXED);
- SDValue BP = LD->getOperand(1);
- SDValue Inc = LD->getOperand(2);
- unsigned Opc =
- (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
- return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
-}
-
SDValue DAGCombiner::visitLOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
SDValue Chain = LD->getChain();
@@ -7896,16 +7963,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
} else {
// Indexed loads.
assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
- if (!N->hasAnyUseOfValue(0)) {
+ if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) {
SDValue Undef = DAG.getUNDEF(N->getValueType(0));
- SDValue Index;
- if (N->hasAnyUseOfValue(1)) {
- Index = SplitIndexingFromLoad(LD);
- // Try to fold the base pointer arithmetic into subsequent loads and
- // stores.
- AddUsersToWorkList(N);
- } else
- Index = DAG.getUNDEF(N->getValueType(1));
DEBUG(dbgs() << "\nReplacing.7 ";
N->dump(&DAG);
dbgs() << "\nWith: ";
@@ -7913,7 +7972,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
dbgs() << " and 2 other values\n");
WorkListRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
+ DAG.getUNDEF(N->getValueType(1)));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
removeFromWorkList(N);
DAG.DeleteNode(N);
@@ -9666,6 +9726,27 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
return SDValue();
unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ // Canonicalize insert_vector_elt dag nodes.
+ // Example:
+ // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
+ // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
+ //
+ // Do this only if the child insert_vector node has one use; also
+ // do this only if both indices are constants and Idx1 < Idx0.
+ if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
+ && isa<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned OtherElt =
+ cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+ if (Elt < OtherElt) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT,
+ InVec.getOperand(0), InVal, EltNo);
+ AddToWorkList(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
+ VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
+ }
+ }
+
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
// vector elements.
@@ -9698,6 +9779,86 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
+SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
+ SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
+ EVT ResultVT = EVE->getValueType(0);
+ EVT VecEltVT = InVecVT.getVectorElementType();
+ unsigned Align = OriginalLoad->getAlignment();
+ unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
+ VecEltVT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
+ return SDValue();
+
+ Align = NewAlign;
+
+ SDValue NewPtr = OriginalLoad->getBasePtr();
+ SDValue Offset;
+ EVT PtrType = NewPtr.getValueType();
+ MachinePointerInfo MPI;
+ if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
+ int Elt = ConstEltNo->getZExtValue();
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ if (TLI.isBigEndian())
+ PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff;
+ Offset = DAG.getConstant(PtrOff, PtrType);
+ MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ } else {
+ Offset = DAG.getNode(
+ ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo,
+ DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType()));
+ if (TLI.isBigEndian())
+ Offset = DAG.getNode(
+ ISD::SUB, SDLoc(EVE), EltNo.getValueType(),
+ DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset);
+ MPI = OriginalLoad->getPointerInfo();
+ }
+ NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset);
+
+ // The replacement we need to do here is a little tricky: we need to
+ // replace an extractelement of a load with a load.
+ // Use ReplaceAllUsesOfValuesWith to do the replacement.
+ // Note that this replacement assumes that the extractelement is the only
+ // use of the load; that's okay because we don't want to perform this
+ // transformation in other cases anyway.
+ SDValue Load;
+ SDValue Chain;
+ if (ResultVT.bitsGT(VecEltVT)) {
+ // If the result type of vextract is wider than the load, then issue an
+ // extending load instead.
+ ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT)
+ ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD;
+ Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(),
+ NewPtr, MPI, VecEltVT, OriginalLoad->isVolatile(),
+ OriginalLoad->isNonTemporal(), Align,
+ OriginalLoad->getTBAAInfo());
+ Chain = Load.getValue(1);
+ } else {
+ Load = DAG.getLoad(
+ VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI,
+ OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
+ OriginalLoad->isInvariant(), Align, OriginalLoad->getTBAAInfo());
+ Chain = Load.getValue(1);
+ if (ResultVT.bitsLT(VecEltVT))
+ Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
+ else
+ Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load);
+ }
+ WorkListRemover DeadNodes(*this);
+ SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
+ SDValue To[] = { Load, Chain };
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ // Since we're explicitly calling ReplaceAllUses, add the new node to the
+ // worklist explicitly as well.
+ AddToWorkList(Load.getNode());
+ AddUsersToWorkList(Load.getNode()); // Add users too
+ // Make sure to revisit this node to clean it up; it will usually be dead.
+ AddToWorkList(EVE);
+ ++OpsNarrowed;
+ return SDValue(EVE, 0);
+}
+
SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// (vextract (scalar_to_vector val, 0) -> val
SDValue InVec = N->getOperand(0);
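The constant-index path of ReplaceExtractVectorEltOfLoadWithNarrowedLoad above is plain byte arithmetic; a worked example assuming a v4i32 vector on a little-endian target:

    // extract_vector_elt (load v4i32* p), 2  -->  load i32* (p + 8)
    unsigned EltSizeInBits = 32, Elt = 2;
    unsigned PtrOff = EltSizeInBits * Elt / 8; // 8 bytes into the vector
    // The MachinePointerInfo is rebased the same way:
    // OriginalLoad->getPointerInfo().getWithOffset(8)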
@@ -9766,6 +9927,38 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
}
}
+ bool BCNumEltsChanged = false;
+ EVT ExtVT = VT.getVectorElementType();
+ EVT LVT = ExtVT;
+
+ // If the result of load has to be truncated, then it's not necessarily
+ // profitable.
+ if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
+ return SDValue();
+
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ EVT BCVT = InVec.getOperand(0).getValueType();
+ if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
+ return SDValue();
+ if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+ BCNumEltsChanged = true;
+ InVec = InVec.getOperand(0);
+ ExtVT = BCVT.getVectorElementType();
+ }
+
+ // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
+ if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
+ ISD::isNormalLoad(InVec.getNode())) {
+ SDValue Index = N->getOperand(1);
+ if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec))
+ return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
+ OrigLoad);
+ }
+
// Perform only after legalization to ensure build_vector / vector_shuffle
// optimizations have already been done.
if (!LegalOperations) return SDValue();
@@ -9776,30 +9969,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
if (ConstEltNo) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- bool NewLoad = false;
- bool BCNumEltsChanged = false;
- EVT ExtVT = VT.getVectorElementType();
- EVT LVT = ExtVT;
-
- // If the result of load has to be truncated, then it's not necessarily
- // profitable.
- if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
- return SDValue();
-
- if (InVec.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
-
- EVT BCVT = InVec.getOperand(0).getValueType();
- if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
- return SDValue();
- if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
- BCNumEltsChanged = true;
- InVec = InVec.getOperand(0);
- ExtVT = BCVT.getVectorElementType();
- NewLoad = true;
- }
LoadSDNode *LN0 = nullptr;
const ShuffleVectorSDNode *SVN = nullptr;
@@ -9842,6 +10011,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
if (ISD::isNormalLoad(InVec.getNode())) {
LN0 = cast<LoadSDNode>(InVec);
Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
+ EltNo = DAG.getConstant(Elt, EltNo.getValueType());
}
}
@@ -9854,72 +10024,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
if (Elt == -1)
return DAG.getUNDEF(LVT);
- unsigned Align = LN0->getAlignment();
- if (NewLoad) {
- // Check the resultant load doesn't need a higher alignment than the
- // original load.
- unsigned NewAlign =
- TLI.getDataLayout()
- ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT))
- return SDValue();
-
- Align = NewAlign;
- }
-
- SDValue NewPtr = LN0->getBasePtr();
- unsigned PtrOff = 0;
-
- if (Elt) {
- PtrOff = LVT.getSizeInBits() * Elt / 8;
- EVT PtrType = NewPtr.getValueType();
- if (TLI.isBigEndian())
- PtrOff = VT.getSizeInBits() / 8 - PtrOff;
- NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr,
- DAG.getConstant(PtrOff, PtrType));
- }
-
- // The replacement we need to do here is a little tricky: we need to
- // replace an extractelement of a load with a load.
- // Use ReplaceAllUsesOfValuesWith to do the replacement.
- // Note that this replacement assumes that the extractvalue is the only
- // use of the load; that's okay because we don't want to perform this
- // transformation in other cases anyway.
- SDValue Load;
- SDValue Chain;
- if (NVT.bitsGT(LVT)) {
- // If the result type of vextract is wider than the load, then issue an
- // extending load instead.
- ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT)
- ? ISD::ZEXTLOAD : ISD::EXTLOAD;
- Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(),
- NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff),
- LVT, LN0->isVolatile(), LN0->isNonTemporal(),
- Align, LN0->getTBAAInfo());
- Chain = Load.getValue(1);
- } else {
- Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr,
- LN0->getPointerInfo().getWithOffset(PtrOff),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), Align, LN0->getTBAAInfo());
- Chain = Load.getValue(1);
- if (NVT.bitsLT(LVT))
- Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load);
- else
- Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load);
- }
- WorkListRemover DeadNodes(*this);
- SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
- SDValue To[] = { Load, Chain };
- DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
- // Since we're explcitly calling ReplaceAllUses, add the new node to the
- // worklist explicitly as well.
- AddToWorkList(Load.getNode());
- AddUsersToWorkList(Load.getNode()); // Add users too
- // Make sure to revisit this node to clean it up; it will usually be dead.
- AddToWorkList(N);
- return SDValue(N, 0);
+ return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
}
return SDValue();
@@ -10280,10 +10385,24 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Opnds;
unsigned BuildVecNumElts = N0.getNumOperands();
- for (unsigned i = 0; i != BuildVecNumElts; ++i)
- Opnds.push_back(N0.getOperand(i));
- for (unsigned i = 0; i != BuildVecNumElts; ++i)
- Opnds.push_back(N1.getOperand(i));
+ EVT SclTy0 = N0.getOperand(0)->getValueType(0);
+ EVT SclTy1 = N1.getOperand(0)->getValueType(0);
+ if (SclTy0.isFloatingPoint()) {
+ for (unsigned i = 0; i != BuildVecNumElts; ++i)
+ Opnds.push_back(N0.getOperand(i));
+ for (unsigned i = 0; i != BuildVecNumElts; ++i)
+ Opnds.push_back(N1.getOperand(i));
+ } else {
+ // If the BUILD_VECTORs are built from integers, they may have different
+ // operand types. Get the smaller type and truncate all operands to it.
+ EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1;
+ for (unsigned i = 0; i != BuildVecNumElts; ++i)
+ Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
+ N0.getOperand(i)));
+ for (unsigned i = 0; i != BuildVecNumElts; ++i)
+ Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
+ N1.getOperand(i)));
+ }
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
}
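The truncation branch matters because, after type legalization, integer BUILD_VECTOR nodes may carry operands wider than the element type, and the two vectors need not agree; an illustrative shape:

    // N0: v4i16 build_vector with i32 operands
    // N1: v4i16 build_vector with i16 operands
    // MinTy = i16, so each operand is truncated before being concatenated:
    SDValue Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16,
                             N0.getOperand(0));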
@@ -10558,22 +10677,19 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
// If this shuffle node is simply a swizzle of another shuffle node,
- // and it reverses the swizzle of the previous shuffle then we can
- // optimize shuffle(shuffle(x, undef), undef) -> x.
+ // then try to simplify it.
if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
N1.getOpcode() == ISD::UNDEF) {
ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
- // Shuffle nodes can only reverse shuffles with a single non-undef value.
- if (N0.getOperand(1).getOpcode() != ISD::UNDEF)
- return SDValue();
-
// The incoming shuffle must be of the same type as the result of the
// current shuffle.
assert(OtherSV->getOperand(0).getValueType() == VT &&
"Shuffle types don't match");
+ SmallVector<int, 4> Mask;
+ // Compute the combined shuffle mask.
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
assert(Idx < (int)NumElts && "Index references undef operand");
@@ -10581,13 +10697,71 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// shuffle. Adopt the incoming index.
if (Idx >= 0)
Idx = OtherSV->getMaskElt(Idx);
+ Mask.push_back(Idx);
+ }
+
+ bool CommuteOperands = false;
+ if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
+ // To be valid, the combined shuffle mask should only reference elements
+ // from one of the two input vectors of the inner shuffle.
+ bool IsValidMask = true;
+ for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+ // See if the combined mask only references undefs or elements coming
+ // from the first shufflevector operand.
+ IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts;
+
+ if (!IsValidMask) {
+ IsValidMask = true;
+ for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+ // Check that all the elements come from the second shuffle operand.
+ IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts;
+ CommuteOperands = IsValidMask;
+ }
- // The combined shuffle must map each index to itself.
- if (Idx >= 0 && (unsigned)Idx != i)
+ // Early exit if the combined shuffle mask is not valid.
+ if (!IsValidMask)
return SDValue();
}
- return OtherSV->getOperand(0);
+ // See if this pair of shuffles can be safely folded according to either
+ // of the following rules:
+ // shuffle(shuffle(x, y), undef) -> x
+ // shuffle(shuffle(x, undef), undef) -> x
+ // shuffle(shuffle(x, y), undef) -> y
+ bool IsIdentityMask = true;
+ unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0;
+ for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) {
+ // Skip Undefs.
+ if (Mask[i] < 0)
+ continue;
+
+ // The combined shuffle must map each index to itself.
+ IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
+ }
+
+ if (IsIdentityMask) {
+ if (CommuteOperands)
+ // optimize shuffle(shuffle(x, y), undef) -> y.
+ return OtherSV->getOperand(1);
+
+ // optimize shuffle(shuffle(x, undef), undef) -> x
+ // optimize shuffle(shuffle(x, y), undef) -> x
+ return OtherSV->getOperand(0);
+ }
+
+ // It may still be beneficial to combine the two shuffles if the
+ // resulting shuffle is legal.
+ if (TLI.isShuffleMaskLegal(Mask, VT)) {
+ if (!CommuteOperands)
+ // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
+ // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+ &Mask[0]);
+
+ // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1),
+ &Mask[0]);
+ }
}
return SDValue();
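The mask composition above, on a small standalone example (NumElts = 4):

    // inner: t = shuffle(x, undef, <2, 3, 0, 1>)
    // outer:     shuffle(t, undef, <1, 0, 3, 2>)
    int OuterMask[4] = {1, 0, 3, 2}, InnerMask[4] = {2, 3, 0, 1}, Mask[4];
    for (int i = 0; i != 4; ++i)
      Mask[i] = OuterMask[i] < 0 ? -1 : InnerMask[OuterMask[i]];
    // Mask = <3, 2, 1, 0>: a single shuffle of x, emitted when the target
    // reports it legal, or folded away entirely when it is the identity.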
@@ -10729,6 +10903,27 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
}
+ // Type legalization might introduce new shuffles in the DAG.
+ // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
+ // -> (shuffle (VBinOp (A, B)), Undef, Mask).
+ if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
+ isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+ LHS.getOperand(1).getOpcode() == ISD::UNDEF &&
+ RHS.getOperand(1).getOpcode() == ISD::UNDEF) {
+ ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+ ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+
+ if (SVN0->getMask().equals(SVN1->getMask())) {
+ EVT VT = N->getValueType(0);
+ SDValue UndefVector = LHS.getOperand(1);
+ SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+ LHS.getOperand(0), RHS.getOperand(0));
+ AddUsersToWorkList(N);
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
+ &SVN0->getMask()[0]);
+ }
+ }
+
return SDValue();
}
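Spelled out, the new fold trades two shuffles for one. A standalone scalar model (values illustrative) of why shuffling once after the add equals shuffling both inputs first:

    int A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40}, M[4] = {3, 1, 0, 2};
    int Sum[4], Out[4];
    for (int i = 0; i != 4; ++i) Sum[i] = A[i] + B[i]; // add A, B once
    for (int i = 0; i != 4; ++i) Out[i] = Sum[M[i]];   // one shuffle after
    // Out[i] == A[M[i]] + B[M[i]]: the same lanes the two-shuffle form gives.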
@@ -11080,8 +11275,8 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
// fold select C, 16, 0 -> shl C, 4
if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() &&
- TLI.getBooleanContents(N0.getValueType().isVector()) ==
- TargetLowering::ZeroOrOneBooleanContent) {
+ TLI.getBooleanContents(N0.getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent) {
// If the caller doesn't want us to simplify this into a zext of a compare,
// don't do it.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 99931c1..445572a 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -42,12 +42,15 @@
#include "llvm/CodeGen/FastISel.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
@@ -558,6 +561,107 @@ bool FastISel::SelectGetElementPtr(const User *I) {
return true;
}
+/// \brief Add a stackmap or patchpoint intrinsic call's live variable operands
+/// to a stackmap or patchpoint machine instruction.
+bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
+ const CallInst *CI, unsigned StartIdx) {
+ for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) {
+ Value *Val = CI->getArgOperand(i);
+ // Check for constants and encode them with a StackMaps::ConstantOp prefix.
+ if (auto *C = dyn_cast<ConstantInt>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(C->getSExtValue()));
+ } else if (isa<ConstantPointerNull>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(0));
+ } else if (auto *AI = dyn_cast<AllocaInst>(Val)) {
+ // Values coming from a stack location also require a special encoding,
+ // but that is added later on by the target-specific frame index
+ // elimination implementation.
+ auto SI = FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ Ops.push_back(MachineOperand::CreateFI(SI->second));
+ else
+ return false;
+ } else {
+ unsigned Reg = getRegForValue(Val);
+ if (Reg == 0)
+ return false;
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+ }
+ }
+
+ return true;
+}
+
+bool FastISel::SelectStackmap(const CallInst *I) {
+ // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>,
+ // [live variables...])
+ assert(I->getCalledFunction()->getReturnType()->isVoidTy() &&
+ "Stackmap cannot return a value.");
+
+ // The stackmap intrinsic only records the live variables (the arguments
+ // passed to it) and emits NOPs (if requested). Unlike the patchpoint
+ // intrinsic, this won't be lowered to a function call. This means we don't
+ // have to worry about calling conventions and target-specific lowering code.
+ // Instead we perform the call lowering right here.
+ //
+ // CALLSEQ_START(0)
+ // STACKMAP(id, nbytes, ...)
+ // CALLSEQ_END(0, 0)
+ //
+ SmallVector<MachineOperand, 32> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+ "Expected a constant integer.");
+ const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+ "Expected a constant integer.");
+ const auto *NumBytes =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+ // Push live variables for the stack map (skipping the first two arguments
+ // <id> and <numBytes>).
+ if (!addStackMapLiveVars(Ops, I, 2))
+ return false;
+
+ // We are not adding any register mask info here, because the stackmap doesn't
+ // clobber anything.
+
+ // Add scratch registers as implicit def and early clobber.
+ CallingConv::ID CC = I->getCallingConv();
+ const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ Ops.push_back(MachineOperand::CreateReg(
+ ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
+ /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+ .addImm(0);
+
+ // Issue STACKMAP.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::STACKMAP));
+ for (auto const &MO : Ops)
+ MIB.addOperand(MO);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(0).addImm(0);
+
+ // Inform the Frame Information that we have a stackmap in this function.
+ FuncInfo.MF->getFrameInfo()->setHasStackMap();
+
+ return true;
+}
+
bool FastISel::SelectCall(const User *I) {
const CallInst *Call = cast<CallInst>(I);
@@ -713,6 +817,8 @@ bool FastISel::SelectCall(const User *I) {
UpdateValueMap(Call, ResultReg);
return true;
}
+ case Intrinsic::experimental_stackmap:
+ return SelectStackmap(Call);
}
// Usually, it does not make sense to initialize a value,
@@ -879,7 +985,6 @@ FastISel::SelectInstruction(const Instruction *I) {
/// the CFG.
void
FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) {
-
if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
// For more accurate line information if this is the only instruction
@@ -890,7 +995,11 @@ FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) {
TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr,
SmallVector<MachineOperand, 0>(), DbgLoc);
}
- FuncInfo.MBB->addSuccessor(MSucc);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(FuncInfo.MBB->getBasicBlock(),
+ MSucc->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(MSucc, BranchWeight);
}
/// SelectFNeg - Emit an FNeg operation.
@@ -1101,6 +1210,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) {
FastISel::FastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FuncInfo(funcInfo),
+ MF(funcInfo.MF),
MRI(FuncInfo.MF->getRegInfo()),
MFI(*FuncInfo.MF->getFrameInfo()),
MCP(*FuncInfo.MF->getConstantPool()),
@@ -1635,3 +1745,47 @@ bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
}
+MachineMemOperand *
+FastISel::createMachineMemOperandFor(const Instruction *I) const {
+ const Value *Ptr;
+ Type *ValTy;
+ unsigned Alignment;
+ unsigned Flags;
+ bool IsVolatile;
+
+ if (const auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ IsVolatile = LI->isVolatile();
+ Flags = MachineMemOperand::MOLoad;
+ Ptr = LI->getPointerOperand();
+ ValTy = LI->getType();
+ } else if (const auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ IsVolatile = SI->isVolatile();
+ Flags = MachineMemOperand::MOStore;
+ Ptr = SI->getPointerOperand();
+ ValTy = SI->getValueOperand()->getType();
+ } else {
+ return nullptr;
+ }
+
+ bool IsNonTemporal = I->getMetadata("nontemporal") != nullptr;
+ bool IsInvariant = I->getMetadata("invariant.load") != nullptr;
+ const MDNode *TBAAInfo = I->getMetadata(LLVMContext::MD_tbaa);
+ const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0.
+ Alignment = DL.getABITypeAlignment(ValTy);
+
+ unsigned Size = TM.getDataLayout()->getTypeStoreSize(ValTy);
+
+ if (IsVolatile)
+ Flags |= MachineMemOperand::MOVolatile;
+ if (IsNonTemporal)
+ Flags |= MachineMemOperand::MONonTemporal;
+ if (IsInvariant)
+ Flags |= MachineMemOperand::MOInvariant;
+
+ return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size,
+ Alignment, TBAAInfo, Ranges);
+}
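A hypothetical call site showing the intended use of this helper (MIB being the MachineInstrBuilder for a just-selected load, LI the originating IR instruction):

    if (MachineMemOperand *MMO = createMachineMemOperandFor(LI))
      MIB.addMemOperand(MMO); // hands volatility/TBAA/range info to codegen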
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index a59e895..c0e8c8c 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2060,7 +2060,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2095,7 +2095,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2129,7 +2129,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2266,7 +2266,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2381,7 +2381,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(TLI.getLibcallCallingConv(LC),
- Type::getVoidTy(*DAG.getContext()), Callee, &Args, 0);
+ Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2650,12 +2650,15 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1);
assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+ // A larger signed type can hold all unsigned values of the requested type,
+ // so using FP_TO_SINT is valid.
if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) {
OpToUse = ISD::FP_TO_SINT;
break;
}
- if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
+ // However, if the value may be < 0.0, we *must* use some FP_TO_SINT.
+ if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
OpToUse = ISD::FP_TO_UINT;
break;
}
@@ -2996,8 +2999,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__sync_synchronize", TLI.getPointerTy()),
- &Args, 0);
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy()), std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -3007,14 +3010,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::ATOMIC_LOAD: {
// There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
SDValue Zero = DAG.getConstant(0, Node->getValueType(0));
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
- cast<AtomicSDNode>(Node)->getMemoryVT(),
- Node->getOperand(0),
- Node->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
Results.push_back(Swap.getValue(0));
Results.push_back(Swap.getValue(1));
break;
@@ -3051,6 +3054,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Tmp.second);
break;
}
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and
+ // splits out the success value as a comparison. Expanding the resulting
+ // ATOMIC_CMP_SWAP will produce a libcall.
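+ // The three results are the value loaded from memory, a boolean that is
+ // true when that value equals the expected operand, and the output chain.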
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Node->getOperand(2),
+ Node->getOperand(3), cast<MemSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getSuccessOrdering(),
+ cast<AtomicSDNode>(Node)->getFailureOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+
+ SDValue Success = DAG.getSetCC(SDLoc(Node), Node->getValueType(1),
+ Res, Node->getOperand(2), ISD::SETEQ);
+
+ Results.push_back(Res.getValue(0));
+ Results.push_back(Success);
+ Results.push_back(Res.getValue(1));
+ break;
+ }
case ISD::DYNAMIC_STACKALLOC:
ExpandDYNAMIC_STACKALLOC(Node, Results);
break;
@@ -3074,7 +3098,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Node->getOperand(0))
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("abort", TLI.getPointerTy()), &Args, 0);
+ DAG.getExternalSymbol("abort", TLI.getPointerTy()),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
Results.push_back(CallResult.second);
@@ -3128,6 +3153,65 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Node->getOperand(0), Node->getValueType(0), dl);
Results.push_back(Tmp1);
break;
+ case ISD::FP_TO_SINT: {
+ EVT VT = Node->getOperand(0).getValueType();
+ EVT NVT = Node->getValueType(0);
+
+ // FIXME: Only f32 to i64 conversions are supported.
+ if (VT != MVT::f32 || NVT != MVT::i64)
+ break;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
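+ // For example, converting 5.5f: Bits is 0x40B00000, so ExponentBits is 129
+ // and Exponent is 2; R becomes 0x00300000 | 0x00800000 = 0x00B00000, and
+ // since Exponent <= ExponentLoBit (23) the select below takes
+ // R >> (23 - 2) = 5. The sign is zero, so the conditional negate and the
+ // final Exponent < 0 check leave the result at 5, as expected.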
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits());
+ SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT);
+ SDValue ExponentLoBit = DAG.getConstant(23, IntVT);
+ SDValue Bias = DAG.getConstant(127, IntVT);
+ SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()),
+ IntVT);
+ SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT);
+ SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT);
+
+ SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+
+ SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+ DAG.getZExtOrTrunc(ExponentLoBit, dl, TLI.getShiftAmountTy(IntVT)));
+ SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
+
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+ DAG.getZExtOrTrunc(SignLowBit, dl, TLI.getShiftAmountTy(IntVT)));
+ Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+
+ SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+ DAG.getConstant(0x00800000, IntVT));
+
+ R = DAG.getZExtOrTrunc(R, dl, NVT);
+
+ R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
+ DAG.getNode(ISD::SHL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+ dl, TLI.getShiftAmountTy(IntVT))),
+ DAG.getNode(ISD::SRL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+ dl, TLI.getShiftAmountTy(IntVT))),
+ ISD::SETGT);
+
+ SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
+ DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
+ Sign);
+
+ Results.push_back(DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT),
+ DAG.getConstant(0, NVT), Ret, ISD::SETLT));
+ break;
+ }
case ISD::FP_TO_UINT: {
SDValue True, False;
EVT VT = Node->getOperand(0).getValueType();
@@ -3653,7 +3737,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
LHS, RHS);
Results.push_back(Sum);
- EVT OType = Node->getValueType(1);
+ EVT ResultType = Node->getValueType(1);
+ EVT OType = getSetCCResultType(Node->getValueType(0));
SDValue Zero = DAG.getConstant(0, LHS.getValueType());
@@ -3676,7 +3761,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
- Results.push_back(Cmp);
+ Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType));
break;
}
case ISD::UADDO:
@@ -3687,9 +3772,14 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
LHS, RHS);
Results.push_back(Sum);
- Results.push_back(DAG.getSetCC(dl, Node->getValueType(1), Sum, LHS,
- Node->getOpcode () == ISD::UADDO ?
- ISD::SETULT : ISD::SETUGT));
+
+ EVT ResultType = Node->getValueType(1);
+ EVT SetCCType = getSetCCResultType(Node->getValueType(0));
+ ISD::CondCode CC
+ = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
+ SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
+
+ Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
break;
}
case ISD::UMULO:
@@ -3879,7 +3969,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// illegal; expand it into a SELECT_CC.
EVT VT = Node->getValueType(0);
int TrueValue;
- switch (TLI.getBooleanContents(VT.isVector())) {
+ switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
case TargetLowering::ZeroOrOneBooleanContent:
case TargetLowering::UndefinedBooleanContent:
TrueValue = 1;
@@ -3899,13 +3989,29 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp2 = Node->getOperand(1); // RHS
Tmp3 = Node->getOperand(2); // True
Tmp4 = Node->getOperand(3); // False
+ EVT VT = Node->getValueType(0);
SDValue CC = Node->getOperand(4);
+ ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
+
+ if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) {
+ // If the condition code is legal, then we need to expand this
+ // node using SETCC and SELECT.
+ EVT CmpVT = Tmp1.getValueType();
+ assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
+ "Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
+ "expanded.");
+ EVT CCVT = TLI.getSetCCResultType(*DAG.getContext(), CmpVT);
+ SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC);
+ Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
+ break;
+ }
+ // SELECT_CC is legal, so the condition code must not be.
bool Legalized = false;
// Try to legalize by inverting the condition. This is for targets that
// might support an ordered version of a condition, but not the unordered
// version (or vice versa).
- ISD::CondCode InvCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp,
Tmp1.getValueType().isInteger());
if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
// Use the new condition code and swap true and false
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 2483184..6feac0d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -138,7 +138,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
case ISD::ATOMIC_CMP_SWAP:
- Res = PromoteIntRes_Atomic2(cast<AtomicSDNode>(N)); break;
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ Res = PromoteIntRes_AtomicCmpSwap(cast<AtomicSDNode>(N), ResNo);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -192,16 +194,41 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
return Res;
}
-SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N,
+ unsigned ResNo) {
+ if (ResNo == 1) {
+ assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ EVT SVT = getSetCCResultType(N->getOperand(2).getValueType());
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+
+ // Only use the result of getSetCCResultType if it is legal,
+ // otherwise just use the promoted result type (NVT).
+ if (!TLI.isTypeLegal(SVT))
+ SVT = NVT;
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs,
+ N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3),
+ N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(),
+ N->getSynchScope());
+ ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+ ReplaceValueWith(SDValue(N, 2), Res.getValue(2));
+ return Res.getValue(1);
+ }
+
SDValue Op2 = GetPromotedInteger(N->getOperand(2));
SDValue Op3 = GetPromotedInteger(N->getOperand(3));
- SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
- N->getChain(), N->getBasePtr(), Op2, Op3,
- N->getMemOperand(), N->getSuccessOrdering(),
- N->getFailureOrdering(), N->getSynchScope());
+ SDVTList VTs =
+ DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(),
+ N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(),
+ N->getFailureOrdering(), N->getSynchScope());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
- ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ unsigned ChainOp = N->getNumValues() - 1;
+ ReplaceValueWith(SDValue(N, ChainOp), Res.getValue(ChainOp));
return Res;
}
@@ -492,7 +519,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
EVT OpTy = N->getOperand(1).getValueType();
// Promote all the way up to the canonical SetCC type.
- Mask = PromoteTargetBoolean(Mask, getSetCCResultType(OpTy));
+ Mask = PromoteTargetBoolean(Mask, OpTy);
SDValue LHS = GetPromotedInteger(N->getOperand(1));
SDValue RHS = GetPromotedInteger(N->getOperand(2));
return DAG.getNode(ISD::VSELECT, SDLoc(N),
@@ -892,8 +919,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
assert(OpNo == 1 && "only know how to promote condition");
// Promote all the way up to the canonical SetCC type.
- EVT SVT = getSetCCResultType(MVT::Other);
- SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
+ SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other);
// The chain (Op#0) and basic block destination (Op#2) are always legal types.
return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
@@ -986,9 +1012,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
EVT OpTy = N->getOperand(1).getValueType();
// Promote all the way up to the canonical SetCC type.
- EVT SVT = getSetCCResultType(N->getOpcode() == ISD::SELECT ?
- OpTy.getScalarType() : OpTy);
- Cond = PromoteTargetBoolean(Cond, SVT);
+ EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
+ Cond = PromoteTargetBoolean(Cond, OpVT);
return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
N->getOperand(2)), 0);
@@ -1143,6 +1168,26 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
ReplaceValueWith(SDValue(N, 1), Tmp.second);
break;
}
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other);
+ SDValue Tmp = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs,
+ N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ AN->getMemOperand(), AN->getSuccessOrdering(), AN->getFailureOrdering(),
+ AN->getSynchScope());
+
+ // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine
+ // success simply by comparing the loaded value against the incoming
+ // comparison.
+ SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp,
+ N->getOperand(2), ISD::SETEQ);
+
+ SplitInteger(Tmp, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Success);
+ ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1));
+ break;
+ }
case ISD::AND:
case ISD::OR:
@@ -2301,7 +2346,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args), 0)
.setSExtResult();
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2388,16 +2433,18 @@ void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
SDValue Zero = DAG.getConstant(0, VT);
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
- N->getOperand(0),
- N->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(N)->getMemOperand(),
- cast<AtomicSDNode>(N)->getOrdering(),
- cast<AtomicSDNode>(N)->getOrdering(),
- cast<AtomicSDNode>(N)->getSynchScope());
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(), VTs, N->getOperand(0),
+ N->getOperand(1), Zero, Zero, cast<AtomicSDNode>(N)->getMemOperand(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getSynchScope());
+
ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
- ReplaceValueWith(SDValue(N, 1), Swap.getValue(1));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
}
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 3971fc3..bd7dacf 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1054,7 +1054,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1065,11 +1065,14 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
/// PromoteTargetBoolean - Promote the given target boolean to a target boolean
/// of the given type. A target boolean is an integer value, not necessarily of
/// type i1, the bits of which conform to getBooleanContents.
-SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
+///
+/// ValVT is the type of values that produced the boolean.
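+///
+/// For example, a boolean produced by comparing two i64 values is extended
+/// to getSetCCResultType(i64), using the extension kind implied by the
+/// target's boolean contents for that type (zero, sign, or any extend).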
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
SDLoc dl(Bool);
+ EVT BoolVT = getSetCCResultType(ValVT);
ISD::NodeType ExtendCode =
- TargetLowering::getExtendForContent(TLI.getBooleanContents(VT.isVector()));
- return DAG.getNode(ExtendCode, dl, VT, Bool);
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT));
+ return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
}
/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index e4bbc78..d0ca6f8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -167,7 +167,7 @@ private:
SDNode *Node, bool isSigned);
std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
- SDValue PromoteTargetBoolean(SDValue Bool, EVT VT);
+ SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
void ReplaceValueWith(SDValue From, SDValue To);
void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
@@ -220,7 +220,7 @@ private:
SDValue PromoteIntRes_AssertZext(SDNode *N);
SDValue PromoteIntRes_Atomic0(AtomicSDNode *N);
SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
- SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+ SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
@@ -570,6 +570,7 @@ private:
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
@@ -644,6 +645,7 @@ private:
bool WidenVectorOperand(SDNode *N, unsigned OpNo);
SDValue WidenVecOp_BITCAST(SDNode *N);
SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue WidenVecOp_EXTEND(SDNode *N);
SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue WidenVecOp_STORE(SDNode* N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index f40ed76..7e2f7b6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -60,12 +60,15 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
case TargetLowering::TypeExpandFloat:
// Convert the expanded pieces of the input.
GetExpandedOp(InOp, Lo, Hi);
+ if (TLI.hasBigEndianPartOrdering(InVT) !=
+ TLI.hasBigEndianPartOrdering(OutVT))
+ std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
return;
case TargetLowering::TypeSplitVector:
GetSplitVector(InOp, Lo, Hi);
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(OutVT))
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -82,7 +85,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(OutVT))
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
@@ -176,7 +179,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
false, false, MinAlign(Alignment, IncrementSize));
// Handle endianness of the load.
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(OutVT))
std::swap(Lo, Hi);
}
@@ -245,7 +248,8 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+ EVT ValueVT = LD->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
unsigned Alignment = LD->getAlignment();
@@ -275,7 +279,7 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
Hi.getValue(1));
// Handle endianness of the load.
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(ValueVT))
std::swap(Lo, Hi);
// Modified the chain - switch anything that used the old chain to use
@@ -295,7 +299,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
// Handle endianness of the load.
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(OVT))
std::swap(Lo, Hi);
// Modified the chain - switch anything that used the old chain to use
@@ -459,8 +463,8 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
SDLoc dl(N);
StoreSDNode *St = cast<StoreSDNode>(N);
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
- St->getValue().getValueType());
+ EVT ValueVT = St->getValue().getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
SDValue Chain = St->getChain();
SDValue Ptr = St->getBasePtr();
unsigned Alignment = St->getAlignment();
@@ -474,7 +478,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
SDValue Lo, Hi;
GetExpandedOp(St->getValue(), Lo, Hi);
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(ValueVT))
std::swap(Lo, Hi);
Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 898cd29..507e7ff 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -37,12 +37,12 @@ class VectorLegalizer {
const TargetLowering &TLI;
bool Changed; // Keep track of whether anything changed
- /// LegalizedNodes - For nodes that are of legal width, and that have more
- /// than one use, this map indicates what regularized operand to use. This
- /// allows us to avoid legalizing the same thing more than once.
+ /// For nodes that are of legal width, and that have more than one use, this
+ /// map indicates what regularized operand to use. This allows us to avoid
+ /// legalizing the same thing more than once.
SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes;
- // Adds a node to the translation cache
+ /// \brief Adds a node to the translation cache.
void AddLegalizedOperand(SDValue From, SDValue To) {
LegalizedNodes.insert(std::make_pair(From, To));
// If someone requests legalization of the new node, return itself.
@@ -50,41 +50,81 @@ class VectorLegalizer {
LegalizedNodes.insert(std::make_pair(To, To));
}
- // Legalizes the given node
+ /// \brief Legalizes the given node.
SDValue LegalizeOp(SDValue Op);
- // Assuming the node is legal, "legalize" the results
+
+ /// \brief Assuming the node is legal, "legalize" the results.
SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
- // Implements unrolling a VSETCC.
+
+ /// \brief Implements unrolling a VSETCC.
SDValue UnrollVSETCC(SDValue Op);
- // Implements expansion for FNEG; falls back to UnrollVectorOp if FSUB
- // isn't legal.
- // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
- // SINT_TO_FLOAT and SHR on vectors isn't legal.
+
+ /// \brief Implement expand-based legalization of vector operations.
+ ///
+ /// This is just a high-level routine that dispatches to the specific code
+ /// path for the operation being legalized.
+ SDValue Expand(SDValue Op);
+
+ /// \brief Implements expansion for UINT_TO_FLOAT; falls back to
+ /// UnrollVectorOp if SINT_TO_FLOAT and SHR on vectors aren't legal.
SDValue ExpandUINT_TO_FLOAT(SDValue Op);
- // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+
+ /// \brief Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
SDValue ExpandSEXTINREG(SDValue Op);
- // Expand bswap of vectors into a shuffle if legal.
+
+ /// \brief Implement expansion for ANY_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and bitcasts to the proper
+ /// type. The contents of the bits in the extended part of each element are
+ /// undef.
+ SDValue ExpandANY_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Implement expansion for SIGN_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place, bitcasts to the proper
+ /// type, then shifts left and arithmetic shifts right to introduce a sign
+ /// extension.
+ SDValue ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Implement expansion for ZERO_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and blends zeros into
+ /// the remaining lanes, finally bitcasting to the proper type.
+ SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Expand bswap of vectors into a shuffle if legal.
SDValue ExpandBSWAP(SDValue Op);
- // Implement vselect in terms of XOR, AND, OR when blend is not supported
- // by the target.
+
+ /// \brief Implement vselect in terms of XOR, AND, OR when blend is not
+ /// supported by the target.
SDValue ExpandVSELECT(SDValue Op);
SDValue ExpandSELECT(SDValue Op);
SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);
+
+ /// \brief Implements expansion for FNEG; falls back to UnrollVectorOp if
+ /// FSUB isn't legal.
SDValue ExpandFNEG(SDValue Op);
- // Implements vector promotion; this is essentially just bitcasting the
- // operands to a different type and bitcasting the result back to the
- // original type.
- SDValue PromoteVectorOp(SDValue Op);
- // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input
- // operand to the next size up.
- SDValue PromoteVectorOpINT_TO_FP(SDValue Op);
- // Implements FP_TO_[SU]INT vector promotion of the result type; it is
- // promoted to the next size up integer type. The result is then truncated
- // back to the original type.
- SDValue PromoteVectorOpFP_TO_INT(SDValue Op, bool isSigned);
-
- public:
+
+ /// \brief Implements vector promotion.
+ ///
+ /// This is essentially just bitcasting the operands to a different type and
+ /// bitcasting the result back to the original type.
+ SDValue Promote(SDValue Op);
+
+ /// \brief Implements [SU]INT_TO_FP vector promotion.
+ ///
+ /// This is a [zs]ext of the input operand to the next size up.
+ SDValue PromoteINT_TO_FP(SDValue Op);
+
+ /// \brief Implements FP_TO_[SU]INT vector promotion of the result type.
+ ///
+ /// It is promoted to the next size up integer type. The result is then
+ /// truncated back to the original type.
+ SDValue PromoteFP_TO_INT(SDValue Op, bool isSigned);
+
+public:
+ /// \brief Begin legalizing the vector operations in the DAG.
bool Run();
VectorLegalizer(SelectionDAG& dag) :
DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
@@ -254,6 +294,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FP_EXTEND:
case ISD::FMA:
case ISD::SIGN_EXTEND_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
QueryType = Node->getValueType(0);
break;
case ISD::FP_ROUND_INREG:
@@ -267,27 +310,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
case TargetLowering::Promote:
- switch (Op.getOpcode()) {
- default:
- // "Promote" the operation by bitcasting
- Result = PromoteVectorOp(Op);
- Changed = true;
- break;
- case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP:
- // "Promote" the operation by extending the operand.
- Result = PromoteVectorOpINT_TO_FP(Op);
- Changed = true;
- break;
- case ISD::FP_TO_UINT:
- case ISD::FP_TO_SINT:
- // Promote the operation by extending the operand.
- Result = PromoteVectorOpFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
- Changed = true;
- break;
- }
+ Result = Promote(Op);
+ Changed = true;
+ break;
+ case TargetLowering::Legal:
break;
- case TargetLowering::Legal: break;
case TargetLowering::Custom: {
SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
if (Tmp1.getNode()) {
@@ -297,23 +324,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
// FALL THROUGH
}
case TargetLowering::Expand:
- if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
- Result = ExpandSEXTINREG(Op);
- else if (Node->getOpcode() == ISD::BSWAP)
- Result = ExpandBSWAP(Op);
- else if (Node->getOpcode() == ISD::VSELECT)
- Result = ExpandVSELECT(Op);
- else if (Node->getOpcode() == ISD::SELECT)
- Result = ExpandSELECT(Op);
- else if (Node->getOpcode() == ISD::UINT_TO_FP)
- Result = ExpandUINT_TO_FLOAT(Op);
- else if (Node->getOpcode() == ISD::FNEG)
- Result = ExpandFNEG(Op);
- else if (Node->getOpcode() == ISD::SETCC)
- Result = UnrollVSETCC(Op);
- else
- Result = DAG.UnrollVectorOp(Op.getNode());
- break;
+ Result = Expand(Op);
}
// Make sure that the generated code is itself legal.
@@ -328,10 +339,23 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
return Result;
}
-SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
- // Vector "promotion" is basically just bitcasting and doing the operation
- // in a different type. For example, x86 promotes ISD::AND on v2i32 to
- // v1i64.
+SDValue VectorLegalizer::Promote(SDValue Op) {
+ // For a few operations there is a specific concept for promotion based on
+ // the operand's type.
+ switch (Op.getOpcode()) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ // "Promote" the operation by extending the operand.
+ return PromoteINT_TO_FP(Op);
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ // Promote the operation by extending the operand.
+ return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
+ }
+
+ // The rest of the time, vector "promotion" is basically just bitcasting and
+ // doing the operation in a different type. For example, x86 promotes
+ // ISD::AND on v2i32 to v1i64.
MVT VT = Op.getSimpleValueType();
assert(Op.getNode()->getNumValues() == 1 &&
"Can't promote a vector with multiple results!");
@@ -351,7 +375,7 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
return DAG.getNode(ISD::BITCAST, dl, VT, Op);
}
-SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
+SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) {
// INT_TO_FP operations may require the input operand be promoted even
// when the type is otherwise legal.
EVT VT = Op.getOperand(0).getValueType();
@@ -387,7 +411,7 @@ SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
// elements and then truncate the result. This is different from the default
// PromoteVector which uses bitcast to promote, thus assuming that the
// promoted vector type has the same overall size.
-SDValue VectorLegalizer::PromoteVectorOpFP_TO_INT(SDValue Op, bool isSigned) {
+SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op, bool isSigned) {
assert(Op.getNode()->getNumValues() == 1 &&
"Can't promote a vector with multiple results!");
EVT VT = Op.getValueType();
@@ -609,6 +633,33 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
return TF;
}
+SDValue VectorLegalizer::Expand(SDValue Op) {
+ switch (Op->getOpcode()) {
+ case ISD::SIGN_EXTEND_INREG:
+ return ExpandSEXTINREG(Op);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ExpandANY_EXTEND_VECTOR_INREG(Op);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ExpandSIGN_EXTEND_VECTOR_INREG(Op);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ExpandZERO_EXTEND_VECTOR_INREG(Op);
+ case ISD::BSWAP:
+ return ExpandBSWAP(Op);
+ case ISD::VSELECT:
+ return ExpandVSELECT(Op);
+ case ISD::SELECT:
+ return ExpandSELECT(Op);
+ case ISD::UINT_TO_FP:
+ return ExpandUINT_TO_FLOAT(Op);
+ case ISD::FNEG:
+ return ExpandFNEG(Op);
+ case ISD::SETCC:
+ return UnrollVSETCC(Op);
+ default:
+ return DAG.UnrollVectorOp(Op.getNode());
+ }
+}
+
SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
// Lower a select instruction where the condition is a scalar and the
// operands are vectors. Lower this select to VSELECT and implement it
@@ -686,6 +737,85 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
}
+// Generically expand a vector anyext in register to a shuffle of the relevant
+// lanes into the appropriate locations, with other lanes left undef.
+SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // Build a base mask of undef shuffles.
+ SmallVector<int, 16> ShuffleMask;
+ ShuffleMask.resize(NumSrcElements, -1);
+
+ // Place the extended lanes into the correct locations.
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = i;
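+ // For example, any-extending the low four lanes of a v8i16 to v4i32 on a
+ // little-endian target uses the mask <0,-1,1,-1,2,-1,3,-1>: source lanes
+ // 0-3 land in the low half of each i32 lane and the high halves stay undef
+ // until the bitcast.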
+
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), ShuffleMask));
+}
+
+SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // First build an any-extend node which can be legalized above when we
+ // recurse through it.
+ Op = DAG.getAnyExtendVectorInReg(Src, DL, VT);
+
+ // Now we need sign extend. Do this by shifting the elements. Even if these
+ // aren't legal operations, they have a better chance of being legalized
+ // without full scalarization than the sign extension does.
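+ //
+ // For example, sign-extending the low four lanes of a v8i16 to v4i32
+ // any-extends to v4i32, then shifts each lane left by 16 and
+ // arithmetic-shifts right by 16 to replicate bit 15 across the high half
+ // of every element.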
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ unsigned SrcEltWidth = SrcVT.getVectorElementType().getSizeInBits();
+ SDValue ShiftAmount = DAG.getConstant(EltWidth - SrcEltWidth, VT);
+ return DAG.getNode(ISD::SRA, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, Op, ShiftAmount),
+ ShiftAmount);
+}
+
+// Generically expand a vector zext in register to a shuffle of the relevant
+// lanes into the appropriate locations, a blend of zero into the high bits,
+// and a bitcast to the wider element type.
+SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // Build up a zero vector to blend into this one.
+ EVT SrcScalarVT = SrcVT.getScalarType();
+ SDValue ScalarZero = DAG.getTargetConstant(0, SrcScalarVT);
+ SmallVector<SDValue, 4> BuildVectorOperands(NumSrcElements, ScalarZero);
+ SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, DL, SrcVT, BuildVectorOperands);
+
+ // Shuffle the incoming lanes into the correct position, and pull all other
+ // lanes from the zero vector.
+ SmallVector<int, 16> ShuffleMask;
+ ShuffleMask.reserve(NumSrcElements);
+ for (int i = 0; i < NumSrcElements; ++i)
+ ShuffleMask.push_back(i);
+
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = TLI.isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = NumSrcElements + i;
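+ // For example, zero-extending the low four lanes of a v8i16 to v4i32 on a
+ // little-endian target ends up with the mask <8,1,9,3,10,5,11,7>: indices
+ // 8-11 pull lanes 0-3 from Src (the second shuffle operand) while the odd
+ // lanes keep zeros, so after the bitcast each i32 lane is Src[i],
+ // zero-extended.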
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
+}
+
SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
EVT VT = Op.getValueType();
@@ -729,9 +859,9 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
// FIXME: Sign extend 1 to all ones if that's legal on the target.
if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
- TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
- TLI.getBooleanContents(true) !=
- TargetLowering::ZeroOrNegativeOneBooleanContent)
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getBooleanContents(Op1.getValueType()) !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
return DAG.UnrollVectorOp(Op.getNode());
// If the mask and the type are different sizes, unroll the vector op. This
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 368eba3..f77c592 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -257,8 +257,26 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
SDValue Cond = GetScalarizedVector(N->getOperand(0));
SDValue LHS = GetScalarizedVector(N->getOperand(1));
- TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false);
- TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true);
+ TargetLowering::BooleanContent ScalarBool =
+ TLI.getBooleanContents(false, false);
+ TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
+
+ // If integer and float booleans have different contents then we can't
+ // reliably optimize in all cases. There is a full explanation for this in
+ // DAGCombiner::visitSELECT() where the same issue affects folding
+ // (select C, 0, 1) to (xor C, 1).
+ if (TLI.getBooleanContents(false, false) !=
+ TLI.getBooleanContents(false, true)) {
+ // At least try the common case where the boolean is generated by a
+ // comparison.
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT OpVT = Cond->getOperand(0)->getValueType(0);
+ ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
+ VecBool = TLI.getBooleanContents(OpVT);
+ } else
+ ScalarBool = TargetLowering::UndefinedBooleanContent;
+ }
+
if (ScalarBool != VecBool) {
EVT CondVT = Cond.getValueType();
switch (ScalarBool) {
@@ -357,7 +375,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
// Vectors may have a different boolean contents to scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
- TargetLowering::getExtendForContent(TLI.getBooleanContents(true));
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
return DAG.getNode(ExtendCode, DL, NVT, Res);
}
@@ -545,6 +563,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
@@ -765,6 +784,43 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
TLI.getVectorIdxTy()));
}
+void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ GetSplitVector(Vec, Lo, Hi);
+
+ // Spill the vector to the stack.
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+ MachinePointerInfo(), false, false, 0);
+
+ // Store the new subvector into the specified index.
+ SDValue SubVecPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+ Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+ unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(VecType);
+ Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(),
+ false, false, 0);
+
+ // Load the Lo part from the stack slot.
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+ false, false, false, 0);
+
+ // Increment the pointer to the other part.
+ unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+ StackPtr =
+ DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(IncrementSize, StackPtr.getValueType()));
+
+ // Load the Hi part from the stack slot.
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+ false, false, false, MinAlign(Alignment, IncrementSize));
+}
+
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
@@ -1511,7 +1567,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD:
case ISD::AND:
- case ISD::BSWAP:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
@@ -1558,6 +1613,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
Res = WidenVecRes_Convert(N);
break;
+ case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::CTTZ:
@@ -2343,15 +2399,18 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ Res = WidenVecOp_EXTEND(N);
+ break;
+
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::TRUNCATE:
- case ISD::SIGN_EXTEND:
- case ISD::ZERO_EXTEND:
- case ISD::ANY_EXTEND:
Res = WidenVecOp_Convert(N);
break;
}
@@ -2372,6 +2431,68 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
return false;
}
+SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue InOp = N->getOperand(0);
+ // If some legalization strategy other than widening is used on the operand,
+ // we can't safely assume that just extending the low lanes is the correct
+ // transformation.
+ if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector)
+ return WidenVecOp_Convert(N);
+ InOp = GetWidenedVector(InOp);
+ assert(VT.getVectorNumElements() <
+ InOp.getValueType().getVectorNumElements() &&
+ "Input wasn't widened!");
+
+ // We may need to further widen the operand until it has the same total
+ // vector size as the result.
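+ // For example, on a target where v32i8 is legal, zero-extending a v8i8
+ // (widened to v16i8) to v8i32 first inserts the v16i8 into an undef v32i8
+ // so that the 256-bit in-register extend below sees matching vector sizes.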
+ EVT InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits()) {
+ EVT InEltVT = InVT.getVectorElementType();
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE;
+ i < e; ++i) {
+ EVT FixedVT = (MVT::SimpleValueType)i;
+ EVT FixedEltVT = FixedVT.getVectorElementType();
+ if (TLI.isTypeLegal(FixedVT) &&
+ FixedVT.getSizeInBits() == VT.getSizeInBits() &&
+ FixedEltVT == InEltVT) {
+ assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
+ "Not enough elements in the fixed type for the operand!");
+ assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
+ "We can't have the same type as we started with!");
+ if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
+ InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
+ DAG.getUNDEF(FixedVT), InOp,
+ DAG.getConstant(0, TLI.getVectorIdxTy()));
+ else
+ InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+ DAG.getConstant(0, TLI.getVectorIdxTy()));
+ break;
+ }
+ }
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits())
+ // We couldn't find a legal vector type that was a widening of the input
+ // and could be extended in-register to the result type, so we have to
+ // scalarize.
+ return WidenVecOp_Convert(N);
+ }
+
+ // Use special DAG nodes to represent the operation of extending the
+ // low lanes.
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Extend legalization on on extend operation!");
+ case ISD::ANY_EXTEND:
+ return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
+ case ISD::SIGN_EXTEND:
+ return DAG.getSignExtendVectorInReg(InOp, DL, VT);
+ case ISD::ZERO_EXTEND:
+ return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
+ }
+}
+
SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
// Since the result is legal and the input is illegal, it is unlikely
// that we can fix the input to a legal type so unroll the convert
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index f92230c..624003f 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -442,7 +442,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
ResCount -= (regPressureDelta(SU) * ScaleTwo);
}
- // These are platform specific things.
+ // These are platform-specific things.
// Will need to go into the back end
// and be accessed from here via a hook.
for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 78ec4df..13cfae7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -170,7 +170,8 @@ public:
if (DisableSchedCycles || !NeedLatency)
HazardRec = new ScheduleHazardRecognizer();
else
- HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this);
+ HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(
+ tm.getSubtargetImpl(), this);
}
~ScheduleDAGRRList() {
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index 51c51d6..4589b0c 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -73,7 +73,8 @@ public:
: ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
const TargetMachine &tm = mf.getTarget();
- HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this);
+ HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(
+ tm.getSubtargetImpl(), this);
}
~ScheduleDAGVLIW() {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b1b8035..daff1f2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -48,6 +48,7 @@
#include "llvm/Target/TargetSelectionDAGInfo.h"
#include <algorithm>
#include <cmath>
+
using namespace llvm;
/// makeVTList - Return an instance of the SDVTList struct initialized with the
@@ -147,33 +148,34 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
- unsigned i = 0, e = N->getNumOperands();
-
- // Skip over all of the undef values.
- while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
- ++i;
+ bool IsAllUndef = true;
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::UNDEF)
+ continue;
+ IsAllUndef = false;
+ // Do not accept build_vectors that aren't all constants or which have non-0
+ // elements. We have to be a bit careful here, as the type of the constant
+ // may not be the same as the type of the vector elements due to type
+ // legalization (the elements are promoted to a legal type for the target
+ // and a vector of a type may be legal when the base element type is not).
+ // We only want to check enough bits to cover the vector elements, because
+ // we care if the resultant vector is all zeros, not whether the individual
+ // constants are.
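+ // For example, a zero v4i8 build_vector may have had its operands promoted
+ // to i32; an operand such as 0x2A00 still has its low eight bits clear, so
+ // the i8 element it produces is zero even though the constant is not.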
+ SDValue Zero = N->getOperand(i);
+ unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
+ if (CN->getAPIntValue().countTrailingZeros() < EltSize)
+ return false;
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
+ return false;
+ } else
+ return false;
+ }
// Do not accept an all-undef vector.
- if (i == e) return false;
-
- // Do not accept build_vectors that aren't all constants or which have non-0
- // elements.
- SDValue Zero = N->getOperand(i);
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
- if (!CN->isNullValue())
- return false;
- } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
- if (!CFPN->getValueAPF().isPosZero())
- return false;
- } else
+ if (IsAllUndef)
return false;
-
- // Okay, we have at least one 0 value, check to see if the rest match or are
- // undefs.
- for (++i; i != e; ++i)
- if (N->getOperand(i) != Zero &&
- N->getOperand(i).getOpcode() != ISD::UNDEF)
- return false;
return true;
}
@@ -381,6 +383,20 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
}
}
+static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, bool nuw, bool nsw,
+ bool exact) {
+ ID.AddBoolean(nuw);
+ ID.AddBoolean(nsw);
+ ID.AddBoolean(exact);
+}
+
+/// AddBinaryNodeIDCustom - Add a binary node's special flags (nuw, nsw,
+/// exact) to the FoldingSet node ID.
+static void AddBinaryNodeIDCustom(FoldingSetNodeID &ID, unsigned Opcode,
+ bool nuw, bool nsw, bool exact) {
+ if (isBinOpWithFlags(Opcode))
+ AddBinaryNodeIDCustom(ID, nuw, nsw, exact);
+}
+
static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
SDVTList VTList, ArrayRef<SDValue> OpList) {
AddNodeIDOpcode(ID, OpC);
@@ -473,7 +489,21 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::MUL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SHL: {
+ const BinaryWithFlagsSDNode *BinNode = cast<BinaryWithFlagsSDNode>(N);
+ AddBinaryNodeIDCustom(ID, N->getOpcode(), BinNode->hasNoUnsignedWrap(),
+ BinNode->hasNoSignedWrap(), BinNode->isExact());
+ break;
+ }
case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
@@ -527,7 +557,7 @@ static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
// Add the return value info.
AddNodeIDValueTypes(ID, N->getVTList());
// Add the operand info.
- AddNodeIDOperands(ID, makeArrayRef(N->op_begin(), N->op_end()));
+ AddNodeIDOperands(ID, N->ops());
// Handle SDNode leafs with special info.
AddNodeIDCustom(ID, N);
@@ -926,6 +956,25 @@ void SelectionDAG::allnodes_clear() {
DeallocateNode(AllNodes.begin());
}
+BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL,
+ SDVTList VTs, SDValue N1,
+ SDValue N2, bool nuw, bool nsw,
+ bool exact) {
+ if (isBinOpWithFlags(Opcode)) {
+ BinaryWithFlagsSDNode *FN = new (NodeAllocator) BinaryWithFlagsSDNode(
+ Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+ FN->setHasNoUnsignedWrap(nuw);
+ FN->setHasNoSignedWrap(nsw);
+ FN->setIsExact(exact);
+
+ return FN;
+ }
+
+ BinarySDNode *N = new (NodeAllocator)
+ BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+ return N;
+}
+
void SelectionDAG::clear() {
allnodes_clear();
OperandAllocator.Reset();
@@ -963,11 +1012,12 @@ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
getNode(ISD::TRUNCATE, DL, VT, Op);
}
-SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT) {
+SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT,
+ EVT OpVT) {
if (VT.bitsLE(Op.getValueType()))
return getNode(ISD::TRUNCATE, SL, VT, Op);
- TargetLowering::BooleanContent BType = TLI->getBooleanContents(VT.isVector());
+ TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT);
return getNode(TLI->getExtendForContent(BType), SL, VT, Op);
}
@@ -983,6 +1033,36 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) {
getConstant(Imm, Op.getValueType()));
}
+SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, SDLoc DL, EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
///
SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) {
@@ -995,7 +1075,7 @@ SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) {
SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue TrueValue;
- switch (TLI->getBooleanContents(VT.isVector())) {
+ switch (TLI->getBooleanContents(VT)) {
case TargetLowering::ZeroOrOneBooleanContent:
case TargetLowering::UndefinedBooleanContent:
TrueValue = getConstant(1, VT);
@@ -1190,15 +1270,8 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
if (BitWidth < 64)
Offset = SignExtend64(Offset, BitWidth);
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (!GVar) {
- // If GV is an alias then use the aliasee for determining thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasee());
- }
-
unsigned Opc;
- if (GVar && GVar->isThreadLocal())
+ if (GV->isThreadLocal())
Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
else
Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
@@ -1454,6 +1527,11 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
N1 = getUNDEF(VT);
commuteShuffle(N1, N2, MaskVec);
}
+ // Reset our undef status after accounting for the mask.
+ N2Undef = N2.getOpcode() == ISD::UNDEF;
+ // Re-check whether both sides ended up undef.
+ if (N1.getOpcode() == ISD::UNDEF && N2Undef)
+ return getUNDEF(VT);
// If Identity shuffle return that node.
bool Identity = true;
@@ -1464,9 +1542,36 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
return N1;
// Shuffling a constant splat doesn't change the result.
- if (N2Undef && N1.getOpcode() == ISD::BUILD_VECTOR)
- if (cast<BuildVectorSDNode>(N1)->getConstantSplatValue())
- return N1;
+ if (N2Undef) {
+ SDValue V = N1;
+
+ // Look through any bitcasts. We check that these don't change the number
+ // (and size) of elements and just change their types.
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ // A splat should always show up as a build vector node.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ // If this is a splat of an undef, shuffling it is also undef.
+ if (Splat && Splat.getOpcode() == ISD::UNDEF)
+ return getUNDEF(VT);
+
+ // We can only skip the shuffle if there is a splatted value and no undef
+ // lanes are rearranged by the shuffle.
+ if (Splat && UndefElements.none()) {
+ // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+ // element counts match or the splatted value is a zero constant.
+ if (V.getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements())
+ return N1;
+ if (auto *C = dyn_cast<ConstantSDNode>(Splat))
+ if (C->isNullValue())
+ return N1;
+ }
+ }
+ }
FoldingSetNodeID ID;
SDValue Ops[2] = { N1, N2 };
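
A small illustration (assumed, not from the patch) of the fold above: any permutation of a splat against an undef right-hand side is the splat itself, so the original build vector is returned and no shuffle node is created. A four-lane vector type and the usual SelectionDAG headers are assumed:

// Build a splat of C and shuffle it against undef; with matching lane
// counts and no undef lanes, getVectorShuffle now returns Splat directly.
static SDValue shuffleSplat(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue C) {
  assert(VT.getVectorNumElements() == 4 && "sketch assumes four lanes");
  SmallVector<SDValue, 4> Lanes(4, C);
  SDValue Splat = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Lanes);
  int Mask[4] = {3, 2, 1, 0}; // an arbitrary permutation
  return DAG.getVectorShuffle(VT, DL, Splat, DAG.getUNDEF(VT), Mask);
}
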
@@ -1692,7 +1797,8 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
case ISD::SETTRUE:
case ISD::SETTRUE2: {
const TargetLowering *TLI = TM.getTargetLowering();
- TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(VT.isVector());
+ TargetLowering::BooleanContent Cnt =
+ TLI->getBooleanContents(N1->getValueType(0));
return getConstant(
Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
}
@@ -1923,11 +2029,20 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
case ISD::UMULO:
if (Op.getResNo() != 1)
break;
- // The boolean result conforms to getBooleanContents. Fall through.
+ // The boolean result conforms to getBooleanContents.
+ // If we know the result of a setcc has the top bits zero, use this info.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integers.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
- if (TLI->getBooleanContents(Op.getValueType().isVector()) ==
- TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1)
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
case ISD::SHL:
@@ -2043,7 +2158,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
unsigned MemBits = VT.getScalarType().getSizeInBits();
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
} else if (const MDNode *Ranges = LD->getRanges()) {
- computeKnownBitsLoad(*Ranges, KnownZero);
+ computeKnownBitsFromRangeMetadata(*Ranges, KnownZero);
}
break;
}
@@ -2192,8 +2307,11 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
const APInt &RA = Rem->getAPIntValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
- KnownZero |= ~LowBits;
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1);
+
+ // The upper bits are all zero, the lower ones are unchanged.
+ KnownZero = KnownZero2 | ~LowBits;
+ KnownOne = KnownOne2 & LowBits;
break;
}
}
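
A standalone restatement (names assumed) of the corrected urem fold, assuming llvm/ADT/APInt.h and llvm/Support/MathExtras.h. For x % 8, LowBits is 7, so every bit above bit 2 becomes known zero, while known bits of x within the low three bits are now preserved instead of being dropped:

// Known bits of x % RA for a power-of-two RA, given the known bits of x.
static void knownBitsForRemPow2(const APInt &KnownZeroX,
                                const APInt &KnownOneX, uint64_t RA,
                                APInt &KnownZero, APInt &KnownOne) {
  assert(isPowerOf2_64(RA) && "sketch assumes a power-of-two divisor");
  APInt LowBits(KnownZeroX.getBitWidth(), RA - 1);
  KnownZero = KnownZeroX | ~LowBits; // the upper bits are all zero
  KnownOne = KnownOneX & LowBits;    // the low bits pass through unchanged
}
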
@@ -2323,9 +2441,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
if (Op.getResNo() != 1)
break;
- // The boolean result conforms to getBooleanContents. Fall through.
+ // The boolean result conforms to getBooleanContents.
+ // If setcc returns 0/-1, all bits are sign bits.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integer.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
case ISD::SETCC:
// If setcc returns 0/-1, all bits are sign bits.
- if (TLI->getBooleanContents(Op.getValueType().isVector()) ==
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
@@ -2940,7 +3065,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT,
}
SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
- SDValue N2) {
+ SDValue N2, bool nuw, bool nsw, bool exact) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
switch (Opcode) {
@@ -3380,22 +3505,25 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
}
// Memoize this node if possible.
- SDNode *N;
+ BinarySDNode *N;
SDVTList VTs = getVTList(VT);
+ const bool BinOpHasFlags = isBinOpWithFlags(Opcode);
if (VT != MVT::Glue) {
- SDValue Ops[] = { N1, N2 };
+ SDValue Ops[] = {N1, N2};
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
+ if (BinOpHasFlags)
+ AddBinaryNodeIDCustom(ID, Opcode, nuw, nsw, exact);
void *IP = nullptr;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, N1, N2);
+ N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact);
+
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, N1, N2);
+
+ N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact);
}
AllNodes.push_back(N);
@@ -3583,7 +3711,7 @@ static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
if (Str.empty()) {
if (VT.isInteger())
return DAG.getConstant(0, VT);
- else if (VT == MVT::f32 || VT == MVT::f64)
+ else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
return DAG.getConstantFP(0.0, VT);
else if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();
@@ -4110,7 +4238,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
Type::getVoidTy(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -4166,7 +4294,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
Type::getVoidTy(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -4230,7 +4358,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
.setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
Type::getVoidTy(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
- TLI->getPointerTy()), &Args, 0)
+ TLI->getPointerTy()), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -4281,51 +4409,47 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
Ordering, SynchScope);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
- SDValue Chain, SDValue Ptr, SDValue Cmp,
- SDValue Swp, MachinePointerInfo PtrInfo,
- unsigned Alignment,
- AtomicOrdering SuccessOrdering,
- AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope) {
+SDValue SelectionDAG::getAtomicCmpSwap(
+ unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs, SDValue Chain,
+ SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
+ unsigned Alignment, AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+ Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- // All atomics are load and store, except for ATMOIC_LOAD and ATOMIC_STORE.
- // For now, atomics are considered to be volatile always.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
unsigned Flags = MachineMemOperand::MOVolatile;
- if (Opcode != ISD::ATOMIC_STORE)
- Flags |= MachineMemOperand::MOLoad;
- if (Opcode != ISD::ATOMIC_LOAD)
- Flags |= MachineMemOperand::MOStore;
+ Flags |= MachineMemOperand::MOLoad;
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
- return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO,
- SuccessOrdering, FailureOrdering, SynchScope);
+ return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO,
+ SuccessOrdering, FailureOrdering, SynchScope);
}
-SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
- SDValue Chain,
- SDValue Ptr, SDValue Cmp,
- SDValue Swp, MachineMemOperand *MMO,
- AtomicOrdering SuccessOrdering,
- AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope) {
- assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op");
+SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT,
+ SDVTList VTs, SDValue Chain, SDValue Ptr,
+ SDValue Cmp, SDValue Swp,
+ MachineMemOperand *MMO,
+ AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering,
+ SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+ Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
- EVT VT = Cmp.getValueType();
-
- SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
- return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, SuccessOrdering,
- FailureOrdering, SynchScope);
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO,
+ SuccessOrdering, FailureOrdering, SynchScope);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
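
A hypothetical builder (helper name assumed) for the new entry point: the value list is (loaded value, i1 success flag, output chain), the same VTList that visitAtomicCmpXchg uses later in this patch:

static SDValue buildCmpSwapWithSuccess(SelectionDAG &DAG, SDLoc DL, EVT MemVT,
                                       SDValue Chain, SDValue Ptr, SDValue Cmp,
                                       SDValue New, MachineMemOperand *MMO) {
  // Result 0 is the loaded value, result 1 the success bit, and result 2
  // the chain, so callers read the chain with getValue(2).
  SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
  return DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, DL, MemVT,
                              VTs, Chain, Ptr, Cmp, New, MMO,
                              SequentiallyConsistent, SequentiallyConsistent,
                              CrossThread);
}
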
@@ -5610,10 +5734,13 @@ SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT,
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops) {
- if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+ ArrayRef<SDValue> Ops, bool nuw, bool nsw,
+ bool exact) {
+ if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
+ if (isBinOpWithFlags(Opcode))
+ AddBinaryNodeIDCustom(ID, nuw, nsw, exact);
void *IP = nullptr;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return E;
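
A minimal sketch (assumed) of the extended lookup: ask the CSE map whether an 'add nsw' of two values already exists, so a caller can reuse it rather than build a duplicate that differs only in flags:

static SDNode *findExistingAddNSW(SelectionDAG &DAG, EVT VT, SDValue A,
                                  SDValue B) {
  SDValue Ops[] = {A, B};
  // The flag arguments participate in the FoldingSet ID via
  // AddBinaryNodeIDCustom, so an 'add' without nsw will not match.
  return DAG.getNodeIfExists(ISD::ADD, DAG.getVTList(VT), Ops,
                             /*nuw=*/false, /*nsw=*/true, /*exact=*/false);
}
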
@@ -5960,7 +6087,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
// count of outstanding operands.
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
SDNode *N = I++;
- checkForCycles(N);
+ checkForCycles(N, this);
unsigned Degree = N->getNumOperands();
if (Degree == 0) {
// A node with no uses, add it to the result array immediately.
@@ -5980,7 +6107,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
// such that by the time the end is reached all nodes will be sorted.
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) {
SDNode *N = I;
- checkForCycles(N);
+ checkForCycles(N, this);
// N is in sorted position, so all its uses have one less operand
// that needs to be sorted.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
@@ -6005,7 +6132,9 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
#ifndef NDEBUG
SDNode *S = ++I;
dbgs() << "Overran sorted position:\n";
- S->dumprFull();
+ S->dumprFull(this); dbgs() << "\n";
+ dbgs() << "Checking if this is due to cycles\n";
+ checkForCycles(this, true);
#endif
llvm_unreachable(nullptr);
}
@@ -6554,16 +6683,43 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
return true;
}
-ConstantSDNode *BuildVectorSDNode::getConstantSplatValue() const {
- SDValue Op0 = getOperand(0);
- if (Op0.getOpcode() != ISD::Constant)
- return nullptr;
+SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
+ if (UndefElements) {
+ UndefElements->clear();
+ UndefElements->resize(getNumOperands());
+ }
+ SDValue Splatted;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ SDValue Op = getOperand(i);
+ if (Op.getOpcode() == ISD::UNDEF) {
+ if (UndefElements)
+ (*UndefElements)[i] = true;
+ } else if (!Splatted) {
+ Splatted = Op;
+ } else if (Splatted != Op) {
+ return SDValue();
+ }
+ }
+
+ if (!Splatted) {
+ assert(getOperand(0).getOpcode() == ISD::UNDEF &&
+ "Can only have a splat without a constant for all undefs.");
+ return getOperand(0);
+ }
- for (unsigned i = 1, e = getNumOperands(); i != e; ++i)
- if (getOperand(i) != Op0)
- return nullptr;
+ return Splatted;
+}
- return cast<ConstantSDNode>(Op0);
+ConstantSDNode *
+BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantSDNode>(
+ getSplatValue(UndefElements).getNode());
+}
+
+ConstantFPSDNode *
+BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantFPSDNode>(
+ getSplatValue(UndefElements).getNode());
}
bool BuildVectorSDNode::isConstant() const {
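
A hypothetical caller (name assumed) of the new splat query, mirroring the UndefElements check that getVectorShuffle performs earlier in this patch; assumes llvm/ADT/BitVector.h:

// Return the splatted value only when every lane of the build vector is
// defined; a null SDValue means "no fully defined splat".
static SDValue getFullyDefinedSplat(const BuildVectorSDNode *BV) {
  BitVector UndefElements;
  SDValue Splat = BV->getSplatValue(&UndefElements);
  if (Splat && UndefElements.none())
    return Splat;
  return SDValue();
}
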
@@ -6591,10 +6747,11 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
return true;
}
-#ifdef XDEBUG
+#ifndef NDEBUG
static void checkForCyclesHelper(const SDNode *N,
SmallPtrSet<const SDNode*, 32> &Visited,
- SmallPtrSet<const SDNode*, 32> &Checked) {
+ SmallPtrSet<const SDNode*, 32> &Checked,
+ const llvm::SelectionDAG *DAG) {
// If this node has already been checked, don't check it again.
if (Checked.count(N))
return;
@@ -6602,29 +6759,37 @@ static void checkForCyclesHelper(const SDNode *N,
// If a node has already been visited on this depth-first walk, reject it as
// a cycle.
if (!Visited.insert(N)) {
- dbgs() << "Offending node:\n";
- N->dumprFull();
errs() << "Detected cycle in SelectionDAG\n";
+ dbgs() << "Offending node:\n";
+ N->dumprFull(DAG); dbgs() << "\n";
abort();
}
for(unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
- checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked);
+ checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked, DAG);
Checked.insert(N);
Visited.erase(N);
}
#endif
-void llvm::checkForCycles(const llvm::SDNode *N) {
+void llvm::checkForCycles(const llvm::SDNode *N,
+ const llvm::SelectionDAG *DAG,
+ bool force) {
+#ifndef NDEBUG
+ bool check = force;
#ifdef XDEBUG
- assert(N && "Checking nonexistent SDNode");
- SmallPtrSet<const SDNode*, 32> visited;
- SmallPtrSet<const SDNode*, 32> checked;
- checkForCyclesHelper(N, visited, checked);
-#endif
+ check = true;
+#endif // XDEBUG
+ if (check) {
+ assert(N && "Checking nonexistent SDNode");
+ SmallPtrSet<const SDNode*, 32> visited;
+ SmallPtrSet<const SDNode*, 32> checked;
+ checkForCyclesHelper(N, visited, checked, DAG);
+ }
+#endif // !NDEBUG
}
-void llvm::checkForCycles(const llvm::SelectionDAG *DAG) {
- checkForCycles(DAG->getRoot().getNode());
+void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
+ checkForCycles(DAG->getRoot().getNode(), DAG, force);
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 070e929..28d8e98 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -169,7 +169,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
- if (TLI.isBigEndian())
+ if (TLI.hasBigEndianPartOrdering(ValueVT))
std::swap(Lo, Hi);
Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
} else {
@@ -2784,8 +2784,22 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
- setValue(&I, DAG.getNode(OpCode, getCurSDLoc(),
- Op1.getValueType(), Op1, Op2));
+
+ bool nuw = false;
+ bool nsw = false;
+ bool exact = false;
+ if (const OverflowingBinaryOperator *OFBinOp =
+ dyn_cast<const OverflowingBinaryOperator>(&I)) {
+ nuw = OFBinOp->hasNoUnsignedWrap();
+ nsw = OFBinOp->hasNoSignedWrap();
+ }
+ if (const PossiblyExactOperator *ExactOp =
+ dyn_cast<const PossiblyExactOperator>(&I))
+ exact = ExactOp->isExact();
+
+ SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
+ Op1, Op2, nuw, nsw, exact);
+ setValue(&I, BinNodeValue);
}
void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
@@ -2816,8 +2830,25 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
}
- setValue(&I, DAG.getNode(Opcode, getCurSDLoc(),
- Op1.getValueType(), Op1, Op2));
+ bool nuw = false;
+ bool nsw = false;
+ bool exact = false;
+
+ if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
+
+ if (const OverflowingBinaryOperator *OFBinOp =
+ dyn_cast<const OverflowingBinaryOperator>(&I)) {
+ nuw = OFBinOp->hasNoUnsignedWrap();
+ nsw = OFBinOp->hasNoSignedWrap();
+ }
+ if (const PossiblyExactOperator *ExactOp =
+ dyn_cast<const PossiblyExactOperator>(&I))
+ exact = ExactOp->isExact();
+ }
+
+ SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
+ nuw, nsw, exact);
+ setValue(&I, Res);
}
void SelectionDAGBuilder::visitSDiv(const User &I) {
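
A compact restatement (helper name assumed) of the flag plumbing above: read the wrap/exact flags off the IR operator and hand them to the flag-aware getNode overload introduced earlier in this patch. Assumes llvm/IR/Operator.h:

static SDValue buildFlaggedBinOp(SelectionDAG &DAG, SDLoc DL, unsigned Opc,
                                 SDValue L, SDValue R, const User &I) {
  bool nuw = false, nsw = false, exact = false;
  if (const auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
    nuw = OFBinOp->hasNoUnsignedWrap();  // e.g. 'add nuw'
    nsw = OFBinOp->hasNoSignedWrap();    // e.g. 'add nsw'
  }
  if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
    exact = ExactOp->isExact();          // e.g. 'ashr exact'
  return DAG.getNode(Opc, DL, L.getValueType(), L, R, nuw, nsw, exact);
}
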
@@ -3570,12 +3601,12 @@ static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
if (Before) {
if (Order == AcquireRelease || Order == SequentiallyConsistent)
Order = Release;
- else if (Order == Acquire || Order == Monotonic)
+ else if (Order == Acquire || Order == Monotonic || Order == Unordered)
return Chain;
} else {
if (Order == AcquireRelease)
Order = Acquire;
- else if (Order == Release || Order == Monotonic)
+ else if (Order == Release || Order == Monotonic || Order == Unordered)
return Chain;
}
SDValue Ops[3];
@@ -3598,19 +3629,17 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
InChain = InsertFenceForAtomic(InChain, SuccessOrder, Scope, true, dl,
DAG, *TLI);
- SDValue L =
- DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
- getValue(I.getCompareOperand()).getSimpleValueType(),
- InChain,
- getValue(I.getPointerOperand()),
- getValue(I.getCompareOperand()),
- getValue(I.getNewValOperand()),
- MachinePointerInfo(I.getPointerOperand()), 0 /* Alignment */,
- TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder,
- TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder,
- Scope);
+ MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
+ SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
+ SDValue L = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
+ getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
+ getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
+ 0 /* Alignment */,
+ TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder,
+ TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder, Scope);
- SDValue OutChain = L.getValue(1);
+ SDValue OutChain = L.getValue(2);
if (TLI->getInsertFencesForAtomic())
OutChain = InsertFenceForAtomic(OutChain, SuccessOrder, Scope, false, dl,
@@ -5293,7 +5322,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
CLI.setDebugLoc(sdl).setChain(getRoot())
.setCallee(CallingConv::C, I.getType(),
DAG.getExternalSymbol(TrapFuncName.data(), TLI->getPointerTy()),
- &Args, 0);
+ std::move(Args), 0);
std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
DAG.setRoot(Result.second);
@@ -5410,6 +5439,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
bool isTailCall,
MachineBasicBlock *LandingPad) {
+ const TargetLowering *TLI = TM.getTargetLowering();
PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
FunctionType *FTy = cast<FunctionType>(PT->getElementType());
Type *RetTy = FTy->getReturnType();
@@ -5420,45 +5450,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
TargetLowering::ArgListEntry Entry;
Args.reserve(CS.arg_size());
- // Check whether the function can return without sret-demotion.
- SmallVector<ISD::OutputArg, 4> Outs;
- const TargetLowering *TLI = TM.getTargetLowering();
- GetReturnInfo(RetTy, CS.getAttributes(), Outs, *TLI);
-
- bool CanLowerReturn = TLI->CanLowerReturn(CS.getCallingConv(),
- DAG.getMachineFunction(),
- FTy->isVarArg(), Outs,
- FTy->getContext());
-
- SDValue DemoteStackSlot;
- int DemoteStackIdx = -100;
-
- if (!CanLowerReturn) {
- assert(!CS.hasInAllocaArgument() &&
- "sret demotion is incompatible with inalloca");
- uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(
- FTy->getReturnType());
- unsigned Align = TLI->getDataLayout()->getPrefTypeAlignment(
- FTy->getReturnType());
- MachineFunction &MF = DAG.getMachineFunction();
- DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
- Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
-
- DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI->getPointerTy());
- Entry.Node = DemoteStackSlot;
- Entry.Ty = StackSlotPtrType;
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isInReg = false;
- Entry.isSRet = true;
- Entry.isNest = false;
- Entry.isByVal = false;
- Entry.isReturned = false;
- Entry.Alignment = Align;
- Args.push_back(Entry);
- RetTy = Type::getVoidTy(FTy->getContext());
- }
-
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
const Value *V = *i;
@@ -5499,58 +5490,20 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// Check if target-independent constraints permit a tail call here.
// Target-dependent constraints are checked within TLI->LowerCallTo.
- if (isTailCall && !isInTailCallPosition(CS, *TLI))
+ if (isTailCall && !isInTailCallPosition(CS, DAG))
isTailCall = false;
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(RetTy, FTy, Callee, &Args, CS).setTailCall(isTailCall);
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CS).setTailCall(isTailCall);
std::pair<SDValue,SDValue> Result = TLI->LowerCallTo(CLI);
assert((isTailCall || Result.second.getNode()) &&
"Non-null chain expected with non-tail call!");
assert((Result.second.getNode() || !Result.first.getNode()) &&
"Null value expected with tail call!");
- if (Result.first.getNode()) {
+ if (Result.first.getNode())
setValue(CS.getInstruction(), Result.first);
- } else if (!CanLowerReturn && Result.second.getNode()) {
- // The instruction result is the result of loading from the
- // hidden sret parameter.
- SmallVector<EVT, 1> PVTs;
- Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
-
- ComputeValueVTs(*TLI, PtrRetTy, PVTs);
- assert(PVTs.size() == 1 && "Pointers should fit in one register");
- EVT PtrVT = PVTs[0];
-
- SmallVector<EVT, 4> RetTys;
- SmallVector<uint64_t, 4> Offsets;
- RetTy = FTy->getReturnType();
- ComputeValueVTs(*TLI, RetTy, RetTys, &Offsets);
-
- unsigned NumValues = RetTys.size();
- SmallVector<SDValue, 4> Values(NumValues);
- SmallVector<SDValue, 4> Chains(NumValues);
-
- for (unsigned i = 0; i < NumValues; ++i) {
- SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT,
- DemoteStackSlot,
- DAG.getConstant(Offsets[i], PtrVT));
- SDValue L = DAG.getLoad(RetTys[i], getCurSDLoc(), Result.second, Add,
- MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]),
- false, false, false, 1);
- Values[i] = L;
- Chains[i] = L.getValue(1);
- }
-
- SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
- MVT::Other, Chains);
- PendingLoads.push_back(Chain);
-
- setValue(CS.getInstruction(),
- DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
- DAG.getVTList(RetTys), Values));
- }
if (!Result.second.getNode()) {
// As a special case, a null chain means that a tail call has been emitted
@@ -6845,7 +6798,7 @@ SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
- .setCallee(CI.getCallingConv(), retTy, Callee, &Args, NumArgs)
+ .setCallee(CI.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
.setDiscardResult(!CI.use_empty());
const TargetLowering *TLI = TM.getTargetLowering();
@@ -7092,6 +7045,21 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) {
FuncInfo.MF->getFrameInfo()->setHasPatchPoint();
}
+/// Returns an AttributeSet representing the attributes applied to the return
+/// value of the given call.
+static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+ SmallVector<Attribute::AttrKind, 2> Attrs;
+ if (CLI.RetSExt)
+ Attrs.push_back(Attribute::SExt);
+ if (CLI.RetZExt)
+ Attrs.push_back(Attribute::ZExt);
+ if (CLI.IsInReg)
+ Attrs.push_back(Attribute::InReg);
+
+ return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
+ Attrs);
+}
+
/// TargetLowering::LowerCallTo - This is the default LowerCallTo
/// implementation, which just calls LowerCall.
/// FIXME: When all targets are
@@ -7100,24 +7068,62 @@ std::pair<SDValue, SDValue>
TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Handle the incoming return values from the call.
CLI.Ins.clear();
+ Type *OrigRetTy = CLI.RetTy;
SmallVector<EVT, 4> RetTys;
- ComputeValueVTs(*this, CLI.RetTy, RetTys);
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
- EVT VT = RetTys[I];
- MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
- for (unsigned i = 0; i != NumRegs; ++i) {
- ISD::InputArg MyFlags;
- MyFlags.VT = RegisterVT;
- MyFlags.ArgVT = VT;
- MyFlags.Used = CLI.IsReturnValueUsed;
- if (CLI.RetSExt)
- MyFlags.Flags.setSExt();
- if (CLI.RetZExt)
- MyFlags.Flags.setZExt();
- if (CLI.IsInReg)
- MyFlags.Flags.setInReg();
- CLI.Ins.push_back(MyFlags);
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(*this, CLI.RetTy, RetTys, &Offsets);
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this);
+
+ bool CanLowerReturn =
+ this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
+ CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
+ SDValue DemoteStackSlot;
+ int DemoteStackIdx = -100;
+ if (!CanLowerReturn) {
+ // FIXME: equivalent assert?
+ // assert(!CS.hasInAllocaArgument() &&
+ // "sret demotion is incompatible with inalloca");
+ uint64_t TySize = getDataLayout()->getTypeAllocSize(CLI.RetTy);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(CLI.RetTy);
+ MachineFunction &MF = CLI.DAG.getMachineFunction();
+ DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+ Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);
+
+ DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy());
+ ArgListEntry Entry;
+ Entry.Node = DemoteStackSlot;
+ Entry.Ty = StackSlotPtrType;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Entry.isInReg = false;
+ Entry.isSRet = true;
+ Entry.isNest = false;
+ Entry.isByVal = false;
+ Entry.isReturned = false;
+ Entry.Alignment = Align;
+ CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
+ CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
+ } else {
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetSExt)
+ MyFlags.Flags.setSExt();
+ if (CLI.RetZExt)
+ MyFlags.Flags.setZExt();
+ if (CLI.IsInReg)
+ MyFlags.Flags.setInReg();
+ CLI.Ins.push_back(MyFlags);
+ }
}
}
@@ -7260,31 +7266,59 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
"LowerCall emitted a value with the wrong type!");
});
- // Collect the legal value parts into potentially illegal values
- // that correspond to the original function's return values.
- ISD::NodeType AssertOp = ISD::DELETED_NODE;
- if (CLI.RetSExt)
- AssertOp = ISD::AssertSext;
- else if (CLI.RetZExt)
- AssertOp = ISD::AssertZext;
SmallVector<SDValue, 4> ReturnValues;
- unsigned CurReg = 0;
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
- EVT VT = RetTys[I];
- MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
-
- ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
- NumRegs, RegisterVT, VT, nullptr,
- AssertOp));
- CurReg += NumRegs;
- }
-
- // For a function returning void, there is no return value. We can't create
- // such a node, so we just return a null return value in that case. In
- // that case, nothing will actually look at the value.
- if (ReturnValues.empty())
- return std::make_pair(SDValue(), CLI.Chain);
+ if (!CanLowerReturn) {
+ // The instruction result is the result of loading from the
+ // hidden sret parameter.
+ SmallVector<EVT, 1> PVTs;
+ Type *PtrRetTy = PointerType::getUnqual(OrigRetTy);
+
+ ComputeValueVTs(*this, PtrRetTy, PVTs);
+ assert(PVTs.size() == 1 && "Pointers should fit in one register");
+ EVT PtrVT = PVTs[0];
+
+ unsigned NumValues = RetTys.size();
+ ReturnValues.resize(NumValues);
+ SmallVector<SDValue, 4> Chains(NumValues);
+
+ for (unsigned i = 0; i < NumValues; ++i) {
+ SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
+ CLI.DAG.getConstant(Offsets[i], PtrVT));
+ SDValue L = CLI.DAG.getLoad(
+ RetTys[i], CLI.DL, CLI.Chain, Add,
+ MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false,
+ false, false, 1);
+ ReturnValues[i] = L;
+ Chains[i] = L.getValue(1);
+ }
+
+ CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
+ } else {
+ // Collect the legal value parts into potentially illegal values
+ // that correspond to the original function's return values.
+ ISD::NodeType AssertOp = ISD::DELETED_NODE;
+ if (CLI.RetSExt)
+ AssertOp = ISD::AssertSext;
+ else if (CLI.RetZExt)
+ AssertOp = ISD::AssertZext;
+ unsigned CurReg = 0;
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+
+ ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
+ NumRegs, RegisterVT, VT, nullptr,
+ AssertOp));
+ CurReg += NumRegs;
+ }
+
+ // For a function returning void, there is no return value. We can't create
+ // such a node, so we just return a null return value in that case. In
+ // that case, nothing will actually look at the value.
+ if (ReturnValues.empty())
+ return std::make_pair(SDValue(), CLI.Chain);
+ }
SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
CLI.DAG.getVTList(RetTys), ReturnValues);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index fb29691..84679f9 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -320,7 +320,7 @@ private:
/// 1. Preserve the architecture independence of stack protector generation.
///
/// 2. Preserve the normal IR level stack protector check for platforms like
- /// OpenBSD for which we support platform specific stack protector
+ /// OpenBSD for which we support platform-specific stack protector
/// generation.
///
/// The main problem that guided the present solution is that one can not
@@ -338,7 +338,7 @@ private:
/// basic block (where the return inst is placed) and then move it back
/// later at SelectionDAG/MI time before the stack protector check if the
/// tail call optimization failed. The MI level option was nixed
- /// immediately since it would require platform specific pattern
+ /// immediately since it would require platform-specific pattern
/// matching. The SelectionDAG level option was nixed because
/// SelectionDAG only processes one IR level basic block at a time
/// implying one could not create a DAG Combine to move the callinst.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index d6b5255..b3a452f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -55,6 +55,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::PREFETCH: return "Prefetch";
case ISD::ATOMIC_FENCE: return "AtomicFence";
case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap";
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return "AtomicCmpSwapWithSuccess";
case ISD::ATOMIC_SWAP: return "AtomicSwap";
case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub";
@@ -220,6 +221,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ZERO_EXTEND: return "zero_extend";
case ISD::ANY_EXTEND: return "any_extend";
case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+ case ISD::ANY_EXTEND_VECTOR_INREG: return "any_extend_vector_inreg";
+ case ISD::SIGN_EXTEND_VECTOR_INREG: return "sign_extend_vector_inreg";
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return "zero_extend_vector_inreg";
case ISD::TRUNCATE: return "truncate";
case ISD::FP_ROUND: return "fp_round";
case ISD::FLT_ROUNDS_: return "flt_rounds";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 472fc9c..57e22e2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -141,6 +141,25 @@ STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
+
+// Intrinsic instructions...
+STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call");
+STATISTIC(NumFastIselFailSAddWithOverflow,
+ "Fast isel fails on sadd.with.overflow");
+STATISTIC(NumFastIselFailUAddWithOverflow,
+ "Fast isel fails on uadd.with.overflow");
+STATISTIC(NumFastIselFailSSubWithOverflow,
+ "Fast isel fails on ssub.with.overflow");
+STATISTIC(NumFastIselFailUSubWithOverflow,
+ "Fast isel fails on usub.with.overflow");
+STATISTIC(NumFastIselFailSMulWithOverflow,
+ "Fast isel fails on smul.with.overflow");
+STATISTIC(NumFastIselFailUMulWithOverflow,
+ "Fast isel fails on umul.with.overflow");
+STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress");
+STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call");
+STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call");
+STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call");
#endif
static cl::opt<bool>
@@ -974,7 +993,37 @@ static void collectFailStats(const Instruction *I) {
case Instruction::FCmp: NumFastIselFailFCmp++; return;
case Instruction::PHI: NumFastIselFailPHI++; return;
case Instruction::Select: NumFastIselFailSelect++; return;
- case Instruction::Call: NumFastIselFailCall++; return;
+ case Instruction::Call: {
+ if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ NumFastIselFailIntrinsicCall++; return;
+ case Intrinsic::sadd_with_overflow:
+ NumFastIselFailSAddWithOverflow++; return;
+ case Intrinsic::uadd_with_overflow:
+ NumFastIselFailUAddWithOverflow++; return;
+ case Intrinsic::ssub_with_overflow:
+ NumFastIselFailSSubWithOverflow++; return;
+ case Intrinsic::usub_with_overflow:
+ NumFastIselFailUSubWithOverflow++; return;
+ case Intrinsic::smul_with_overflow:
+ NumFastIselFailSMulWithOverflow++; return;
+ case Intrinsic::umul_with_overflow:
+ NumFastIselFailUMulWithOverflow++; return;
+ case Intrinsic::frameaddress:
+ NumFastIselFailFrameaddress++; return;
+ case Intrinsic::sqrt:
+ NumFastIselFailSqrt++; return;
+ case Intrinsic::experimental_stackmap:
+ NumFastIselFailStackMap++; return;
+ case Intrinsic::experimental_patchpoint_void: // fall-through
+ case Intrinsic::experimental_patchpoint_i64:
+ NumFastIselFailPatchPoint++; return;
+ }
+ }
+ NumFastIselFailCall++;
+ return;
+ }
case Instruction::Shl: NumFastIselFailShl++; return;
case Instruction::LShr: NumFastIselFailLShr++; return;
case Instruction::AShr: NumFastIselFailAShr++; return;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b75d805..42372a2 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -105,7 +105,7 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
.setSExtResult(isSigned).setZExtResult(!isSigned);
return LowerCallTo(CLI);
@@ -327,6 +327,10 @@ TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
assert(Op.getNode()->getNumValues() == 1 &&
"ShrinkDemandedOp only supports nodes with one result!");
+ // Early return, as this function cannot handle vector types.
+ if (Op.getValueType().isVector())
+ return false;
+
// Don't do this if the node has another user, which may require the
// full value.
if (!Op.getNode()->hasOneUse())
@@ -1146,18 +1150,21 @@ bool TargetLowering::isConstTrueVal(const SDNode *N) const {
if (!N)
return false;
- bool IsVec = false;
const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
if (!CN) {
const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
if (!BV)
return false;
- IsVec = true;
- CN = BV->getConstantSplatValue();
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || !UndefElements.none())
+ return false;
}
- switch (getBooleanContents(IsVec)) {
+ switch (getBooleanContents(N->getValueType(0))) {
case UndefinedBooleanContent:
return CN->getAPIntValue()[0];
case ZeroOrOneBooleanContent:
@@ -1173,18 +1180,21 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const {
if (!N)
return false;
- bool IsVec = false;
const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
if (!CN) {
const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
if (!BV)
return false;
- IsVec = true;
- CN = BV->getConstantSplatValue();
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || !UndefElements.none())
+ return false;
}
- if (getBooleanContents(IsVec) == UndefinedBooleanContent)
+ if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
return !CN->getAPIntValue()[0];
return CN->isNullValue();
@@ -1205,7 +1215,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
case ISD::SETFALSE2: return DAG.getConstant(0, VT);
case ISD::SETTRUE:
case ISD::SETTRUE2: {
- TargetLowering::BooleanContent Cnt = getBooleanContents(VT.isVector());
+ TargetLowering::BooleanContent Cnt =
+ getBooleanContents(N0->getValueType(0));
return DAG.getConstant(
Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
}
@@ -1412,7 +1423,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
NewConst, Cond);
- return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT);
+ return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType());
}
break;
}
@@ -1496,7 +1507,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
} else if (N1C->getAPIntValue() == 1 &&
(VT == MVT::i1 ||
- getBooleanContents(false) == ZeroOrOneBooleanContent)) {
+ getBooleanContents(N0->getValueType(0)) ==
+ ZeroOrOneBooleanContent)) {
SDValue Op0 = N0;
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
@@ -1767,7 +1779,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// The sext(setcc()) => setcc() optimization relies on the appropriate
// constant being emitted.
uint64_t EqVal = 0;
- switch (getBooleanContents(N0.getValueType().isVector())) {
+ switch (getBooleanContents(N0.getValueType())) {
case UndefinedBooleanContent:
case ZeroOrOneBooleanContent:
EqVal = ISD::isTrueWhenEqual(Cond);
@@ -2613,7 +2625,8 @@ SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl,
if (ShAmt) {
// TODO: For UDIV use SRL instead of SRA.
SDValue Amt = DAG.getConstant(ShAmt, getShiftAmountTy(Op1.getValueType()));
- Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt);
+ Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, false, false,
+ true);
d = d.ashr(ShAmt);
}
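
The SRA above in isolation, as a hedged sketch (helper name assumed): building an 'exact' arithmetic shift right tells later combines that no set bits are shifted out:

static SDValue buildExactSra(SelectionDAG &DAG, const TargetLowering &TLI,
                             SDLoc DL, SDValue Val, unsigned Amt) {
  EVT VT = Val.getValueType();
  SDValue AmtV = DAG.getConstant(Amt, TLI.getShiftAmountTy(VT));
  // nuw and nsw do not apply to right shifts; only exact is set.
  return DAG.getNode(ISD::SRA, DL, VT, Val, AmtV,
                     /*nuw=*/false, /*nsw=*/false, /*exact=*/true);
}
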
diff --git a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
index 1120be8..0e89bad 100644
--- a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
@@ -15,8 +15,8 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-TargetSelectionDAGInfo::TargetSelectionDAGInfo(const TargetMachine &TM)
- : DL(TM.getDataLayout()) {
+TargetSelectionDAGInfo::TargetSelectionDAGInfo(const DataLayout *DL)
+ : DL(DL) {
}
TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 4dd87dd..3ba502f 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -28,10 +28,9 @@ using namespace llvm;
#define DEBUG_TYPE "stackmaps"
namespace llvm {
-cl::opt<bool> EnableStackMapLiveness("enable-stackmap-liveness",
- cl::Hidden, cl::desc("Enable StackMap Liveness Analysis Pass"));
cl::opt<bool> EnablePatchPointLiveness("enable-patchpoint-liveness",
- cl::Hidden, cl::desc("Enable PatchPoint Liveness Analysis Pass"));
+ cl::Hidden, cl::init(true),
+ cl::desc("Enable PatchPoint Liveness Analysis Pass"));
}
STATISTIC(NumStackMapFuncVisited, "Number of functions visited");
@@ -62,15 +61,17 @@ void StackMapLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
/// Calculate the liveness information for the given machine function.
bool StackMapLiveness::runOnMachineFunction(MachineFunction &_MF) {
+ if (!EnablePatchPointLiveness)
+ return false;
+
DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
<< _MF.getName() << " **********\n");
MF = &_MF;
TRI = MF->getTarget().getRegisterInfo();
++NumStackMapFuncVisited;
- // Skip this function if there are no stackmaps or patchpoints to process.
- if (!((MF->getFrameInfo()->hasStackMap() && EnableStackMapLiveness) ||
- (MF->getFrameInfo()->hasPatchPoint() && EnablePatchPointLiveness))) {
+ // Skip this function if there are no patchpoints to process.
+ if (!MF->getFrameInfo()->hasPatchPoint()) {
++NumStackMapFuncSkipped;
return false;
}
@@ -88,13 +89,10 @@ bool StackMapLiveness::calculateLiveness() {
LiveRegs.addLiveOuts(MBBI);
bool HasStackMap = false;
// Reverse iterate over all instructions and add the current live register
- // set to an instruction if we encounter a stackmap or patchpoint
- // instruction.
+ // set to an instruction if we encounter a patchpoint instruction.
for (MachineBasicBlock::reverse_iterator I = MBBI->rbegin(),
E = MBBI->rend(); I != E; ++I) {
- int Opc = I->getOpcode();
- if ((EnableStackMapLiveness && (Opc == TargetOpcode::STACKMAP)) ||
- (EnablePatchPointLiveness && (Opc == TargetOpcode::PATCHPOINT))) {
+ if (I->getOpcode() == TargetOpcode::PATCHPOINT) {
addLiveOutSetToMI(*I);
HasChanged = true;
HasStackMap = true;
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index c3f84c6..83966bd0 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -671,7 +671,7 @@ bool TargetInstrInfo::usePreRAHazardRecognizer() const {
// Default implementation of CreateTargetRAHazardRecognizer.
ScheduleHazardRecognizer *TargetInstrInfo::
-CreateTargetHazardRecognizer(const TargetMachine *TM,
+CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const {
// Dummy hazard recognizer allows all instructions to issue.
return new ScheduleHazardRecognizer();
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 2634d71..c574fd4 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -39,7 +39,7 @@ using namespace llvm;
/// InitLibcallNames - Set default libcall names.
///
-static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
+static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::SHL_I16] = "__ashlhi3";
Names[RTLIB::SHL_I32] = "__ashlsi3";
Names[RTLIB::SHL_I64] = "__ashldi3";
@@ -384,7 +384,7 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
- if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
+ if (TT.getEnvironment() == Triple::GNU) {
Names[RTLIB::SINCOS_F32] = "sincosf";
Names[RTLIB::SINCOS_F64] = "sincos";
Names[RTLIB::SINCOS_F80] = "sincosl";
@@ -399,7 +399,7 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
Names[RTLIB::SINCOS_PPCF128] = nullptr;
}
- if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) {
+ if (TT.getOS() != Triple::OpenBSD) {
Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
} else {
// These are generally not available.
@@ -690,6 +690,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
ExceptionPointerRegister = 0;
ExceptionSelectorRegister = 0;
BooleanContents = UndefinedBooleanContent;
+ BooleanFloatContents = UndefinedBooleanContent;
BooleanVectorContents = UndefinedBooleanContent;
SchedPreferenceInfo = Sched::ILP;
JumpBufSize = 0;
@@ -702,7 +703,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
SupportJumpTables = true;
MinimumJumpTableEntries = 4;
- InitLibcallNames(LibcallRoutineNames, TM);
+ InitLibcallNames(LibcallRoutineNames, Triple(TM.getTargetTriple()));
InitCmpLibcallCCs(CmpLibcallCCs);
InitLibcallCallingConvs(LibcallCallingConvs);
}
@@ -730,6 +731,10 @@ void TargetLoweringBase::initActions() {
setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
}
+ // Most backends expect to see the node that just returns the loaded value.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
+ (MVT::SimpleValueType)VT, Expand);
+
// These operations default to expand.
setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
@@ -739,8 +744,15 @@ void TargetLoweringBase::initActions() {
// These operations default to expand for vector types.
if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
- VT <= MVT::LAST_VECTOR_VALUETYPE)
+ VT <= MVT::LAST_VECTOR_VALUETYPE) {
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG,
+ (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG,
+ (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG,
+ (MVT::SimpleValueType)VT, Expand);
+ }
}
// Most targets ignore the @llvm.prefetch intrinsic.
@@ -1080,24 +1092,25 @@ void TargetLoweringBase::computeRegisterProperties() {
// Loop over all of the vector value types to see which need transformations.
for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
- if (isTypeLegal(VT)) continue;
+ MVT VT = (MVT::SimpleValueType) i;
+ if (isTypeLegal(VT))
+ continue;
- // Determine if there is a legal wider type. If so, we should promote to
- // that wider vector type.
MVT EltVT = VT.getVectorElementType();
unsigned NElts = VT.getVectorNumElements();
- if (NElts != 1 && !shouldSplitVectorType(VT)) {
- bool IsLegalWiderType = false;
- // First try to promote the elements of integer vectors. If no legal
- // promotion was found, fallback to the widen-vector method.
- for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
- MVT SVT = (MVT::SimpleValueType)nVT;
+ bool IsLegalWiderType = false;
+ LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
+ switch (PreferredAction) {
+ case TypePromoteInteger: {
+ // Try to promote the elements of integer vectors. If no legal
+ // promotion was found, fall through to the widen-vector method.
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
// Promote vectors of integers to vectors with the same number
// of elements, with a wider element type.
if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits()
- && SVT.getVectorNumElements() == NElts &&
- isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
+ && SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)
+ && SVT.getScalarType().isInteger()) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1106,15 +1119,15 @@ void TargetLoweringBase::computeRegisterProperties() {
break;
}
}
-
- if (IsLegalWiderType) continue;
-
+ if (IsLegalWiderType)
+ break;
+ }
+ case TypeWidenVector: {
// Try to widen the vector.
- for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
- MVT SVT = (MVT::SimpleValueType)nVT;
- if (SVT.getVectorElementType() == EltVT &&
- SVT.getVectorNumElements() > NElts &&
- isTypeLegal(SVT)) {
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ if (SVT.getVectorElementType() == EltVT
+ && SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1123,27 +1136,34 @@ void TargetLoweringBase::computeRegisterProperties() {
break;
}
}
- if (IsLegalWiderType) continue;
+ if (IsLegalWiderType)
+ break;
}
-
- MVT IntermediateVT;
- MVT RegisterVT;
- unsigned NumIntermediates;
- NumRegistersForVT[i] =
- getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates,
- RegisterVT, this);
- RegisterTypeForVT[i] = RegisterVT;
-
- MVT NVT = VT.getPow2VectorType();
- if (NVT == VT) {
- // Type is already a power of 2. The default action is to split.
- TransformToType[i] = MVT::Other;
- unsigned NumElts = VT.getVectorNumElements();
- ValueTypeActions.setTypeAction(VT,
- NumElts > 1 ? TypeSplitVector : TypeScalarizeVector);
- } else {
- TransformToType[i] = NVT;
- ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ // If the vector could not be widened to a legal type either, fall
+ // through and break it down instead.
+ case TypeSplitVector:
+ case TypeScalarizeVector: {
+ MVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ NumRegistersForVT[i] = getVectorTypeBreakdownMVT(VT, IntermediateVT,
+ NumIntermediates, RegisterVT, this);
+ RegisterTypeForVT[i] = RegisterVT;
+
+ MVT NVT = VT.getPow2VectorType();
+ if (NVT == VT) {
+ // Type is already a power of 2. The default action is to split.
+ TransformToType[i] = MVT::Other;
+ if (PreferredAction == TypeScalarizeVector)
+ ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
+ else
+ ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+ } else {
+ TransformToType[i] = NVT;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown vector legalization action!");
}
}
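
A hypothetical target hook driving the new switch (class name and type choice assumed): request that v2i64 be scalarized while every other vector type keeps the default policy:

class ExampleTargetLowering : public TargetLowering {
  // computeRegisterProperties consults this per vector type and now honors
  // TypeScalarizeVector for power-of-two types via the switch above.
  LegalizeTypeAction getPreferredVectorAction(EVT VT) const override {
    if (VT == MVT::v2i64)
      return TypeScalarizeVector;
    return TargetLoweringBase::getPreferredVectorAction(VT);
  }
};
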
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index dda2259..03f4a51 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -48,16 +48,12 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
MachineModuleInfo *MMI) const {
unsigned Encoding = getPersonalityEncoding();
- switch (Encoding & 0x70) {
- default:
- report_fatal_error("We do not support this DWARF encoding yet!");
- case dwarf::DW_EH_PE_absptr:
- return TM.getSymbol(GV, Mang);
- case dwarf::DW_EH_PE_pcrel: {
+ if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect)
return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
TM.getSymbol(GV, Mang)->getName());
- }
- }
+ if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr)
+ return TM.getSymbol(GV, Mang);
+ report_fatal_error("We do not support this DWARF encoding yet!");
}
void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
@@ -196,6 +192,18 @@ getELFSectionFlags(SectionKind K) {
return Flags;
}
+static const Comdat *getELFComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("ELF COMDATs only support SelectionKind::Any, '" +
+ C->getName() + "' cannot be lowered.");
+
+ return C;
+}
+
const MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM) const {
@@ -204,14 +212,20 @@ const MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
// Infer section flags from the section name if we can.
Kind = getELFKindForNamedSection(SectionName, Kind);
+ StringRef Group = "";
+ unsigned Flags = getELFSectionFlags(Kind);
+ if (const Comdat *C = getELFComdat(GV)) {
+ Group = C->getName();
+ Flags |= ELF::SHF_GROUP;
+ }
return getContext().getELFSection(SectionName,
- getELFSectionType(SectionName, Kind),
- getELFSectionFlags(Kind), Kind);
+ getELFSectionType(SectionName, Kind), Flags,
+ Kind, /*EntrySize=*/0, Group);
}
/// getSectionPrefixForGlobal - Return the section prefix name used by options
/// FunctionsSections and DataSections.
-static const char *getSectionPrefixForGlobal(SectionKind Kind) {
+static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
if (Kind.isText()) return ".text.";
if (Kind.isReadOnly()) return ".rodata.";
if (Kind.isBSS()) return ".bss.";
@@ -228,7 +242,6 @@ static const char *getSectionPrefixForGlobal(SectionKind Kind) {
return ".data.rel.ro.";
}
-
const MCSection *TargetLoweringObjectFileELF::
SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
Mangler &Mang, const TargetMachine &TM) const {
@@ -242,18 +255,20 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
// If this global is linkonce/weak and the target handles this by emitting it
// into a 'uniqued' section name, create and return the section now.
- if ((GV->isWeakForLinker() || EmitUniquedSection) &&
+ if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
!Kind.isCommon()) {
- const char *Prefix;
- Prefix = getSectionPrefixForGlobal(Kind);
+ StringRef Prefix = getSectionPrefixForGlobal(Kind);
- SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
+ SmallString<128> Name(Prefix);
TM.getNameWithPrefix(Name, GV, Mang, true);
StringRef Group = "";
unsigned Flags = getELFSectionFlags(Kind);
- if (GV->isWeakForLinker()) {
- Group = Name.substr(strlen(Prefix));
+ if (GV->isWeakForLinker() || GV->hasComdat()) {
+ if (const Comdat *C = getELFComdat(GV))
+ Group = C->getName();
+ else
+ Group = Name.substr(Prefix.size());
Flags |= ELF::SHF_GROUP;
}
@@ -340,7 +355,7 @@ getSectionForConstant(SectionKind Kind) const {
}
const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
- unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
+ unsigned Priority, const MCSymbol *KeySym) const {
// The default scheme is .ctor / .dtor, so we have to invert the priority
// numbering.
if (Priority == 65535)
@@ -360,7 +375,7 @@ const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
}
const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
- unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
+ unsigned Priority, const MCSymbol *KeySym) const {
// The default scheme is .ctor / .dtor, so we have to invert the priority
// numbering.
if (Priority == 65535)
@@ -487,6 +502,15 @@ emitModuleFlags(MCStreamer &Streamer,
Streamer.AddBlankLine();
}
+static void checkMachOComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return;
+
+ report_fatal_error("MachO doesn't support COMDATs, '" + C->getName() +
+ "' cannot be lowered.");
+}
+
const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal(
const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM) const {
@@ -494,6 +518,9 @@ const MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal(
StringRef Segment, Section;
unsigned TAA = 0, StubSize = 0;
bool TAAParsed;
+
+ checkMachOComdat(GV);
+
std::string ErrorCode =
MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section,
TAA, TAAParsed, StubSize);
@@ -564,6 +591,7 @@ bool TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols(
const MCSection *TargetLoweringObjectFileMachO::
SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
Mangler &Mang, const TargetMachine &TM) const {
+ checkMachOComdat(GV);
// Handle thread local data.
if (Kind.isThreadBSS()) return TLSBSSSection;
@@ -732,6 +760,50 @@ getCOFFSectionFlags(SectionKind K) {
return Flags;
}
+static const GlobalValue *getComdatGVForCOFF(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ assert(C && "expected GV to have a Comdat!");
+
+ StringRef ComdatGVName = C->getName();
+ const GlobalValue *ComdatGV = GV->getParent()->getNamedValue(ComdatGVName);
+ if (!ComdatGV)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' does not exist.");
+
+ if (ComdatGV->getComdat() != C)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' is not a key for it's COMDAT.");
+
+ return ComdatGV;
+}
+
+static int getSelectionForCOFF(const GlobalValue *GV) {
+ if (const Comdat *C = GV->getComdat()) {
+ const GlobalValue *ComdatKey = getComdatGVForCOFF(GV);
+ if (const auto *GA = dyn_cast<GlobalAlias>(ComdatKey))
+ ComdatKey = GA->getBaseObject();
+ if (ComdatKey == GV) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ case Comdat::ExactMatch:
+ return COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH;
+ case Comdat::Largest:
+ return COFF::IMAGE_COMDAT_SELECT_LARGEST;
+ case Comdat::NoDuplicates:
+ return COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ case Comdat::SameSize:
+ return COFF::IMAGE_COMDAT_SELECT_SAME_SIZE;
+ }
+ } else {
+ return COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+ }
+ } else if (GV->isWeakForLinker()) {
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ }
+ return 0;
+}
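+// Example with hypothetical names: if @bar is a member of comdat $foo and
+// @foo is the comdat's key, getSelectionForCOFF(@foo) maps the comdat's
+// SelectionKind directly via the switch above, while
+// getSelectionForCOFF(@bar) returns IMAGE_COMDAT_SELECT_ASSOCIATIVE because
+// @bar is not its comdat's key.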
+
const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
const TargetMachine &TM) const {
@@ -739,11 +811,21 @@ const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
unsigned Characteristics = getCOFFSectionFlags(Kind);
StringRef Name = GV->getSection();
StringRef COMDATSymName = "";
- if (GV->isWeakForLinker()) {
- Selection = COFF::IMAGE_COMDAT_SELECT_ANY;
- Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
- MCSymbol *Sym = TM.getSymbol(GV, Mang);
- COMDATSymName = Sym->getName();
+ if ((GV->isWeakForLinker() || GV->hasComdat()) && !Kind.isCommon()) {
+ Selection = getSelectionForCOFF(GV);
+ const GlobalValue *ComdatGV;
+ if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ ComdatGV = getComdatGVForCOFF(GV);
+ else
+ ComdatGV = GV;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
+ COMDATSymName = Sym->getName();
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ } else {
+ Selection = 0;
+ }
}
return getContext().getCOFFSection(Name,
Characteristics,
@@ -780,17 +862,27 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
// into a 'uniqued' section name, create and return the section now.
// Section names depend on the name of the symbol which is not feasible if the
// symbol has private linkage.
- if ((GV->isWeakForLinker() || EmitUniquedSection) &&
- !GV->hasPrivateLinkage() && !Kind.isCommon()) {
+ if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
+ !Kind.isCommon()) {
const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
unsigned Characteristics = getCOFFSectionFlags(Kind);
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
- MCSymbol *Sym = TM.getSymbol(GV, Mang);
- return getContext().getCOFFSection(
- Name, Characteristics, Kind, Sym->getName(),
- GV->isWeakForLinker() ? COFF::IMAGE_COMDAT_SELECT_ANY
- : COFF::IMAGE_COMDAT_SELECT_NODUPLICATES);
+ int Selection = getSelectionForCOFF(GV);
+ if (!Selection)
+ Selection = COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ const GlobalValue *ComdatGV;
+ if (GV->hasComdat())
+ ComdatGV = getComdatGVForCOFF(GV);
+ else
+ ComdatGV = GV;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
+ StringRef COMDATSymName = Sym->getName();
+ return getContext().getCOFFSection(Name, Characteristics, Kind,
+ COMDATSymName, Selection);
+ }
}
if (Kind.isText())
@@ -868,8 +960,7 @@ emitModuleFlags(MCStreamer &Streamer,
static const MCSection *getAssociativeCOFFSection(MCContext &Ctx,
const MCSection *Sec,
- const MCSymbol *KeySym,
- const MCSection *KeySec) {
+ const MCSymbol *KeySym) {
// Return the normal section if we don't have to be associative.
if (!KeySym)
return Sec;
@@ -877,20 +968,19 @@ static const MCSection *getAssociativeCOFFSection(MCContext &Ctx,
// Make an associative section with the same name and kind as the normal
// section.
const MCSectionCOFF *SecCOFF = cast<MCSectionCOFF>(Sec);
- const MCSectionCOFF *KeySecCOFF = cast<MCSectionCOFF>(KeySec);
unsigned Characteristics =
SecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
return Ctx.getCOFFSection(SecCOFF->getSectionName(), Characteristics,
SecCOFF->getKind(), KeySym->getName(),
- COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, KeySecCOFF);
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
}
const MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
- unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
- return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym, KeySec);
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym);
}
const MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
- unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
- return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym, KeySec);
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym);
}
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index e52e8af..3961905 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -734,7 +734,7 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj)
object::RelocToApply R(V.visit(Type, Reloc, 0, SymAddr));
if (V.error()) {
SmallString<32> Name;
- error_code ec(Reloc.getTypeName(Name));
+ std::error_code ec(Reloc.getTypeName(Name));
if (ec) {
errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
}
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index 2524adc..fe7e46d 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
+#include <set>
using namespace llvm;
void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
@@ -30,6 +31,7 @@ void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
uint64_t HighPC = Desc.getEndAddress();
appendRange(CUOffset, LowPC, HighPC);
}
+ ParsedCUOffsets.insert(CUOffset);
}
}
@@ -56,69 +58,55 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
}
}
- sortAndMinimize();
+ construct();
}
void DWARFDebugAranges::clear() {
+ Endpoints.clear();
Aranges.clear();
ParsedCUOffsets.clear();
}
void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
uint64_t HighPC) {
- if (!Aranges.empty()) {
- if (Aranges.back().CUOffset == CUOffset &&
- Aranges.back().HighPC() == LowPC) {
- Aranges.back().setHighPC(HighPC);
- return;
- }
- }
- Aranges.push_back(Range(LowPC, HighPC, CUOffset));
-}
-
-void DWARFDebugAranges::sortAndMinimize() {
- const size_t orig_arange_size = Aranges.size();
- // Size of one? If so, no sorting is needed
- if (orig_arange_size <= 1)
+ if (LowPC >= HighPC)
return;
- // Sort our address range entries
- std::stable_sort(Aranges.begin(), Aranges.end());
-
- // Most address ranges are contiguous from function to function
- // so our new ranges will likely be smaller. We calculate the size
- // of the new ranges since although std::vector objects can be resized,
- // the will never reduce their allocated block size and free any excesss
- // memory, so we might as well start a brand new collection so it is as
- // small as possible.
-
- // First calculate the size of the new minimal arange vector
- // so we don't have to do a bunch of re-allocations as we
- // copy the new minimal stuff over to the new collection.
- size_t minimal_size = 1;
- for (size_t i = 1; i < orig_arange_size; ++i) {
- if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i]))
- ++minimal_size;
- }
+ Endpoints.emplace_back(LowPC, CUOffset, true);
+ Endpoints.emplace_back(HighPC, CUOffset, false);
+}
- // Else, make a new RangeColl that _only_ contains what we need.
- RangeColl minimal_aranges;
- minimal_aranges.resize(minimal_size);
- uint32_t j = 0;
- minimal_aranges[j] = Aranges[0];
- for (size_t i = 1; i < orig_arange_size; ++i) {
- if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i])) {
- minimal_aranges[j].setHighPC(Aranges[i].HighPC());
+void DWARFDebugAranges::construct() {
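+ // Worked example (hypothetical values): appendRange(1, 0x10, 0x20) followed
+ // by appendRange(2, 0x18, 0x30) yields the sorted endpoints
+ // {0x10,1,start}, {0x18,2,start}, {0x20,1,end}, {0x30,2,end}.
+ // The sweep below first emits [0x10, 0x18) for CU 1, extends it to
+ // [0x10, 0x20) while CU 1 is still live, then starts [0x20, 0x30) for CU 2.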
+ std::multiset<uint32_t> ValidCUs; // Maintain the set of CUs describing
+ // a current address range.
+ std::sort(Endpoints.begin(), Endpoints.end());
+ uint64_t PrevAddress = -1ULL;
+ for (const auto &E : Endpoints) {
+ if (PrevAddress < E.Address && ValidCUs.size() > 0) {
+ // If the address range between two endpoints is described by some
+ // CU, first try to extend the last range in Aranges. If we can't
+ // do it, start a new range.
+ if (!Aranges.empty() && Aranges.back().HighPC() == PrevAddress &&
+ ValidCUs.find(Aranges.back().CUOffset) != ValidCUs.end()) {
+ Aranges.back().setHighPC(E.Address);
+ } else {
+ Aranges.emplace_back(PrevAddress, E.Address, *ValidCUs.begin());
+ }
+ }
+ // Update the set of valid CUs.
+ if (E.IsRangeStart) {
+ ValidCUs.insert(E.CUOffset);
} else {
- // Only increment j if we aren't merging
- minimal_aranges[++j] = Aranges[i];
+ auto CUPos = ValidCUs.find(E.CUOffset);
+ assert(CUPos != ValidCUs.end());
+ ValidCUs.erase(CUPos);
}
+ PrevAddress = E.Address;
}
- assert(j+1 == minimal_size);
+ assert(ValidCUs.empty());
- // Now swap our new minimal aranges into place. The local
- // minimal_aranges will then contian the old big collection
- // which will get freed.
- minimal_aranges.swap(Aranges);
+ // Endpoints are not needed now.
+ std::vector<RangeEndpoint> EmptyEndpoints;
+ EmptyEndpoints.swap(Endpoints);
}
uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
index de96d7f..a9f37fe 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/lib/DebugInfo/DWARFDebugAranges.h
@@ -27,9 +27,9 @@ private:
void clear();
void extract(DataExtractor DebugArangesData);
- // Use appendRange multiple times and then call sortAndMinimize.
+ // Call appendRange multiple times and then call construct.
void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC);
- void sortAndMinimize();
+ void construct();
struct Range {
explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL,
@@ -47,31 +47,39 @@ private:
return LowPC + Length;
return -1ULL;
}
+
bool containsAddress(uint64_t Address) const {
return LowPC <= Address && Address < HighPC();
}
-
- bool operator <(const Range &other) const {
+ bool operator<(const Range &other) const {
return LowPC < other.LowPC;
}
- static bool SortedOverlapCheck(const Range &Left, const Range &Right) {
- if (Left.CUOffset != Right.CUOffset)
- return false;
- return Left.HighPC() >= Right.LowPC;
- }
-
uint64_t LowPC; // Start of address range.
uint32_t Length; // End of address range (not including this address).
uint32_t CUOffset; // Offset of the compile unit or die.
};
+ struct RangeEndpoint {
+ uint64_t Address;
+ uint32_t CUOffset;
+ bool IsRangeStart;
+
+ RangeEndpoint(uint64_t Address, uint32_t CUOffset, bool IsRangeStart)
+ : Address(Address), CUOffset(CUOffset), IsRangeStart(IsRangeStart) {}
+
+ bool operator<(const RangeEndpoint &Other) const {
+ return Address < Other.Address;
+ }
+ };
+
typedef std::vector<Range> RangeColl;
typedef RangeColl::const_iterator RangeCollIterator;
- typedef DenseSet<uint32_t> ParsedCUOffsetColl;
+ std::vector<RangeEndpoint> Endpoints;
RangeColl Aranges;
- ParsedCUOffsetColl ParsedCUOffsets;
+ DenseSet<uint32_t> ParsedCUOffsets;
};
}
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index b811ed7..2e7a54a 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -210,6 +210,16 @@ uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
return Result.hasValue() ? Result.getValue() : FailValue;
}
+uint64_t
+DWARFDebugInfoEntryMinimal::getRangesBaseAttribute(const DWARFUnit *U,
+ uint64_t FailValue) const {
+ uint64_t Result =
+ getAttributeValueAsSectionOffset(U, DW_AT_ranges_base, -1ULL);
+ if (Result != -1ULL)
+ return Result;
+ return getAttributeValueAsSectionOffset(U, DW_AT_GNU_ranges_base, FailValue);
+}
+
bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
uint64_t &LowPC,
uint64_t &HighPC) const {
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index 916e1ed..cc58eb6 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -106,6 +106,8 @@ public:
const uint16_t Attr,
uint64_t FailValue) const;
+ uint64_t getRangesBaseAttribute(const DWARFUnit *U, uint64_t FailValue) const;
+
/// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
/// Returns true if both attributes are present.
bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
index f5f5072..39d0a0f 100644
--- a/lib/DebugInfo/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARFUnit.cpp
@@ -226,7 +226,9 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
this, DW_AT_GNU_addr_base, 0);
RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
- this, DW_AT_GNU_ranges_base, 0);
+ this, DW_AT_ranges_base, 0);
+ // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for the
+ // skeleton CU DIE, so that DWARF consumers unaware of it are not broken.
}
setDIERelations();
@@ -272,7 +274,8 @@ bool DWARFUnit::parseDWO() {
}
// Share .debug_addr and .debug_ranges section with compile unit in .dwo
DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
- DWOCU->setRangesSection(RangeSection, RangeSectionBase);
+ uint32_t DWORangesBase = DieArray[0].getRangesBaseAttribute(this, 0);
+ DWOCU->setRangesSection(RangeSection, DWORangesBase);
return true;
}
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 6766ef1..b0e985d 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -148,8 +148,7 @@ Function *ExecutionEngine::FindFunctionNamed(const char *FnName) {
}
-void *ExecutionEngineState::RemoveMapping(const MutexGuard &,
- const GlobalValue *ToUnmap) {
+void *ExecutionEngineState::RemoveMapping(const GlobalValue *ToUnmap) {
GlobalAddressMapTy::iterator I = GlobalAddressMap.find(ToUnmap);
void *OldVal;
@@ -171,14 +170,14 @@ void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) {
DEBUG(dbgs() << "JIT: Map \'" << GV->getName()
<< "\' to [" << Addr << "]\n";);
- void *&CurVal = EEState.getGlobalAddressMap(locked)[GV];
+ void *&CurVal = EEState.getGlobalAddressMap()[GV];
assert((!CurVal || !Addr) && "GlobalMapping already established!");
CurVal = Addr;
// If we are using the reverse mapping, add it too.
- if (!EEState.getGlobalAddressReverseMap(locked).empty()) {
+ if (!EEState.getGlobalAddressReverseMap().empty()) {
AssertingVH<const GlobalValue> &V =
- EEState.getGlobalAddressReverseMap(locked)[Addr];
+ EEState.getGlobalAddressReverseMap()[Addr];
assert((!V || !GV) && "GlobalMapping already established!");
V = GV;
}
@@ -187,41 +186,41 @@ void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) {
void ExecutionEngine::clearAllGlobalMappings() {
MutexGuard locked(lock);
- EEState.getGlobalAddressMap(locked).clear();
- EEState.getGlobalAddressReverseMap(locked).clear();
+ EEState.getGlobalAddressMap().clear();
+ EEState.getGlobalAddressReverseMap().clear();
}
void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) {
MutexGuard locked(lock);
for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI)
- EEState.RemoveMapping(locked, FI);
+ EEState.RemoveMapping(FI);
for (Module::global_iterator GI = M->global_begin(), GE = M->global_end();
GI != GE; ++GI)
- EEState.RemoveMapping(locked, GI);
+ EEState.RemoveMapping(GI);
}
void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) {
MutexGuard locked(lock);
ExecutionEngineState::GlobalAddressMapTy &Map =
- EEState.getGlobalAddressMap(locked);
+ EEState.getGlobalAddressMap();
// Deleting from the mapping?
if (!Addr)
- return EEState.RemoveMapping(locked, GV);
+ return EEState.RemoveMapping(GV);
void *&CurVal = Map[GV];
void *OldVal = CurVal;
- if (CurVal && !EEState.getGlobalAddressReverseMap(locked).empty())
- EEState.getGlobalAddressReverseMap(locked).erase(CurVal);
+ if (CurVal && !EEState.getGlobalAddressReverseMap().empty())
+ EEState.getGlobalAddressReverseMap().erase(CurVal);
CurVal = Addr;
// If we are using the reverse mapping, add it too.
- if (!EEState.getGlobalAddressReverseMap(locked).empty()) {
+ if (!EEState.getGlobalAddressReverseMap().empty()) {
AssertingVH<const GlobalValue> &V =
- EEState.getGlobalAddressReverseMap(locked)[Addr];
+ EEState.getGlobalAddressReverseMap()[Addr];
assert((!V || !GV) && "GlobalMapping already established!");
V = GV;
}
@@ -232,25 +231,25 @@ void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) {
MutexGuard locked(lock);
ExecutionEngineState::GlobalAddressMapTy::iterator I =
- EEState.getGlobalAddressMap(locked).find(GV);
- return I != EEState.getGlobalAddressMap(locked).end() ? I->second : nullptr;
+ EEState.getGlobalAddressMap().find(GV);
+ return I != EEState.getGlobalAddressMap().end() ? I->second : nullptr;
}
const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) {
MutexGuard locked(lock);
// If we haven't computed the reverse mapping yet, do so first.
- if (EEState.getGlobalAddressReverseMap(locked).empty()) {
+ if (EEState.getGlobalAddressReverseMap().empty()) {
for (ExecutionEngineState::GlobalAddressMapTy::iterator
- I = EEState.getGlobalAddressMap(locked).begin(),
- E = EEState.getGlobalAddressMap(locked).end(); I != E; ++I)
- EEState.getGlobalAddressReverseMap(locked).insert(std::make_pair(
+ I = EEState.getGlobalAddressMap().begin(),
+ E = EEState.getGlobalAddressMap().end(); I != E; ++I)
+ EEState.getGlobalAddressReverseMap().insert(std::make_pair(
I->second, I->first));
}
std::map<void *, AssertingVH<const GlobalValue> >::iterator I =
- EEState.getGlobalAddressReverseMap(locked).find(Addr);
- return I != EEState.getGlobalAddressReverseMap(locked).end() ? I->second : nullptr;
+ EEState.getGlobalAddressReverseMap().find(Addr);
+ return I != EEState.getGlobalAddressReverseMap().end() ? I->second : nullptr;
}
namespace {
@@ -412,13 +411,14 @@ ExecutionEngine *ExecutionEngine::create(Module *M,
std::string *ErrorStr,
CodeGenOpt::Level OptLevel,
bool GVsWithCode) {
- EngineBuilder EB = EngineBuilder(M)
- .setEngineKind(ForceInterpreter
- ? EngineKind::Interpreter
- : EngineKind::JIT)
- .setErrorStr(ErrorStr)
- .setOptLevel(OptLevel)
- .setAllocateGVsWithCode(GVsWithCode);
+
+ EngineBuilder EB =
+ EngineBuilder(M)
+ .setEngineKind(ForceInterpreter ? EngineKind::Interpreter
+ : EngineKind::Either)
+ .setErrorStr(ErrorStr)
+ .setOptLevel(OptLevel)
+ .setAllocateGVsWithCode(GVsWithCode);
return EB.create();
}
@@ -457,6 +457,27 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M,
return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM);
}
+void EngineBuilder::InitEngine() {
+ WhichEngine = EngineKind::Either;
+ ErrorStr = nullptr;
+ OptLevel = CodeGenOpt::Default;
+ MCJMM = nullptr;
+ JMM = nullptr;
+ Options = TargetOptions();
+ AllocateGVsWithCode = false;
+ RelocModel = Reloc::Default;
+ CMModel = CodeModel::JITDefault;
+ UseMCJIT = false;
+
+// IR module verification is enabled by default in debug builds, and disabled
+// by default in release builds.
+#ifndef NDEBUG
+ VerifyModules = true;
+#else
+ VerifyModules = false;
+#endif
+}
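+// A minimal usage sketch of the builder (hedged; assumes an existing
+// Module *M and uses only the setters exercised elsewhere in this patch):
+//   std::string Err;
+//   ExecutionEngine *EE = EngineBuilder(M)
+//                             .setEngineKind(EngineKind::JIT)
+//                             .setErrorStr(&Err)
+//                             .setOptLevel(CodeGenOpt::Default)
+//                             .create();
+//   if (!EE)
+//     errs() << "engine creation failed: " << Err << "\n";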
+
ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
std::unique_ptr<TargetMachine> TheTM(TM); // Take ownership.
@@ -536,7 +557,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
return getPointerToFunction(F);
MutexGuard locked(lock);
- if (void *P = EEState.getGlobalAddressMap(locked)[GV])
+ if (void *P = EEState.getGlobalAddressMap()[GV])
return P;
// Global variable might have been added since interpreter started.
@@ -546,7 +567,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
else
llvm_unreachable("Global hasn't had an address allocated yet!");
- return EEState.getGlobalAddressMap(locked)[GV];
+ return EEState.getGlobalAddressMap()[GV];
}
/// \brief Converts a Constant* into a GenericValue, including handling of
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 9a65fa0..4e22a8b 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -86,7 +86,7 @@ static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
LineNumberInfo Result;
Result.Offset = Address - StartAddress;
- Result.LineNumber = Line.getLine();
+ Result.LineNumber = Line.Line;
return Result;
}
@@ -233,7 +233,7 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
FunctionMessage.line_number_size = 0;
FunctionMessage.line_number_table = 0;
} else {
- SourceFileName = Lines.front().second.getFileName();
+ SourceFileName = Lines.front().second.FileName;
FunctionMessage.source_file_name = const_cast<char *>(SourceFileName.c_str());
FunctionMessage.line_number_size = LineInfo.size();
FunctionMessage.line_number_table = &*LineInfo.begin();
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
index c589457..814efcc 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.cpp
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
@@ -34,7 +34,7 @@ extern "C" void LLVMLinkInInterpreter() { }
///
ExecutionEngine *Interpreter::create(Module *M, std::string* ErrStr) {
// Tell this Module to materialize everything and release the GVMaterializer.
- if (error_code EC = M->materializeAllPermanently()) {
+ if (std::error_code EC = M->materializeAllPermanently()) {
if (ErrStr)
*ErrStr = EC.message();
// We got an error, just return 0
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index f8b2827..83ec978 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -151,7 +151,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
// Add target data
MutexGuard locked(lock);
- FunctionPassManager &PM = jitstate->getPM(locked);
+ FunctionPassManager &PM = jitstate->getPM();
M->setDataLayout(TM.getDataLayout());
PM.add(new DataLayoutPass(M));
@@ -184,7 +184,7 @@ void JIT::addModule(Module *M) {
jitstate = new JITState(M);
- FunctionPassManager &PM = jitstate->getPM(locked);
+ FunctionPassManager &PM = jitstate->getPM();
M->setDataLayout(TM.getDataLayout());
PM.add(new DataLayoutPass(M));
@@ -216,7 +216,7 @@ bool JIT::removeModule(Module *M) {
if (!jitstate && !Modules.empty()) {
jitstate = new JITState(Modules[0]);
- FunctionPassManager &PM = jitstate->getPM(locked);
+ FunctionPassManager &PM = jitstate->getPM();
M->setDataLayout(TM.getDataLayout());
PM.add(new DataLayoutPass(M));
@@ -460,41 +460,41 @@ void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) {
if (MCI)
RegisterJITEventListener(&MCIL);
- runJITOnFunctionUnlocked(F, locked);
+ runJITOnFunctionUnlocked(F);
if (MCI)
UnregisterJITEventListener(&MCIL);
}
-void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) {
+void JIT::runJITOnFunctionUnlocked(Function *F) {
assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!");
- jitTheFunction(F, locked);
+ jitTheFunctionUnlocked(F);
// If the function referred to another function that had not yet been
// read from bitcode, and we are jitting non-lazily, emit it now.
- while (!jitstate->getPendingFunctions(locked).empty()) {
- Function *PF = jitstate->getPendingFunctions(locked).back();
- jitstate->getPendingFunctions(locked).pop_back();
+ while (!jitstate->getPendingFunctions().empty()) {
+ Function *PF = jitstate->getPendingFunctions().back();
+ jitstate->getPendingFunctions().pop_back();
assert(!PF->hasAvailableExternallyLinkage() &&
"Externally-defined function should not be in pending list.");
- jitTheFunction(PF, locked);
+ jitTheFunctionUnlocked(PF);
// Now that the function has been jitted, ask the JITEmitter to rewrite
// the stub with real address of the function.
- updateFunctionStub(PF);
+ updateFunctionStubUnlocked(PF);
}
}
-void JIT::jitTheFunction(Function *F, const MutexGuard &locked) {
+void JIT::jitTheFunctionUnlocked(Function *F) {
isAlreadyCodeGenerating = true;
- jitstate->getPM(locked).run(*F);
+ jitstate->getPM().run(*F);
isAlreadyCodeGenerating = false;
// clear basic block addresses after this function is done
- getBasicBlockAddressMap(locked).clear();
+ getBasicBlockAddressMap().clear();
}
/// getPointerToFunction - This method is used to get the address of the
@@ -526,7 +526,7 @@ void *JIT::getPointerToFunction(Function *F) {
return Addr;
}
- runJITOnFunctionUnlocked(F, locked);
+ runJITOnFunctionUnlocked(F);
void *Addr = getPointerToGlobalIfAvailable(F);
assert(Addr && "Code generation didn't add function to GlobalAddress table!");
@@ -537,9 +537,9 @@ void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) {
MutexGuard locked(lock);
BasicBlockAddressMapTy::iterator I =
- getBasicBlockAddressMap(locked).find(BB);
- if (I == getBasicBlockAddressMap(locked).end()) {
- getBasicBlockAddressMap(locked)[BB] = Addr;
+ getBasicBlockAddressMap().find(BB);
+ if (I == getBasicBlockAddressMap().end()) {
+ getBasicBlockAddressMap()[BB] = Addr;
} else {
// ignore repeats: some BBs can be split into few MBBs?
}
@@ -547,7 +547,7 @@ void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) {
void JIT::clearPointerToBasicBlock(const BasicBlock *BB) {
MutexGuard locked(lock);
- getBasicBlockAddressMap(locked).erase(BB);
+ getBasicBlockAddressMap().erase(BB);
}
void *JIT::getPointerToBasicBlock(BasicBlock *BB) {
@@ -558,8 +558,8 @@ void *JIT::getPointerToBasicBlock(BasicBlock *BB) {
MutexGuard locked(lock);
BasicBlockAddressMapTy::iterator I =
- getBasicBlockAddressMap(locked).find(BB);
- if (I != getBasicBlockAddressMap(locked).end()) {
+ getBasicBlockAddressMap().find(BB);
+ if (I != getBasicBlockAddressMap().end()) {
return I->second;
} else {
llvm_unreachable("JIT does not have BB address for address-of-label, was"
@@ -688,7 +688,7 @@ char* JIT::getMemoryForGV(const GlobalVariable* GV) {
void JIT::addPendingFunction(Function *F) {
MutexGuard locked(lock);
- jitstate->getPendingFunctions(locked).push_back(F);
+ jitstate->getPendingFunctions().push_back(F);
}
diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
index d2bd508..69a7c36 100644
--- a/lib/ExecutionEngine/JIT/JIT.h
+++ b/lib/ExecutionEngine/JIT/JIT.h
@@ -39,12 +39,12 @@ private:
public:
explicit JITState(Module *M) : PM(M), M(M) {}
- FunctionPassManager &getPM(const MutexGuard &L) {
+ FunctionPassManager &getPM() {
return PM;
}
Module *getModule() const { return M; }
- std::vector<AssertingVH<Function> > &getPendingFunctions(const MutexGuard &L){
+ std::vector<AssertingVH<Function> > &getPendingFunctions() {
return PendingFunctions;
}
};
@@ -205,7 +205,7 @@ public:
void NotifyFreeingMachineCode(void *OldPtr);
BasicBlockAddressMapTy &
- getBasicBlockAddressMap(const MutexGuard &) {
+ getBasicBlockAddressMap() {
return BasicBlockAddressMap;
}
@@ -213,9 +213,9 @@ public:
private:
static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM,
TargetMachine &tm);
- void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked);
- void updateFunctionStub(Function *F);
- void jitTheFunction(Function *F, const MutexGuard &locked);
+ void runJITOnFunctionUnlocked(Function *F);
+ void updateFunctionStubUnlocked(Function *F);
+ void jitTheFunctionUnlocked(Function *F);
protected:
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index cd7a500..50b8c10 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -32,6 +32,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/Debug.h"
@@ -120,21 +121,16 @@ namespace {
#endif
}
- FunctionToLazyStubMapTy& getFunctionToLazyStubMap(
- const MutexGuard& locked) {
- assert(locked.holds(TheJIT->lock));
+ FunctionToLazyStubMapTy& getFunctionToLazyStubMap() {
return FunctionToLazyStubMap;
}
- GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& lck) {
- assert(lck.holds(TheJIT->lock));
+ GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap() {
return GlobalToIndirectSymMap;
}
std::pair<void *, Function *> LookupFunctionFromCallSite(
- const MutexGuard &locked, void *CallSite) const {
- assert(locked.holds(TheJIT->lock));
-
+ void *CallSite) const {
// The address given to us for the stub may not be exactly right, it
// might be a little bit after the stub. As such, use upper_bound to
// find it.
@@ -146,9 +142,7 @@ namespace {
return *I;
}
- void AddCallSite(const MutexGuard &locked, void *CallSite, Function *F) {
- assert(locked.holds(TheJIT->lock));
-
+ void AddCallSite(void *CallSite, Function *F) {
bool Inserted = CallSiteToFunctionMap.insert(
std::make_pair(CallSite, F)).second;
(void)Inserted;
@@ -503,7 +497,7 @@ void *JITResolver::getLazyFunctionStubIfAvailable(Function *F) {
MutexGuard locked(TheJIT->lock);
// If we already have a stub for this function, recycle it.
- return state.getFunctionToLazyStubMap(locked).lookup(F);
+ return state.getFunctionToLazyStubMap().lookup(F);
}
/// getFunctionStub - This returns a pointer to a function stub, creating
@@ -512,7 +506,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) {
MutexGuard locked(TheJIT->lock);
// If we already have a lazy stub for this function, recycle it.
- void *&Stub = state.getFunctionToLazyStubMap(locked)[F];
+ void *&Stub = state.getFunctionToLazyStubMap()[F];
if (Stub) return Stub;
// Call the lazy resolver function if we are JIT'ing lazily. Otherwise we
@@ -554,7 +548,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) {
// Finally, keep track of the stub-to-Function mapping so that the
// JITCompilerFn knows which function to compile!
- state.AddCallSite(locked, Stub, F);
+ state.AddCallSite(Stub, F);
} else if (!Actual) {
// If we are JIT'ing non-lazily but need to call a function that does not
// exist yet, add it to the JIT's work list so that we can fill in the
@@ -573,7 +567,7 @@ void *JITResolver::getGlobalValueIndirectSym(GlobalValue *GV, void *GVAddress) {
MutexGuard locked(TheJIT->lock);
// If we already have a stub for this global variable, recycle it.
- void *&IndirectSym = state.getGlobalToIndirectSymMap(locked)[GV];
+ void *&IndirectSym = state.getGlobalToIndirectSymMap()[GV];
if (IndirectSym) return IndirectSym;
// Otherwise, codegen a new indirect symbol.
@@ -633,7 +627,7 @@ void *JITResolver::JITCompilerFn(void *Stub) {
// The address given to us for the stub may not be exactly right, it might
// be a little bit after the stub. As such, use upper_bound to find it.
std::pair<void*, Function*> I =
- JR->state.LookupFunctionFromCallSite(locked, Stub);
+ JR->state.LookupFunctionFromCallSite(Stub);
F = I.second;
ActualPtr = I.first;
}
@@ -684,13 +678,23 @@ void *JITResolver::JITCompilerFn(void *Stub) {
//===----------------------------------------------------------------------===//
// JITEmitter code.
//
+
+static GlobalObject *getSimpleAliasee(Constant *C) {
+ C = C->stripPointerCasts();
+ return dyn_cast<GlobalObject>(C);
+}
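+// For instance, an alias whose aliasee is a bitcast of a function resolves to
+// that function via stripPointerCasts(), while an aliasee built from
+// non-trivial constant arithmetic is not a GlobalObject and yields null.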
+
void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference,
bool MayNeedFarStub) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
return TheJIT->getOrEmitGlobalVariable(GV);
- if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
- return TheJIT->getPointerToGlobal(GA->getAliasee());
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ // We can only handle simple cases.
+ if (GlobalValue *GV = getSimpleAliasee(GA->getAliasee()))
+ return TheJIT->getPointerToGlobal(GV);
+ return nullptr;
+ }
// If we have already compiled the function, return a pointer to its body.
Function *F = cast<Function>(V);
@@ -1225,7 +1229,7 @@ void *JIT::getPointerToFunctionOrStub(Function *F) {
return JE->getJITResolver().getLazyFunctionStub(F);
}
-void JIT::updateFunctionStub(Function *F) {
+void JIT::updateFunctionStubUnlocked(Function *F) {
// Get the empty stub we generated earlier.
JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
void *Stub = JE->getJITResolver().getLazyFunctionStub(F);
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 42cb4ea..e9ba96a 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -305,9 +305,13 @@ uint64_t MCJIT::getSymbolAddress(const std::string &Name,
// Look for our symbols in each Archive
object::Archive::child_iterator ChildIt = A->findSym(Name);
if (ChildIt != A->child_end()) {
- std::unique_ptr<object::Binary> ChildBin;
// FIXME: Support nested archives?
- if (!ChildIt->getAsBinary(ChildBin) && ChildBin->isObject()) {
+ ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr =
+ ChildIt->getAsBinary();
+ if (ChildBinOrErr.getError())
+ continue;
+ std::unique_ptr<object::Binary> ChildBin = std::move(ChildBinOrErr.get());
+ if (ChildBin->isObject()) {
std::unique_ptr<object::ObjectFile> OF(
static_cast<object::ObjectFile *>(ChildBin.release()));
// This causes the object file to be loaded.
diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
index 9ceaa90..5986084 100644
--- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
+++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -71,7 +71,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup,
//
// FIXME: Initialize the Near member for each memory group to avoid
// interleaving.
- error_code ec;
+ std::error_code ec;
sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(RequiredSize,
&MemGroup.Near,
sys::Memory::MF_READ |
@@ -105,7 +105,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup,
bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
{
// FIXME: Should in-progress permissions be reverted if an error occurs?
- error_code ec;
+ std::error_code ec;
// Don't allow free memory blocks to be used after setting protection flags.
CodeMem.FreeMem.clear();
@@ -143,19 +143,20 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
return false;
}
-error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup,
- unsigned Permissions) {
+std::error_code
+SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup,
+ unsigned Permissions) {
for (int i = 0, e = MemGroup.AllocatedMem.size(); i != e; ++i) {
- error_code ec;
- ec = sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i],
- Permissions);
- if (ec) {
- return ec;
- }
+ std::error_code ec;
+ ec =
+ sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i], Permissions);
+ if (ec) {
+ return ec;
+ }
}
- return error_code::success();
+ return std::error_code();
}
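+// Note (a standard-C++ aside, not specific to this file): a default-
+// constructed std::error_code compares false, which is why the old
+// error_code::success() call becomes a plain std::error_code() above:
+//   std::error_code ec; // "no error"
+//   assert(!ec);        // holds for the default-constructed value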
void SectionMemoryManager::invalidateInstructionCache() {
diff --git a/lib/ExecutionEngine/RuntimeDyld/Android.mk b/lib/ExecutionEngine/RuntimeDyld/Android.mk
index e98e80a..eb2e438 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Android.mk
+++ b/lib/ExecutionEngine/RuntimeDyld/Android.mk
@@ -7,6 +7,7 @@ include $(CLEAR_VARS)
LOCAL_SRC_FILES := \
GDBRegistrar.cpp \
RuntimeDyld.cpp \
+ RuntimeDyldChecker.cpp \
RuntimeDyldELF.cpp \
RuntimeDyldMachO.cpp
diff --git a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
index cbf7cf1..eb1a60b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
+++ b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMRuntimeDyld
GDBRegistrar.cpp
RuntimeDyld.cpp
+ RuntimeDyldChecker.cpp
RuntimeDyldELF.cpp
RuntimeDyldMachO.cpp
)
diff --git a/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt b/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt
index 97dc861..8bd5621 100644
--- a/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt
+++ b/lib/ExecutionEngine/RuntimeDyld/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = RuntimeDyld
parent = ExecutionEngine
-required_libraries = Object Support
+required_libraries = MC Object Support
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index 4917b93..c3a2182 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -48,7 +48,8 @@ public:
{
// FIXME: error checking? createObjectFile returns an ErrorOr<ObjectFile*>
// and should probably be checked for failure.
- ObjFile.reset(object::ObjectFile::createObjectFile(Buffer->getMemBuffer()).get());
+ std::unique_ptr<MemoryBuffer> Buf(Buffer->getMemBuffer());
+ ObjFile.reset(object::ObjectFile::createObjectFile(Buf).get());
}
ObjectImageCommon(std::unique_ptr<object::ObjectFile> Input)
: ObjectImage(nullptr), ObjFile(std::move(Input)) {}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index c1eb0fd..9dfd167 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -73,9 +73,9 @@ void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
llvm_unreachable("Attempting to remap address of unknown section!");
}
-static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
+static std::error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
uint64_t Address;
- if (error_code EC = Sym.getAddress(Address))
+ if (std::error_code EC = Sym.getAddress(Address))
return EC;
if (Address == UnknownAddressOrSize) {
@@ -85,7 +85,7 @@ static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
const ObjectFile *Obj = Sym.getObject();
section_iterator SecI(Obj->section_begin());
- if (error_code EC = Sym.getSection(SecI))
+ if (std::error_code EC = Sym.getSection(SecI))
return EC;
if (SecI == Obj->section_end()) {
@@ -94,7 +94,7 @@ static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) {
}
uint64_t SectionAddress;
- if (error_code EC = SecI->getAddress(SectionAddress))
+ if (std::error_code EC = SecI->getAddress(SectionAddress))
return EC;
Result = Address - SectionAddress;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
new file mode 100644
index 0000000..190bbbf
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -0,0 +1,641 @@
+//===--- RuntimeDyldChecker.cpp - RuntimeDyld tester framework --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/StringRefMemoryObject.h"
+#include "RuntimeDyldImpl.h"
+#include <cctype>
+#include <memory>
+
+#define DEBUG_TYPE "rtdyld"
+
+using namespace llvm;
+
+namespace llvm {
+
+ // Helper class that implements the language evaluated by RuntimeDyldChecker.
+ class RuntimeDyldCheckerExprEval {
+ public:
+
+ RuntimeDyldCheckerExprEval(const RuntimeDyldChecker &Checker,
+ llvm::raw_ostream &ErrStream)
+ : Checker(Checker), ErrStream(ErrStream) {}
+
+ bool evaluate(StringRef Expr) const {
+ // Expect equality expression of the form 'LHS = RHS'.
+ Expr = Expr.trim();
+ size_t EQIdx = Expr.find('=');
+
+ // Evaluate LHS.
+ StringRef LHSExpr = Expr.substr(0, EQIdx).rtrim();
+ StringRef RemainingExpr;
+ EvalResult LHSResult;
+ std::tie(LHSResult, RemainingExpr) =
+ evalComplexExpr(evalSimpleExpr(LHSExpr));
+ if (LHSResult.hasError())
+ return handleError(Expr, LHSResult);
+ if (RemainingExpr != "")
+ return handleError(Expr, unexpectedToken(RemainingExpr, LHSExpr, ""));
+
+ // Evaluate RHS.
+ StringRef RHSExpr = Expr.substr(EQIdx + 1).ltrim();
+ EvalResult RHSResult;
+ std::tie(RHSResult, RemainingExpr) =
+ evalComplexExpr(evalSimpleExpr(RHSExpr));
+ if (RHSResult.hasError())
+ return handleError(Expr, RHSResult);
+ if (RemainingExpr != "")
+ return handleError(Expr, unexpectedToken(RemainingExpr, RHSExpr, ""));
+
+ if (LHSResult.getValue() != RHSResult.getValue()) {
+ ErrStream << "Expression '" << Expr << "' is false: "
+ << format("0x%lx", LHSResult.getValue()) << " != "
+ << format("0x%lx", RHSResult.getValue()) << "\n";
+ return false;
+ }
+ return true;
+ }
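+
+ // Example check expressions accepted by this evaluator (hypothetical
+ // symbol names; a hedged sketch of the grammar implemented below):
+ //   foo = 0x4000
+ //   decode_operand(foo, 0) = 0x4
+ //   next_pc(foo) = foo + 4
+ //   *{8}(foo + 0x10) = foo[31:0]
+ //   (foo + 4) >> 2 = 0x1001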
+
+ private:
+ const RuntimeDyldChecker &Checker;
+ llvm::raw_ostream &ErrStream;
+
+ enum class BinOpToken : unsigned { Invalid, Add, Sub, BitwiseAnd,
+ BitwiseOr, ShiftLeft, ShiftRight };
+
+ class EvalResult {
+ public:
+ EvalResult()
+ : Value(0), ErrorMsg("") {}
+ EvalResult(uint64_t Value)
+ : Value(Value), ErrorMsg("") {}
+ EvalResult(std::string ErrorMsg)
+ : Value(0), ErrorMsg(ErrorMsg) {}
+ uint64_t getValue() const { return Value; }
+ bool hasError() const { return ErrorMsg != ""; }
+ const std::string& getErrorMsg() const { return ErrorMsg; }
+ private:
+ uint64_t Value;
+ std::string ErrorMsg;
+ };
+
+ StringRef getTokenForError(StringRef Expr) const {
+ if (Expr.empty())
+ return "";
+
+ StringRef Token, Remaining;
+ if (isalpha(Expr[0]))
+ std::tie(Token, Remaining) = parseSymbol(Expr);
+ else if (isdigit(Expr[0]))
+ std::tie(Token, Remaining) = parseNumberString(Expr);
+ else {
+ unsigned TokLen = 1;
+ if (Expr.startswith("<<") || Expr.startswith(">>"))
+ TokLen = 2;
+ Token = Expr.substr(0, TokLen);
+ }
+ return Token;
+ }
+
+ EvalResult unexpectedToken(StringRef TokenStart,
+ StringRef SubExpr,
+ StringRef ErrText) const {
+ std::string ErrorMsg("Encountered unexpected token '");
+ ErrorMsg += getTokenForError(TokenStart);
+ if (SubExpr != "") {
+ ErrorMsg += "' while parsing subexpression '";
+ ErrorMsg += SubExpr;
+ }
+ ErrorMsg += "'";
+ if (ErrText != "") {
+ ErrorMsg += " ";
+ ErrorMsg += ErrText;
+ }
+ return EvalResult(std::move(ErrorMsg));
+ }
+
+ bool handleError(StringRef Expr, const EvalResult &R) const {
+ assert(R.hasError() && "Not an error result.");
+ ErrStream << "Error evaluating expression '" << Expr << "': "
+ << R.getErrorMsg() << "\n";
+ return false;
+ }
+
+ std::pair<BinOpToken, StringRef> parseBinOpToken(StringRef Expr) const {
+ if (Expr.empty())
+ return std::make_pair(BinOpToken::Invalid, "");
+
+ // Handle the two 2-character tokens.
+ if (Expr.startswith("<<"))
+ return std::make_pair(BinOpToken::ShiftLeft,
+ Expr.substr(2).ltrim());
+ if (Expr.startswith(">>"))
+ return std::make_pair(BinOpToken::ShiftRight,
+ Expr.substr(2).ltrim());
+
+ // Handle one-character tokens.
+ BinOpToken Op;
+ switch (Expr[0]) {
+ default: return std::make_pair(BinOpToken::Invalid, Expr);
+ case '+': Op = BinOpToken::Add; break;
+ case '-': Op = BinOpToken::Sub; break;
+ case '&': Op = BinOpToken::BitwiseAnd; break;
+ case '|': Op = BinOpToken::BitwiseOr; break;
+ }
+
+ return std::make_pair(Op, Expr.substr(1).ltrim());
+ }
+
+ EvalResult computeBinOpResult(BinOpToken Op, const EvalResult &LHSResult,
+ const EvalResult &RHSResult) const {
+ switch (Op) {
+ default: llvm_unreachable("Tried to evaluate unrecognized operation.");
+ case BinOpToken::Add:
+ return EvalResult(LHSResult.getValue() + RHSResult.getValue());
+ case BinOpToken::Sub:
+ return EvalResult(LHSResult.getValue() - RHSResult.getValue());
+ case BinOpToken::BitwiseAnd:
+ return EvalResult(LHSResult.getValue() & RHSResult.getValue());
+ case BinOpToken::BitwiseOr:
+ return EvalResult(LHSResult.getValue() | RHSResult.getValue());
+ case BinOpToken::ShiftLeft:
+ return EvalResult(LHSResult.getValue() << RHSResult.getValue());
+ case BinOpToken::ShiftRight:
+ return EvalResult(LHSResult.getValue() >> RHSResult.getValue());
+ }
+ }
+
+ // Parse a symbol and return a (string, string) pair representing the symbol
+ // name and expression remaining to be parsed.
+ std::pair<StringRef, StringRef> parseSymbol(StringRef Expr) const {
+ size_t FirstNonSymbol =
+ Expr.find_first_not_of("0123456789"
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ ":_");
+ return std::make_pair(Expr.substr(0, FirstNonSymbol),
+ Expr.substr(FirstNonSymbol).ltrim());
+ }
+
+ // Evaluate a call to decode_operand. Decode the instruction at the given
+ // symbol and get the value of the requested operand.
+ // Returns an error if the instruction cannot be decoded, or the requested
+ // operand is not an immediate.
+ // On success, returns a pair containing the value of the operand, plus
+ // the expression remaining to be evaluated.
+ std::pair<EvalResult, StringRef> evalDecodeOperand(StringRef Expr) const {
+ if (!Expr.startswith("("))
+ return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+ StringRef RemainingExpr = Expr.substr(1).ltrim();
+ StringRef Symbol;
+ std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+ if (!Checker.checkSymbolIsValidForLoad(Symbol))
+ return std::make_pair(EvalResult(("Cannot decode unknown symbol '" +
+ Symbol + "'").str()),
+ "");
+
+ if (!RemainingExpr.startswith(","))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected ','"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ EvalResult OpIdxExpr;
+ std::tie(OpIdxExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+ if (OpIdxExpr.hasError())
+ return std::make_pair(OpIdxExpr, "");
+
+ if (!RemainingExpr.startswith(")"))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected ')'"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ MCInst Inst;
+ uint64_t Size;
+ if (!decodeInst(Symbol, Inst, Size))
+ return std::make_pair(EvalResult(("Couldn't decode instruction at '" +
+ Symbol + "'").str()),
+ "");
+
+ unsigned OpIdx = OpIdxExpr.getValue();
+ if (OpIdx >= Inst.getNumOperands()) {
+ std::string ErrMsg;
+ raw_string_ostream ErrMsgStream(ErrMsg);
+ ErrMsgStream << "Invalid operand index '" << format("%i", OpIdx)
+ << " for instruction '" << Symbol
+ << ". Instruction has only "
+ << format("%i", Inst.getNumOperands()) << " operands.";
+ return std::make_pair(EvalResult(ErrMsgStream.str()), "");
+ }
+
+ const MCOperand &Op = Inst.getOperand(OpIdx);
+ if (!Op.isImm()) {
+ std::string ErrMsg;
+ raw_string_ostream ErrMsgStream(ErrMsg);
+ ErrMsgStream << "Operand '" << format("%i", OpIdx)
+ << "' of instruction '" << Symbol
+ << "' is not an immediate.\nInstruction is:\n ";
+ Inst.dump_pretty(ErrMsgStream,
+ Checker.Disassembler->getContext().getAsmInfo(),
+ Checker.InstPrinter);
+
+ return std::make_pair(EvalResult(ErrMsgStream.str()), "");
+ }
+
+ return std::make_pair(EvalResult(Op.getImm()), RemainingExpr);
+ }
+
+ // Evaluate a call to next_pc. Decode the instruction at the given
+ // symbol and return the following program counter.
+ // Returns an error if the instruction cannot be decoded.
+ // On success, returns a pair containing the next PC, plus the expression
+ // remaining to be evaluated.
+ std::pair<EvalResult, StringRef> evalNextPC(StringRef Expr) const {
+ if (!Expr.startswith("("))
+ return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+ StringRef RemainingExpr = Expr.substr(1).ltrim();
+ StringRef Symbol;
+ std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+ if (!Checker.checkSymbolIsValidForLoad(Symbol))
+ return std::make_pair(EvalResult(("Cannot decode unknown symbol '"
+ + Symbol + "'").str()),
+ "");
+
+ if (!RemainingExpr.startswith(")"))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected ')'"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ MCInst Inst;
+ uint64_t Size;
+ if (!decodeInst(Symbol, Inst, Size))
+ return std::make_pair(EvalResult(("Couldn't decode instruction at '" +
+ Symbol + "'").str()),
+ "");
+ uint64_t NextPC = Checker.getSymbolAddress(Symbol) + Size;
+
+ return std::make_pair(EvalResult(NextPC), RemainingExpr);
+ }
+
+ // Evaluate an identifier expr, which may be a symbol, or a call to
+ // one of the builtin functions: decode_operand or next_pc.
+ // Return the result, plus the expression remaining to be parsed.
+ std::pair<EvalResult, StringRef> evalIdentifierExpr(StringRef Expr) const {
+ StringRef Symbol;
+ StringRef RemainingExpr;
+ std::tie(Symbol, RemainingExpr) = parseSymbol(Expr);
+
+ // Check for builtin function calls.
+ if (Symbol == "decode_operand")
+ return evalDecodeOperand(RemainingExpr);
+ else if (Symbol == "next_pc")
+ return evalNextPC(RemainingExpr);
+
+ // Looks like a plain symbol reference.
+ return std::make_pair(EvalResult(Checker.getSymbolAddress(Symbol)),
+ RemainingExpr);
+ }
+
+ // Parse a number (hexadecimal or decimal) and return a (string, string)
+ // pair representing the number and the expression remaining to be parsed.
+ std::pair<StringRef, StringRef> parseNumberString(StringRef Expr) const {
+ size_t FirstNonDigit = StringRef::npos;
+ if (Expr.startswith("0x")) {
+ FirstNonDigit = Expr.find_first_not_of("0123456789abcdefABCDEF", 2);
+ if (FirstNonDigit == StringRef::npos)
+ FirstNonDigit = Expr.size();
+ } else {
+ FirstNonDigit = Expr.find_first_not_of("0123456789");
+ if (FirstNonDigit == StringRef::npos)
+ FirstNonDigit = Expr.size();
+ }
+ return std::make_pair(Expr.substr(0, FirstNonDigit),
+ Expr.substr(FirstNonDigit));
+ }
+
+ // Evaluate a constant numeric expression (hexadecimal or decimal) and
+ // return a pair containing the result, and the expression remaining to be
+ // evaluated.
+ std::pair<EvalResult, StringRef> evalNumberExpr(StringRef Expr) const {
+ StringRef ValueStr;
+ StringRef RemainingExpr;
+ std::tie(ValueStr, RemainingExpr) = parseNumberString(Expr);
+
+ if (ValueStr.empty() || !isdigit(ValueStr[0]))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected number"),
+ "");
+ uint64_t Value;
+ ValueStr.getAsInteger(0, Value);
+ return std::make_pair(EvalResult(Value), RemainingExpr);
+ }
+
+ // Evaluate an expression of the form "(<expr>)" and return a pair
+ // containing the result of evaluating <expr>, plus the expression
+ // remaining to be parsed.
+ std::pair<EvalResult, StringRef> evalParensExpr(StringRef Expr) const {
+ assert(Expr.startswith("(") && "Not a parenthesized expression");
+ EvalResult SubExprResult;
+ StringRef RemainingExpr;
+ std::tie(SubExprResult, RemainingExpr) =
+ evalComplexExpr(evalSimpleExpr(Expr.substr(1).ltrim()));
+ if (SubExprResult.hasError())
+ return std::make_pair(SubExprResult, "");
+ if (!RemainingExpr.startswith(")"))
+ return std::make_pair(unexpectedToken(RemainingExpr, Expr,
+ "expected ')'"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+ return std::make_pair(SubExprResult, RemainingExpr);
+ }
+
+ // Evaluate an expression in one of the following forms:
+ // *{<number>}<symbol>
+ // *{<number>}(<symbol> + <number>)
+ // *{<number>}(<symbol> - <number>)
+ // Return a pair containing the result, plus the expression remaining to be
+ // parsed.
+ std::pair<EvalResult, StringRef> evalLoadExpr(StringRef Expr) const {
+ assert(Expr.startswith("*") && "Not a load expression");
+ StringRef RemainingExpr = Expr.substr(1).ltrim();
+ // Parse read size.
+ if (!RemainingExpr.startswith("{"))
+ return std::make_pair(EvalResult("Expected '{' following '*'."), "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+ EvalResult ReadSizeExpr;
+ std::tie(ReadSizeExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+ if (ReadSizeExpr.hasError())
+ return std::make_pair(ReadSizeExpr, RemainingExpr);
+ uint64_t ReadSize = ReadSizeExpr.getValue();
+ if (ReadSize < 1 || ReadSize > 8)
+ return std::make_pair(EvalResult("Invalid size for dereference."), "");
+ if (!RemainingExpr.startswith("}"))
+ return std::make_pair(EvalResult("Missing '}' for dereference."), "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ // Check for '(symbol +/- constant)' form.
+ bool SymbolPlusConstant = false;
+ if (RemainingExpr.startswith("(")) {
+ SymbolPlusConstant = true;
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+ }
+
+ // Read symbol.
+ StringRef Symbol;
+ std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+ if (!Checker.checkSymbolIsValidForLoad(Symbol))
+ return std::make_pair(EvalResult(("Cannot dereference unknown symbol '"
+ + Symbol + "'").str()),
+ "");
+
+ // Set up default offset.
+ int64_t Offset = 0;
+
+ // Handle "+/- constant)" portion if necessary.
+ if (SymbolPlusConstant) {
+ char OpChar = RemainingExpr[0];
+ if (OpChar != '+' && OpChar != '-')
+ return std::make_pair(EvalResult("Invalid operator in load address."),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ EvalResult OffsetExpr;
+ std::tie(OffsetExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+
+ Offset = (OpChar == '+') ?
+ OffsetExpr.getValue() : -1 * OffsetExpr.getValue();
+
+ if (!RemainingExpr.startswith(")"))
+ return std::make_pair(EvalResult("Missing ')' in load address."),
+ "");
+
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+ }
+
+ return std::make_pair(
+ EvalResult(Checker.readMemoryAtSymbol(Symbol, Offset, ReadSize)),
+ RemainingExpr);
+ }
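+ // e.g. "*{4}(foo + 0x10)" reads four bytes at the address of the
+ // (hypothetical) symbol foo plus 16.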
+
+ // Evaluate a "simple" expression. This is any expression that _isn't_ an
+ // un-parenthesized binary expression.
+ //
+ // "Simple" expressions can be optionally bit-sliced. See evalSlicedExpr.
+ //
+ // Returns a pair containing the result of the evaluation, plus the
+ // expression remaining to be parsed.
+ std::pair<EvalResult, StringRef> evalSimpleExpr(StringRef Expr) const {
+ EvalResult SubExprResult;
+ StringRef RemainingExpr;
+
+ if (Expr.empty())
+ return std::make_pair(EvalResult("Unexpected end of expression"), "");
+
+ if (Expr[0] == '(')
+ std::tie(SubExprResult, RemainingExpr) = evalParensExpr(Expr);
+ else if (Expr[0] == '*')
+ std::tie(SubExprResult, RemainingExpr) = evalLoadExpr(Expr);
+ else if (isalpha(Expr[0]))
+ std::tie(SubExprResult, RemainingExpr) = evalIdentifierExpr(Expr);
+ else if (isdigit(Expr[0]))
+ std::tie(SubExprResult, RemainingExpr) = evalNumberExpr(Expr);
+
+ if (SubExprResult.hasError())
+ return std::make_pair(SubExprResult, RemainingExpr);
+
+ // Evaluate bit-slice if present.
+ if (RemainingExpr.startswith("["))
+ std::tie(SubExprResult, RemainingExpr) =
+ evalSliceExpr(std::make_pair(SubExprResult, RemainingExpr));
+
+ return std::make_pair(SubExprResult, RemainingExpr);
+ }
+
+ // Evaluate a bit-slice of an expression.
+ // A bit-slice has the form "<expr>[high:low]". The result of evaluating a
+ // slice is the bits between high and low (inclusive) in the original
+ // expression, right shifted so that the "low" bit is in position 0 in the
+ // result.
+ // Returns a pair containing the result of the slice operation, plus the
+ // expression remaining to be parsed.
+ std::pair<EvalResult, StringRef> evalSliceExpr(
+ std::pair<EvalResult, StringRef> Ctx) const{
+ EvalResult SubExprResult;
+ StringRef RemainingExpr;
+ std::tie(SubExprResult, RemainingExpr) = Ctx;
+
+ assert(RemainingExpr.startswith("[") && "Not a slice expr.");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ EvalResult HighBitExpr;
+ std::tie(HighBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+
+ if (HighBitExpr.hasError())
+ return std::make_pair(HighBitExpr, RemainingExpr);
+
+ if (!RemainingExpr.startswith(":"))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected ':'"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ EvalResult LowBitExpr;
+ std::tie(LowBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+
+ if (LowBitExpr.hasError())
+ return std::make_pair(LowBitExpr, RemainingExpr);
+
+ if (!RemainingExpr.startswith("]"))
+ return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
+ "expected ']'"),
+ "");
+ RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+ unsigned HighBit = HighBitExpr.getValue();
+ unsigned LowBit = LowBitExpr.getValue();
+ uint64_t Mask = ((uint64_t)1 << (HighBit - LowBit + 1)) - 1;
+ uint64_t SlicedValue = (SubExprResult.getValue() >> LowBit) & Mask;
+ return std::make_pair(EvalResult(SlicedValue), RemainingExpr);
+ }
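+
+ // Worked example: for "0xABCD[11:4]", HighBit = 11 and LowBit = 4, so
+ // Mask = (1 << 8) - 1 = 0xFF and the result is
+ // (0xABCD >> 4) & 0xFF = 0xBC.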
+
+ // Evaluate a "complex" expression.
+ // Takes an already evaluated subexpression and checks for the presence of a
+ // binary operator, computing the result of the binary operation if one is
+ // found. Used to make arithmetic expressions left-associative.
+ // Returns a pair containing the ultimate result of evaluating the
+ // expression, plus the expression remaining to be evaluated.
+ std::pair<EvalResult, StringRef> evalComplexExpr(
+ std::pair<EvalResult, StringRef> Ctx) const {
+ EvalResult LHSResult;
+ StringRef RemainingExpr;
+ std::tie(LHSResult, RemainingExpr) = Ctx;
+
+ // If there was an error, or there's nothing left to evaluate, return the
+ // result.
+ if (LHSResult.hasError() || RemainingExpr.empty())
+ return std::make_pair(LHSResult, RemainingExpr);
+
+ // Otherwise check if this is a binary expression.
+ BinOpToken BinOp;
+ std::tie(BinOp, RemainingExpr) = parseBinOpToken(RemainingExpr);
+
+ // If this isn't a recognized binary operator, just return.
+ if (BinOp == BinOpToken::Invalid)
+ return std::make_pair(LHSResult, RemainingExpr);
+
+ // This is a recognized bin-op. Evaluate the RHS, then evaluate the binop.
+ EvalResult RHSResult;
+ std::tie(RHSResult, RemainingExpr) = evalSimpleExpr(RemainingExpr);
+
+ // If there was an error evaluating the RHS, return it.
+ if (RHSResult.hasError())
+ return std::make_pair(RHSResult, RemainingExpr);
+
+ // This is a binary expression - evaluate and try to continue as a
+ // complex expr.
+ EvalResult ThisResult(computeBinOpResult(BinOp, LHSResult, RHSResult));
+
+ return evalComplexExpr(std::make_pair(ThisResult, RemainingExpr));
+ }
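+
+ // Since each recursive step folds the operator into the left-hand side,
+ // "1 - 2 - 3" evaluates as "(1 - 2) - 3", not "1 - (2 - 3)".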
+
+ bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size) const {
+ MCDisassembler *Dis = Checker.Disassembler;
+ StringRef SectionMem = Checker.getSubsectionStartingAt(Symbol);
+ StringRefMemoryObject SectionBytes(SectionMem, 0);
+
+ MCDisassembler::DecodeStatus S =
+ Dis->getInstruction(Inst, Size, SectionBytes, 0, nulls(), nulls());
+
+ return (S == MCDisassembler::Success);
+ }
+
+ };
+
+}
+
+bool RuntimeDyldChecker::check(StringRef CheckExpr) const {
+ CheckExpr = CheckExpr.trim();
+ DEBUG(llvm::dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr
+ << "'...\n");
+ RuntimeDyldCheckerExprEval P(*this, ErrStream);
+ bool Result = P.evaluate(CheckExpr);
+ (void)Result;
+ DEBUG(llvm::dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' "
+ << (Result ? "passed" : "FAILED") << ".\n");
+ return Result;
+}
+
+bool RuntimeDyldChecker::checkAllRulesInBuffer(StringRef RulePrefix,
+ MemoryBuffer* MemBuf) const {
+ bool DidAllTestsPass = true;
+ unsigned NumRules = 0;
+
+ const char *LineStart = MemBuf->getBufferStart();
+
+ // Eat whitespace.
+ while (LineStart != MemBuf->getBufferEnd() &&
+ std::isspace(*LineStart))
+ ++LineStart;
+
+ while (LineStart != MemBuf->getBufferEnd() && *LineStart != '\0') {
+ const char *LineEnd = LineStart;
+ while (LineEnd != MemBuf->getBufferEnd() &&
+ *LineEnd != '\r' && *LineEnd != '\n')
+ ++LineEnd;
+
+ StringRef Line(LineStart, LineEnd - LineStart);
+ if (Line.startswith(RulePrefix)) {
+ DidAllTestsPass &= check(Line.substr(RulePrefix.size()));
+ ++NumRules;
+ }
+
+ // Eat whitespace.
+ LineStart = LineEnd;
+ while (LineStart != MemBuf->getBufferEnd() &&
+ std::isspace(*LineStart))
+ ++LineStart;
+ }
+ return DidAllTestsPass && (NumRules != 0);
+}
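+
+// For example, with a hypothetical RulePrefix "# check:", a buffer line
+//   # check: *{8}foo = 0x1234
+// has its suffix "*{8}foo = 0x1234" handed to check() above.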
+
+bool RuntimeDyldChecker::checkSymbolIsValidForLoad(StringRef Symbol) const {
+ return RTDyld.getSymbolAddress(Symbol) != nullptr;
+}
+
+uint64_t RuntimeDyldChecker::getSymbolAddress(StringRef Symbol) const {
+ return RTDyld.getAnySymbolRemoteAddress(Symbol);
+}
+
+uint64_t RuntimeDyldChecker::readMemoryAtSymbol(StringRef Symbol,
+ int64_t Offset,
+ unsigned Size) const {
+ uint8_t *Src = RTDyld.getSymbolAddress(Symbol);
+ uint64_t Result = 0;
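+ // Note: copying into the low-order bytes of Result assumes a
+ // little-endian host when Size < sizeof(Result).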
+ memcpy(&Result, Src + Offset, Size);
+ return Result;
+}
+
+StringRef RuntimeDyldChecker::getSubsectionStartingAt(StringRef Name) const {
+ RuntimeDyldImpl::SymbolTableMap::const_iterator pos =
+ RTDyld.GlobalSymbolTable.find(Name);
+ if (pos == RTDyld.GlobalSymbolTable.end())
+ return StringRef();
+ RuntimeDyldImpl::SymbolLoc Loc = pos->second;
+ uint8_t *SectionAddr = RTDyld.getSectionAddress(Loc.first);
+ return StringRef(reinterpret_cast<const char*>(SectionAddr) + Loc.second,
+ RTDyld.Sections[Loc.first].Size - Loc.second);
+}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 6ba24b9..80e489c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -32,7 +32,7 @@ using namespace llvm::object;
namespace {
-static inline error_code check(error_code Err) {
+static inline std::error_code check(std::error_code Err) {
if (Err) {
report_fatal_error(Err.message());
}
@@ -55,9 +55,9 @@ template <class ELFT> class DyldELFObject : public ELFObjectFile<ELFT> {
public:
DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
- MemoryBuffer *Wrapper, error_code &ec);
+ std::unique_ptr<MemoryBuffer> Wrapper, std::error_code &ec);
- DyldELFObject(MemoryBuffer *Wrapper, error_code &ec);
+ DyldELFObject(std::unique_ptr<MemoryBuffer> Wrapper, std::error_code &ec);
void updateSectionAddress(const SectionRef &Sec, uint64_t Addr);
void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr);
@@ -109,15 +109,17 @@ public:
// actual memory. Ultimately, the Binary parent class will take ownership of
// this MemoryBuffer object but not the underlying memory.
template <class ELFT>
-DyldELFObject<ELFT>::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec)
- : ELFObjectFile<ELFT>(Wrapper, ec) {
+DyldELFObject<ELFT>::DyldELFObject(std::unique_ptr<MemoryBuffer> Wrapper,
+ std::error_code &EC)
+ : ELFObjectFile<ELFT>(std::move(Wrapper), EC) {
this->isDyldELFObject = true;
}
template <class ELFT>
DyldELFObject<ELFT>::DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
- MemoryBuffer *Wrapper, error_code &ec)
- : ELFObjectFile<ELFT>(Wrapper, ec),
+ std::unique_ptr<MemoryBuffer> Wrapper,
+ std::error_code &EC)
+ : ELFObjectFile<ELFT>(std::move(Wrapper), EC),
UnderlyingFile(std::move(UnderlyingFile)) {
this->isDyldELFObject = true;
}
@@ -182,30 +184,30 @@ RuntimeDyldELF::createObjectImageFromFile(std::unique_ptr<object::ObjectFile> Ob
if (!ObjFile)
return nullptr;
- error_code ec;
- MemoryBuffer *Buffer =
- MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false);
+ std::error_code ec;
+ std::unique_ptr<MemoryBuffer> Buffer(
+ MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false));
if (ObjFile->getBytesInAddress() == 4 && ObjFile->isLittleEndian()) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::little, 2, false>>>(
- std::move(ObjFile), Buffer, ec);
+ std::move(ObjFile), std::move(Buffer), ec);
return new ELFObjectImage<ELFType<support::little, 2, false>>(
nullptr, std::move(Obj));
} else if (ObjFile->getBytesInAddress() == 4 && !ObjFile->isLittleEndian()) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::big, 2, false>>>(
- std::move(ObjFile), Buffer, ec);
+ std::move(ObjFile), std::move(Buffer), ec);
return new ELFObjectImage<ELFType<support::big, 2, false>>(nullptr, std::move(Obj));
} else if (ObjFile->getBytesInAddress() == 8 && !ObjFile->isLittleEndian()) {
auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 2, true>>>(
- std::move(ObjFile), Buffer, ec);
+ std::move(ObjFile), std::move(Buffer), ec);
return new ELFObjectImage<ELFType<support::big, 2, true>>(nullptr,
std::move(Obj));
} else if (ObjFile->getBytesInAddress() == 8 && ObjFile->isLittleEndian()) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::little, 2, true>>>(
- std::move(ObjFile), Buffer, ec);
+ std::move(ObjFile), std::move(Buffer), ec);
return new ELFObjectImage<ELFType<support::little, 2, true>>(
nullptr, std::move(Obj));
} else
@@ -218,31 +220,33 @@ ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
std::pair<unsigned char, unsigned char> Ident =
std::make_pair((uint8_t)Buffer->getBufferStart()[ELF::EI_CLASS],
(uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]);
- error_code ec;
+ std::error_code ec;
+
+ std::unique_ptr<MemoryBuffer> Buf(Buffer->getMemBuffer());
if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::little, 4, false>>>(
- Buffer->getMemBuffer(), ec);
+ std::move(Buf), ec);
return new ELFObjectImage<ELFType<support::little, 4, false>>(
Buffer, std::move(Obj));
} else if (Ident.first == ELF::ELFCLASS32 &&
Ident.second == ELF::ELFDATA2MSB) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::big, 4, false>>>(
- Buffer->getMemBuffer(), ec);
+ std::move(Buf), ec);
return new ELFObjectImage<ELFType<support::big, 4, false>>(Buffer,
std::move(Obj));
} else if (Ident.first == ELF::ELFCLASS64 &&
Ident.second == ELF::ELFDATA2MSB) {
auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 8, true>>>(
- Buffer->getMemBuffer(), ec);
+ std::move(Buf), ec);
return new ELFObjectImage<ELFType<support::big, 8, true>>(Buffer, std::move(Obj));
} else if (Ident.first == ELF::ELFCLASS64 &&
Ident.second == ELF::ELFDATA2LSB) {
auto Obj =
llvm::make_unique<DyldELFObject<ELFType<support::little, 8, true>>>(
- Buffer->getMemBuffer(), ec);
+ std::move(Buf), ec);
return new ELFObjectImage<ELFType<support::little, 8, true>>(Buffer, std::move(Obj));
} else
llvm_unreachable("Unexpected ELF format");
@@ -612,30 +616,38 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
}
}
-// Return the .TOC. section address to R_PPC64_TOC relocations.
-uint64_t RuntimeDyldELF::findPPC64TOC() const {
+// Return the .TOC. section and offset.
+void RuntimeDyldELF::findPPC64TOCSection(ObjectImage &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel) {
+ // Set a default SectionID in case we do not find a TOC section below.
+ // This may happen for references to the TOC base (sym@toc, .odp
+ // relocation) without a .toc directive. In this case just use the
+ // first section (which is usually the .odp) since the code won't
+ // reference the .toc base directly.
+ Rel.SymbolName = nullptr;
+ Rel.SectionID = 0;
+
// The TOC consists of sections .got, .toc, .tocbss, .plt in that
// order. The TOC starts where the first of these sections starts.
- SectionList::const_iterator it = Sections.begin();
- SectionList::const_iterator ite = Sections.end();
- for (; it != ite; ++it) {
- if (it->Name == ".got" || it->Name == ".toc" || it->Name == ".tocbss" ||
- it->Name == ".plt")
+ for (section_iterator si = Obj.begin_sections(), se = Obj.end_sections();
+ si != se; ++si) {
+
+ StringRef SectionName;
+ check(si->getName(SectionName));
+
+ if (SectionName == ".got"
+ || SectionName == ".toc"
+ || SectionName == ".tocbss"
+ || SectionName == ".plt") {
+ Rel.SectionID = findOrEmitSection(Obj, *si, false, LocalSections);
break;
+ }
}
- if (it == ite) {
- // This may happen for
- // * references to TOC base base (sym@toc, .odp relocation) without
- // a .toc directive.
- // In this case just use the first section (which is usually
- // the .odp) since the code won't reference the .toc base
- // directly.
- it = Sections.begin();
- }
- assert(it != ite);
+
// Per the ppc64-elf-linux ABI, The TOC base is TOC value plus 0x8000
// thus permitting a full 64 Kbytes segment.
- return it->LoadAddress + 0x8000;
+ Rel.Addend = 0x8000;
}
// Returns the sections and offset associated with the ODP entry referenced
@@ -702,24 +714,37 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
llvm_unreachable("Attempting to get address of ODP entry!");
}
-// Relocation masks following the #lo(value), #hi(value), #higher(value),
-// and #highest(value) macros defined in section 4.5.1. Relocation Types
-// in PPC-elf64abi document.
-//
+// Relocation masks following the #lo(value), #hi(value), #ha(value),
+// #higher(value), #highera(value), #highest(value), and #highesta(value)
+// macros defined in section 4.5.1. Relocation Types of the PPC-elf64abi
+// document.
+
static inline uint16_t applyPPClo(uint64_t value) { return value & 0xffff; }
static inline uint16_t applyPPChi(uint64_t value) {
return (value >> 16) & 0xffff;
}
+static inline uint16_t applyPPCha(uint64_t value) {
+ return ((value + 0x8000) >> 16) & 0xffff;
+}
+
static inline uint16_t applyPPChigher(uint64_t value) {
return (value >> 32) & 0xffff;
}
+static inline uint16_t applyPPChighera(uint64_t value) {
+ return ((value + 0x8000) >> 32) & 0xffff;
+}
+
static inline uint16_t applyPPChighest(uint64_t value) {
return (value >> 48) & 0xffff;
}
+static inline uint16_t applyPPChighesta(uint64_t value) {
+ return ((value + 0x8000) >> 48) & 0xffff;
+}
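+
+// Worked example for the 'a' ("adjusted") variants: with value = 0x12348000,
+// applyPPClo gives 0x8000 and applyPPCha gives 0x1235. The low halfword is
+// sign-extended when the halves are recombined (0x8000 -> -0x8000), and
+// 0x12350000 - 0x8000 recovers the original 0x12348000.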
+
void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
uint64_t Offset, uint64_t Value,
uint32_t Type, int64_t Addend) {
@@ -728,24 +753,57 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
default:
llvm_unreachable("Relocation type not implemented yet!");
break;
+ case ELF::R_PPC64_ADDR16:
+ writeInt16BE(LocalAddress, applyPPClo(Value + Addend));
+ break;
+ case ELF::R_PPC64_ADDR16_DS:
+ writeInt16BE(LocalAddress, applyPPClo(Value + Addend) & ~3);
+ break;
case ELF::R_PPC64_ADDR16_LO:
writeInt16BE(LocalAddress, applyPPClo(Value + Addend));
break;
+ case ELF::R_PPC64_ADDR16_LO_DS:
+ writeInt16BE(LocalAddress, applyPPClo(Value + Addend) & ~3);
+ break;
case ELF::R_PPC64_ADDR16_HI:
writeInt16BE(LocalAddress, applyPPChi(Value + Addend));
break;
+ case ELF::R_PPC64_ADDR16_HA:
+ writeInt16BE(LocalAddress, applyPPCha(Value + Addend));
+ break;
case ELF::R_PPC64_ADDR16_HIGHER:
writeInt16BE(LocalAddress, applyPPChigher(Value + Addend));
break;
+ case ELF::R_PPC64_ADDR16_HIGHERA:
+ writeInt16BE(LocalAddress, applyPPChighera(Value + Addend));
+ break;
case ELF::R_PPC64_ADDR16_HIGHEST:
writeInt16BE(LocalAddress, applyPPChighest(Value + Addend));
break;
+ case ELF::R_PPC64_ADDR16_HIGHESTA:
+ writeInt16BE(LocalAddress, applyPPChighesta(Value + Addend));
+ break;
case ELF::R_PPC64_ADDR14: {
assert(((Value + Addend) & 3) == 0);
// Preserve the AA/LK bits in the branch instruction
uint8_t aalk = *(LocalAddress + 3);
writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc));
} break;
+ case ELF::R_PPC64_REL16_LO: {
+ uint64_t FinalAddress = (Section.LoadAddress + Offset);
+ uint64_t Delta = Value - FinalAddress + Addend;
+ writeInt16BE(LocalAddress, applyPPClo(Delta));
+ } break;
+ case ELF::R_PPC64_REL16_HI: {
+ uint64_t FinalAddress = (Section.LoadAddress + Offset);
+ uint64_t Delta = Value - FinalAddress + Addend;
+ writeInt16BE(LocalAddress, applyPPChi(Delta));
+ } break;
+ case ELF::R_PPC64_REL16_HA: {
+ uint64_t FinalAddress = (Section.LoadAddress + Offset);
+ uint64_t Delta = Value - FinalAddress + Addend;
+ writeInt16BE(LocalAddress, applyPPCha(Delta));
+ } break;
case ELF::R_PPC64_ADDR32: {
int32_t Result = static_cast<int32_t>(Value + Addend);
if (SignExtend32<32>(Result) != Result)
@@ -775,19 +833,6 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
case ELF::R_PPC64_ADDR64:
writeInt64BE(LocalAddress, Value + Addend);
break;
- case ELF::R_PPC64_TOC:
- writeInt64BE(LocalAddress, findPPC64TOC());
- break;
- case ELF::R_PPC64_TOC16: {
- uint64_t TOCStart = findPPC64TOC();
- Value = applyPPClo((Value + Addend) - TOCStart);
- writeInt16BE(LocalAddress, applyPPClo(Value));
- } break;
- case ELF::R_PPC64_TOC16_DS: {
- uint64_t TOCStart = findPPC64TOC();
- Value = ((Value + Addend) - TOCStart);
- writeInt16BE(LocalAddress, applyPPClo(Value));
- } break;
}
}
@@ -1139,14 +1184,20 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
ELF::R_PPC64_ADDR64, Value.Addend);
// Generates the 64-bits address loads as exemplified in section
- // 4.5.1 in PPC64 ELF ABI.
- RelocationEntry REhst(SectionID, StubTargetAddr - Section.Address + 2,
+ // 4.5.1 in PPC64 ELF ABI. Note that the relocations need to
+ // apply to the low part of the instructions, so we have to update
+ // the offset according to the target endianness.
+ uint64_t StubRelocOffset = StubTargetAddr - Section.Address;
+ if (!IsTargetLittleEndian)
+ StubRelocOffset += 2;
+
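+ // For instance, a big-endian 4-byte instruction holds its 16-bit
+ // immediate in bytes 2-3 of the word, while a little-endian one holds
+ // it in bytes 0-1; hence the +2 adjustment above.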
+ RelocationEntry REhst(SectionID, StubRelocOffset + 0,
ELF::R_PPC64_ADDR16_HIGHEST, Value.Addend);
- RelocationEntry REhr(SectionID, StubTargetAddr - Section.Address + 6,
+ RelocationEntry REhr(SectionID, StubRelocOffset + 4,
ELF::R_PPC64_ADDR16_HIGHER, Value.Addend);
- RelocationEntry REh(SectionID, StubTargetAddr - Section.Address + 14,
+ RelocationEntry REh(SectionID, StubRelocOffset + 12,
ELF::R_PPC64_ADDR16_HI, Value.Addend);
- RelocationEntry REl(SectionID, StubTargetAddr - Section.Address + 18,
+ RelocationEntry REl(SectionID, StubRelocOffset + 16,
ELF::R_PPC64_ADDR16_LO, Value.Addend);
if (Value.SymbolName) {
@@ -1170,12 +1221,52 @@ relocation_iterator RuntimeDyldELF::processRelocationRef(
// Restore the TOC for external calls
writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1)
}
+ } else if (RelType == ELF::R_PPC64_TOC16 ||
+ RelType == ELF::R_PPC64_TOC16_DS ||
+ RelType == ELF::R_PPC64_TOC16_LO ||
+ RelType == ELF::R_PPC64_TOC16_LO_DS ||
+ RelType == ELF::R_PPC64_TOC16_HI ||
+ RelType == ELF::R_PPC64_TOC16_HA) {
+ // These relocations are supposed to subtract the TOC address from
+ // the final value. This does not fit cleanly into the RuntimeDyld
+ // scheme, since there may be *two* sections involved in determining
+ // the relocation value (the section of the symbol referred to by the
+ // relocation, and the TOC section associated with the current module).
+ //
+ // Fortunately, these relocations are currently only ever generated
+ // referring to symbols that themselves reside in the TOC, which means
+ // that the two sections are actually the same. Thus they cancel out
+ // and we can immediately resolve the relocation right now.
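+ //
+ // Concretely, the relocated value is S + A - .TOC., and .TOC. is the
+ // TOC section's start plus 0x8000; when S lies in that same section,
+ // the section bases cancel and the value is known immediately.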
+ switch (RelType) {
+ case ELF::R_PPC64_TOC16: RelType = ELF::R_PPC64_ADDR16; break;
+ case ELF::R_PPC64_TOC16_DS: RelType = ELF::R_PPC64_ADDR16_DS; break;
+ case ELF::R_PPC64_TOC16_LO: RelType = ELF::R_PPC64_ADDR16_LO; break;
+ case ELF::R_PPC64_TOC16_LO_DS: RelType = ELF::R_PPC64_ADDR16_LO_DS; break;
+ case ELF::R_PPC64_TOC16_HI: RelType = ELF::R_PPC64_ADDR16_HI; break;
+ case ELF::R_PPC64_TOC16_HA: RelType = ELF::R_PPC64_ADDR16_HA; break;
+ default: llvm_unreachable("Wrong relocation type.");
+ }
+
+ RelocationValueRef TOCValue;
+ findPPC64TOCSection(Obj, ObjSectionToID, TOCValue);
+ if (Value.SymbolName || Value.SectionID != TOCValue.SectionID)
+ llvm_unreachable("Unsupported TOC relocation.");
+ Value.Addend -= TOCValue.Addend;
+ resolveRelocation(Sections[SectionID], Offset, Value.Addend, RelType, 0);
} else {
+ // There are two ways to refer to the TOC address directly: either
+ // via a ELF::R_PPC64_TOC relocation (where both symbol and addend are
+ // ignored), or via any relocation that refers to the magic ".TOC."
+ // symbols (in which case the addend is respected).
+ if (RelType == ELF::R_PPC64_TOC) {
+ RelType = ELF::R_PPC64_ADDR64;
+ findPPC64TOCSection(Obj, ObjSectionToID, Value);
+ } else if (TargetName == ".TOC.") {
+ findPPC64TOCSection(Obj, ObjSectionToID, Value);
+ Value.Addend += Addend;
+ }
+
RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
- // Extra check to avoid relocation againt empty symbols (usually
- // the R_PPC64_TOC).
- if (SymType != SymbolRef::ST_Unknown && TargetName.empty())
- Value.SymbolName = nullptr;
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index a526073..59fdfbe 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -20,10 +20,9 @@
using namespace llvm;
namespace llvm {
-
namespace {
// Helper for extensive error checking in debug builds.
-error_code Check(error_code Err) {
+std::error_code Check(std::error_code Err) {
if (Err) {
report_fatal_error(Err.message());
}
@@ -83,7 +82,8 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
return 1;
}
- uint64_t findPPC64TOC() const;
+ void findPPC64TOCSection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel);
void findOPDEntrySection(ObjectImage &Obj, ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 412cf20..0336cba 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -28,8 +29,8 @@
#include "llvm/Support/Mutex.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <map>
+#include <system_error>
using namespace llvm;
using namespace llvm::object;
@@ -158,6 +159,15 @@ public:
};
class RuntimeDyldImpl {
+ friend class RuntimeDyldChecker;
+private:
+
+ uint64_t getAnySymbolRemoteAddress(StringRef Symbol) {
+ if (uint64_t InternalSymbolAddr = getSymbolLoadAddress(Symbol))
+ return InternalSymbolAddr;
+ return MemMgr->getSymbolAddress(Symbol);
+ }
+
protected:
// The MemoryManager to load objects into.
RTDyldMemoryManager *MemMgr;
@@ -245,14 +255,14 @@ protected:
void writeInt16BE(uint8_t *Addr, uint16_t Value) {
if (IsTargetLittleEndian)
- Value = sys::SwapByteOrder(Value);
+ sys::swapByteOrder(Value);
*Addr = (Value >> 8) & 0xFF;
*(Addr + 1) = Value & 0xFF;
}
void writeInt32BE(uint8_t *Addr, uint32_t Value) {
if (IsTargetLittleEndian)
- Value = sys::SwapByteOrder(Value);
+ sys::swapByteOrder(Value);
*Addr = (Value >> 24) & 0xFF;
*(Addr + 1) = (Value >> 16) & 0xFF;
*(Addr + 2) = (Value >> 8) & 0xFF;
@@ -261,7 +271,7 @@ protected:
void writeInt64BE(uint8_t *Addr, uint64_t Value) {
if (IsTargetLittleEndian)
- Value = sys::SwapByteOrder(Value);
+ sys::swapByteOrder(Value);
*Addr = (Value >> 56) & 0xFF;
*(Addr + 1) = (Value >> 48) & 0xFF;
*(Addr + 2) = (Value >> 40) & 0xFF;
@@ -339,7 +349,8 @@ protected:
public:
RuntimeDyldImpl(RTDyldMemoryManager *mm)
- : MemMgr(mm), ProcessAllSections(false), HasError(false) {}
+ : MemMgr(mm), ProcessAllSections(false), HasError(false) {
+ }
virtual ~RuntimeDyldImpl();
@@ -349,7 +360,7 @@ public:
ObjectImage *loadObject(ObjectImage *InputObject);
- void *getSymbolAddress(StringRef Name) {
+ uint8_t *getSymbolAddress(StringRef Name) {
// FIXME: Just look up as a function for now. Overly simple of course.
// Work in progress.
SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 2b425fb..4eb516c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -14,6 +14,8 @@
#include "RuntimeDyldMachO.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "ObjectImageCommon.h"
+#include "JITRegistrar.h"
using namespace llvm;
using namespace llvm::object;
@@ -21,6 +23,126 @@ using namespace llvm::object;
namespace llvm {
+class MachOObjectImage : public ObjectImageCommon {
+private:
+ typedef SmallVector<uint64_t, 1> SectionAddrList;
+ SectionAddrList OldSectionAddrList;
+
+protected:
+ bool is64;
+ bool Registered;
+
+private:
+ void initOldAddress() {
+ MachOObjectFile *objf = static_cast<MachOObjectFile *>(ObjFile.get());
+ // Unfortunately we need to do this, since there's information encoded
+ // in the original addr of the section that we could not otherwise
+ // recover. The reason for this is that symbols do not actually store
+ // their file offset, but only their vmaddr. This means that in order
+ // to locate the symbol correctly in the object file, we need to know
+ // where the original start of the section was (including any padding,
+ // etc).
+ for (section_iterator i = objf->section_begin(), e = objf->section_end();
+ i != e; ++i) {
+ uint64_t Addr;
+ i->getAddress(Addr);
+ OldSectionAddrList[i->getRawDataRefImpl().d.a] = Addr;
+ }
+ }
+
+public:
+ MachOObjectImage(ObjectBuffer *Input, bool is64)
+ : ObjectImageCommon(Input),
+ OldSectionAddrList(ObjFile->section_end()->getRawDataRefImpl().d.a, 0),
+ is64(is64), Registered(false) {
+ initOldAddress();
+ }
+
+ MachOObjectImage(std::unique_ptr<object::ObjectFile> Input, bool is64)
+ : ObjectImageCommon(std::move(Input)),
+ OldSectionAddrList(ObjFile->section_end()->getRawDataRefImpl().d.a, 0),
+ is64(is64), Registered(false) {
+ initOldAddress();
+ }
+
+ virtual ~MachOObjectImage() {
+ if (Registered)
+ deregisterWithDebugger();
+ }
+
+ // Subclasses can override these methods to update the image with loaded
+ // addresses for sections and common symbols
+ virtual void updateSectionAddress(const SectionRef &Sec, uint64_t Addr) {
+ MachOObjectFile *objf = static_cast<MachOObjectFile *>(ObjFile.get());
+ char *data =
+ const_cast<char *>(objf->getSectionPointer(Sec.getRawDataRefImpl()));
+
+ uint64_t oldAddr = OldSectionAddrList[Sec.getRawDataRefImpl().d.a];
+
+ if (is64) {
+ ((MachO::section_64 *)data)->addr = Addr;
+ } else {
+ ((MachO::section *)data)->addr = Addr;
+ }
+
+ for (symbol_iterator i = objf->symbol_begin(), e = objf->symbol_end();
+ i != e; ++i) {
+ section_iterator symSec(objf->section_end());
+ (*i).getSection(symSec);
+ if (*symSec == Sec) {
+ uint64_t symAddr;
+ (*i).getAddress(symAddr);
+ updateSymbolAddress(*i, symAddr + Addr - oldAddr);
+ }
+ }
+ }
+
+ uint64_t getOldSectionAddr(const SectionRef &Sec) const {
+ return OldSectionAddrList[Sec.getRawDataRefImpl().d.a];
+ }
+
+ virtual void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr) {
+ char *data = const_cast<char *>(
+ reinterpret_cast<const char *>(Sym.getRawDataRefImpl().p));
+ if (is64)
+ ((MachO::nlist_64 *)data)->n_value = Addr;
+ else
+ ((MachO::nlist *)data)->n_value = Addr;
+ }
+
+ virtual void registerWithDebugger() {
+ JITRegistrar::getGDBRegistrar().registerObject(*Buffer);
+ Registered = true;
+ }
+
+ virtual void deregisterWithDebugger() {
+ JITRegistrar::getGDBRegistrar().deregisterObject(*Buffer);
+ }
+};
+
+ObjectImage *RuntimeDyldMachO::createObjectImage(ObjectBuffer *Buffer) {
+ uint32_t magic = *((const uint32_t *)Buffer->getBufferStart());
+ bool is64 = (magic == MachO::MH_MAGIC_64);
+ assert((magic == MachO::MH_MAGIC_64 || magic == MachO::MH_MAGIC) &&
+ "Unrecognized Macho Magic");
+ return new MachOObjectImage(Buffer, is64);
+}
+
+ObjectImage *RuntimeDyldMachO::createObjectImageFromFile(
+ std::unique_ptr<object::ObjectFile> ObjFile) {
+ if (!ObjFile)
+ return nullptr;
+
+ MemoryBuffer *Buffer =
+ MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false);
+
+ uint32_t magic = *((const uint32_t *)Buffer->getBufferStart());
+ bool is64 = (magic == MachO::MH_MAGIC_64);
+ assert((magic == MachO::MH_MAGIC_64 || magic == MachO::MH_MAGIC) &&
+ "Unrecognized Macho Magic");
+ return new MachOObjectImage(std::move(ObjFile), is64);
+}
+
static unsigned char *processFDE(unsigned char *P, intptr_t DeltaForText,
intptr_t DeltaForEH) {
DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText
@@ -533,6 +655,7 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef(
ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols,
StubMap &Stubs) {
const ObjectFile *OF = Obj.getObjectFile();
+ const MachOObjectImage &MachOObj = *static_cast<MachOObjectImage *>(&Obj);
const MachOObjectFile *MachO = static_cast<const MachOObjectFile *>(OF);
MachO::any_relocation_info RE =
MachO->getRelocation(RelI->getRawDataRefImpl());
@@ -609,8 +732,8 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef(
bool IsCode = false;
Sec.isText(IsCode);
Value.SectionID = findOrEmitSection(Obj, Sec, IsCode, ObjSectionToID);
- uint64_t Addr;
- Sec.getAddress(Addr);
+ uint64_t Addr = MachOObj.getOldSectionAddr(Sec);
+ DEBUG(dbgs() << "\nAddr: " << Addr << "\nAddend: " << Addend);
Value.Addend = Addend - Addr;
if (IsPCRel)
Value.Addend += Offset + NumBytes;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 060eb8c..35f0720 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -105,14 +105,9 @@ public:
void finalizeLoad(ObjectImage &ObjImg,
ObjSectionToIDMap &SectionMap) override;
- static ObjectImage *createObjectImage(ObjectBuffer *InputBuffer) {
- return new ObjectImageCommon(InputBuffer);
- }
-
+ static ObjectImage *createObjectImage(ObjectBuffer *Buffer);
static ObjectImage *
- createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject) {
- return new ObjectImageCommon(std::move(InputObject));
- }
+ createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject);
};
} // end namespace llvm
diff --git a/lib/IR/Android.mk b/lib/IR/Android.mk
index 2ffc86c..c51b241 100644
--- a/lib/IR/Android.mk
+++ b/lib/IR/Android.mk
@@ -5,6 +5,7 @@ vmcore_SRC_FILES := \
Attributes.cpp \
AutoUpgrade.cpp \
BasicBlock.cpp \
+ Comdat.cpp \
ConstantFold.cpp \
ConstantRange.cpp \
Constants.cpp \
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 0fef0d0..a7499bc 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -106,6 +106,7 @@ static void PrintEscapedString(StringRef Name, raw_ostream &Out) {
enum PrefixType {
GlobalPrefix,
+ ComdatPrefix,
LabelPrefix,
LocalPrefix,
NoPrefix
@@ -119,6 +120,7 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
switch (Prefix) {
case NoPrefix: break;
case GlobalPrefix: OS << '@'; break;
+ case ComdatPrefix: OS << '$'; break;
case LabelPrefix: break;
case LocalPrefix: OS << '%'; break;
}
@@ -1165,8 +1167,15 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
}
void AssemblyWriter::init() {
- if (TheModule)
- TypePrinter.incorporateTypes(*TheModule);
+ if (!TheModule)
+ return;
+ TypePrinter.incorporateTypes(*TheModule);
+ for (const Function &F : *TheModule)
+ if (const Comdat *C = F.getComdat())
+ Comdats.insert(C);
+ for (const GlobalVariable &GV : TheModule->globals())
+ if (const Comdat *C = GV.getComdat())
+ Comdats.insert(C);
}
@@ -1308,6 +1317,15 @@ void AssemblyWriter::printModule(const Module *M) {
printTypeIdentities();
+ // Output all comdats.
+ if (!Comdats.empty())
+ Out << '\n';
+ for (const Comdat *C : Comdats) {
+ printComdat(C);
+ if (C != Comdats.back())
+ Out << '\n';
+ }
+
// Output all globals.
if (!M->global_empty()) Out << '\n';
for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
@@ -1451,10 +1469,11 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
PrintVisibility(GV->getVisibility(), Out);
PrintDLLStorageClass(GV->getDLLStorageClass(), Out);
PrintThreadLocalModel(GV->getThreadLocalMode(), Out);
+ if (GV->hasUnnamedAddr())
+ Out << "unnamed_addr ";
if (unsigned AddressSpace = GV->getType()->getAddressSpace())
Out << "addrspace(" << AddressSpace << ") ";
- if (GV->hasUnnamedAddr()) Out << "unnamed_addr ";
if (GV->isExternallyInitialized()) Out << "externally_initialized ";
Out << (GV->isConstant() ? "constant " : "global ");
TypePrinter.print(GV->getType()->getElementType(), Out);
@@ -1469,6 +1488,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
PrintEscapedString(GV->getSection(), Out);
Out << '"';
}
+ if (GV->hasComdat()) {
+ Out << ", comdat ";
+ PrintLLVMName(Out, GV->getComdat()->getName(), ComdatPrefix);
+ }
if (GV->getAlignment())
Out << ", align " << GV->getAlignment();
@@ -1488,21 +1511,18 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
}
PrintVisibility(GA->getVisibility(), Out);
PrintDLLStorageClass(GA->getDLLStorageClass(), Out);
+ PrintThreadLocalModel(GA->getThreadLocalMode(), Out);
+ if (GA->hasUnnamedAddr())
+ Out << "unnamed_addr ";
Out << "alias ";
PrintLinkage(GA->getLinkage(), Out);
- PointerType *Ty = GA->getType();
const Constant *Aliasee = GA->getAliasee();
- if (!Aliasee || Ty != Aliasee->getType()) {
- if (unsigned AddressSpace = Ty->getAddressSpace())
- Out << "addrspace(" << AddressSpace << ") ";
- TypePrinter.print(Ty->getElementType(), Out);
- Out << ", ";
- }
if (!Aliasee) {
+ TypePrinter.print(GA->getType(), Out);
Out << " <<NULL ALIASEE>>";
} else {
writeOperand(Aliasee, !isa<ConstantExpr>(Aliasee));
@@ -1512,6 +1532,10 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
Out << '\n';
}
+void AssemblyWriter::printComdat(const Comdat *C) {
+ C->print(Out);
+}
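+
+// Illustrative output, pairing with the comdat references printed on
+// globals and functions above:
+//   $foo = comdat any
+//   @g = global i32 0, comdat $foo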
+
void AssemblyWriter::printTypeIdentities() {
if (TypePrinter.NumberedTypes.empty() &&
TypePrinter.NamedTypes.empty())
@@ -1649,6 +1673,10 @@ void AssemblyWriter::printFunction(const Function *F) {
PrintEscapedString(F->getSection(), Out);
Out << '"';
}
+ if (F->hasComdat()) {
+ Out << " comdat ";
+ PrintLLVMName(Out, F->getComdat()->getName(), ComdatPrefix);
+ }
if (F->getAlignment())
Out << " align " << F->getAlignment();
if (F->hasGC())
@@ -1788,6 +1816,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
(isa<StoreInst>(I) && cast<StoreInst>(I).isAtomic()))
Out << " atomic";
+ if (isa<AtomicCmpXchgInst>(I) && cast<AtomicCmpXchgInst>(I).isWeak())
+ Out << " weak";
+
// If this is a volatile operation, print out the volatile marker.
if ((isa<LoadInst>(I) && cast<LoadInst>(I).isVolatile()) ||
(isa<StoreInst>(I) && cast<StoreInst>(I).isVolatile()) ||
@@ -2157,11 +2188,32 @@ void NamedMDNode::print(raw_ostream &ROS) const {
W.printNamedMDNode(this);
}
-void Type::print(raw_ostream &OS) const {
- if (!this) {
- OS << "<null Type>";
- return;
+void Comdat::print(raw_ostream &ROS) const {
+ PrintLLVMName(ROS, getName(), ComdatPrefix);
+ ROS << " = comdat ";
+
+ switch (getSelectionKind()) {
+ case Comdat::Any:
+ ROS << "any";
+ break;
+ case Comdat::ExactMatch:
+ ROS << "exactmatch";
+ break;
+ case Comdat::Largest:
+ ROS << "largest";
+ break;
+ case Comdat::NoDuplicates:
+ ROS << "noduplicates";
+ break;
+ case Comdat::SameSize:
+ ROS << "samesize";
+ break;
}
+
+ ROS << '\n';
+}
+
+void Type::print(raw_ostream &OS) const {
TypePrinting TP;
TP.print(const_cast<Type*>(this), OS);
@@ -2174,10 +2226,6 @@ void Type::print(raw_ostream &OS) const {
}
void Value::print(raw_ostream &ROS) const {
- if (!this) {
- ROS << "printing a <null> value\n";
- return;
- }
formatted_raw_ostream OS(ROS);
if (const Instruction *I = dyn_cast<Instruction>(this)) {
const Function *F = I->getParent() ? I->getParent()->getParent() : nullptr;
@@ -2248,5 +2296,8 @@ void Type::dump() const { print(dbgs()); }
// Module::dump() - Allow printing of Modules from the debugger.
void Module::dump() const { print(dbgs(), nullptr); }
+// Comdat::dump() - Allow printing of Comdats from the debugger.
+void Comdat::dump() const { print(dbgs()); }
+
// NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger.
void NamedMDNode::dump() const { print(dbgs()); }
diff --git a/lib/IR/AsmWriter.h b/lib/IR/AsmWriter.h
index b4ce6de..aef9c8a 100644
--- a/lib/IR/AsmWriter.h
+++ b/lib/IR/AsmWriter.h
@@ -16,6 +16,7 @@
#define LLVM_IR_ASSEMBLYWRITER_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/TypeFinder.h"
@@ -26,6 +27,7 @@ namespace llvm {
class BasicBlock;
class Function;
class GlobalValue;
+class Comdat;
class Module;
class NamedMDNode;
class Value;
@@ -70,6 +72,7 @@ private:
SlotTracker &Machine;
TypePrinting TypePrinter;
AssemblyAnnotationWriter *AnnotationWriter;
+ SetVector<const Comdat *> Comdats;
public:
/// Construct an AssemblyWriter with an external SlotTracker
@@ -101,6 +104,7 @@ public:
void printTypeIdentities();
void printGlobal(const GlobalVariable *GV);
void printAlias(const GlobalAlias *GV);
+ void printComdat(const Comdat *C);
void printFunction(const Function *F);
void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
void printBasicBlock(const BasicBlock *BB);
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index a9074bb..48a2ce8 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -173,6 +173,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "inlinehint";
if (hasAttribute(Attribute::InReg))
return "inreg";
+ if (hasAttribute(Attribute::JumpTable))
+ return "jumptable";
if (hasAttribute(Attribute::MinSize))
return "minsize";
if (hasAttribute(Attribute::Naked))
@@ -291,7 +293,7 @@ bool Attribute::operator<(Attribute A) const {
// AttributeImpl Definition
//===----------------------------------------------------------------------===//
-// Pin the vtabels to this file.
+// Pin the vtables to this file.
AttributeImpl::~AttributeImpl() {}
void EnumAttributeImpl::anchor() {}
void AlignAttributeImpl::anchor() {}
@@ -395,6 +397,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
case Attribute::OptimizeNone: return 1ULL << 42;
case Attribute::InAlloca: return 1ULL << 43;
case Attribute::NonNull: return 1ULL << 44;
+ case Attribute::JumpTable: return 1ULL << 45;
}
llvm_unreachable("Unsupported attribute type");
}
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index e255113..6554b3c 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -114,6 +114,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "x86.avx.movnt.pd.256" ||
Name == "x86.avx.movnt.ps.256" ||
Name == "x86.sse42.crc32.64.8" ||
+ Name == "x86.avx.vbroadcast.ss" ||
+ Name == "x86.avx.vbroadcast.ss.256" ||
+ Name == "x86.avx.vbroadcast.sd.256" ||
(Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
NewFn = nullptr;
return true;
@@ -335,6 +338,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
Rep = Builder.CreateCall2(CRC32, Trunc0, CI->getArgOperand(1));
Rep = Builder.CreateZExt(Rep, CI->getType(), "");
+ } else if (Name.startswith("llvm.x86.avx.vbroadcast")) {
+ // Replace broadcasts with a series of insertelements.
+ Type *VecTy = CI->getType();
+ Type *EltTy = VecTy->getVectorElementType();
+ unsigned EltNum = VecTy->getVectorNumElements();
+ Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0),
+ EltTy->getPointerTo());
+ Value *Load = Builder.CreateLoad(Cast);
+ Type *I32Ty = Type::getInt32Ty(C);
+ Rep = UndefValue::get(VecTy);
+ for (unsigned I = 0; I < EltNum; ++I)
+ Rep = Builder.CreateInsertElement(Rep, Load,
+ ConstantInt::get(I32Ty, I));
} else {
bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
if (Name == "llvm.x86.avx.vpermil.pd.256")
@@ -561,3 +577,10 @@ bool llvm::UpgradeDebugInfo(Module &M) {
}
return RetCode;
}
+
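+// For example, "llvm.vectorizer.width" would become
+// "llvm.loop.vectorize.width" (metadata name shown for illustration).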
+void llvm::UpgradeMDStringConstant(std::string &String) {
+ const std::string OldPrefix = "llvm.vectorizer.";
+ if (String.find(OldPrefix) == 0) {
+ String.replace(0, OldPrefix.size(), "llvm.loop.vectorize.");
+ }
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index b027ae5..38a80b1 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMCore
Attributes.cpp
AutoUpgrade.cpp
BasicBlock.cpp
+ Comdat.cpp
ConstantFold.cpp
ConstantRange.cpp
Constants.cpp
diff --git a/lib/IR/Comdat.cpp b/lib/IR/Comdat.cpp
new file mode 100644
index 0000000..80715ff
--- /dev/null
+++ b/lib/IR/Comdat.cpp
@@ -0,0 +1,25 @@
+//===-- Comdat.cpp - Implement the Comdat class --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Comdat class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Comdat.h"
+#include "llvm/ADT/StringMap.h"
+using namespace llvm;
+
+Comdat::Comdat(SelectionKind SK, StringMapEntry<Comdat> *Name)
+ : Name(Name), SK(SK) {}
+
+Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
+
+Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {}
+
+StringRef Comdat::getName() const { return Name->first(); }
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 706e66f..395ac39 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -529,7 +529,10 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
// Try hard to fold cast of cast because they are often eliminable.
if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy))
return ConstantExpr::getCast(newOpc, CE->getOperand(0), DestTy);
- } else if (CE->getOpcode() == Instruction::GetElementPtr) {
+ } else if (CE->getOpcode() == Instruction::GetElementPtr &&
+ // Do not fold addrspacecast (gep 0, .., 0). It might make the
+ // addrspacecast uncanonicalized.
+ opc != Instruction::AddrSpaceCast) {
// If all of the indexes in the GEP are null values, there is no pointer
// adjustment going on. We might as well cast the source pointer.
bool isAllNull = true;
@@ -1331,6 +1334,15 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
return FCmpInst::BAD_FCMP_PREDICATE;
}
+static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
+ const GlobalValue *GV2) {
+ // Don't try to decide equality of aliases.
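+ // Distinct non-weak globals get distinct non-null addresses, so they
+ // compare unequal; only if both are extern_weak may both be null and
+ // thus compare equal.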
+ if (!isa<GlobalAlias>(GV1) && !isa<GlobalAlias>(GV2))
+ if (!GV1->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage())
+ return ICmpInst::ICMP_NE;
+ return ICmpInst::BAD_ICMP_PREDICATE;
+}
+
/// evaluateICmpRelation - This function determines if there is anything we can
/// decide about the two constants provided. This doesn't need to handle simple
/// things like integer comparisons, but should instead handle ConstantExprs
@@ -1392,10 +1404,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
// constant (which, since the types must match, means that it's a
// ConstantPointerNull).
if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
- // Don't try to decide equality of aliases.
- if (!isa<GlobalAlias>(GV) && !isa<GlobalAlias>(GV2))
- if (!GV->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage())
- return ICmpInst::ICMP_NE;
+ return areGlobalsPotentiallyEqual(GV, GV2);
} else if (isa<BlockAddress>(V2)) {
return ICmpInst::ICMP_NE; // Globals never equal labels.
} else {
@@ -1460,7 +1469,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
}
break;
- case Instruction::GetElementPtr:
+ case Instruction::GetElementPtr: {
+ GEPOperator *CE1GEP = cast<GEPOperator>(CE1);
// Ok, since this is a getelementptr, we know that the constant has a
// pointer type. Check the various cases.
if (isa<ConstantPointerNull>(V2)) {
@@ -1507,7 +1517,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
"Surprising getelementptr!");
return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
} else {
- // If they are different globals, we don't know what the value is.
+ if (CE1GEP->hasAllZeroIndices())
+ return areGlobalsPotentiallyEqual(GV, GV2);
return ICmpInst::BAD_ICMP_PREDICATE;
}
}
@@ -1523,8 +1534,14 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
// By far the most common case to handle is when the base pointers are
// obviously to the same global.
if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
- if (CE1Op0 != CE2Op0) // Don't know relative ordering.
+ // Don't know relative ordering, but check for inequality.
+ if (CE1Op0 != CE2Op0) {
+ GEPOperator *CE2GEP = cast<GEPOperator>(CE2);
+ if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices())
+ return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0),
+ cast<GlobalValue>(CE2Op0));
return ICmpInst::BAD_ICMP_PREDICATE;
+ }
// Ok, we know that both getelementptr instructions are based on the
// same global. From this, we can precisely determine the relative
// ordering of the resultant pointers.
@@ -1570,6 +1587,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
}
}
}
+ }
default:
break;
}
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index bb8d60b..b815936 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -107,6 +107,28 @@ bool Constant::isAllOnesValue() const {
return false;
}
+bool Constant::isMinSignedValue() const {
+ // Check for INT_MIN integers
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+ return CI->isMinValue(/*isSigned=*/true);
+
+ // Check for FP which are bitcasted from INT_MIN integers
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->getValueAPF().bitcastToAPInt().isMinSignedValue();
+
+ // Check for constant vectors which are splats of INT_MIN values.
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+ if (Constant *Splat = CV->getSplatValue())
+ return Splat->isMinSignedValue();
+
+ // Likewise for constant data vectors which are splats of INT_MIN values.
+ if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+ if (Constant *Splat = CV->getSplatValue())
+ return Splat->isMinSignedValue();
+
+ return false;
+}
+
// Constructor to create a '0' constant of arbitrary type...
Constant *Constant::getNullValue(Type *Ty) {
switch (Ty->getTypeID()) {
@@ -278,35 +300,48 @@ bool Constant::canTrap() const {
return canTrapImpl(this, NonTrappingOps);
}
-/// isThreadDependent - Return true if the value can vary between threads.
-bool Constant::isThreadDependent() const {
- SmallPtrSet<const Constant*, 64> Visited;
- SmallVector<const Constant*, 64> WorkList;
- WorkList.push_back(this);
- Visited.insert(this);
+/// Check if C contains a GlobalValue for which Predicate is true.
+static bool
+ConstHasGlobalValuePredicate(const Constant *C,
+ bool (*Predicate)(const GlobalValue *)) {
+ SmallPtrSet<const Constant *, 8> Visited;
+ SmallVector<const Constant *, 8> WorkList;
+ WorkList.push_back(C);
+ Visited.insert(C);
while (!WorkList.empty()) {
- const Constant *C = WorkList.pop_back_val();
-
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (GV->isThreadLocal())
+ const Constant *WorkItem = WorkList.pop_back_val();
+ if (const auto *GV = dyn_cast<GlobalValue>(WorkItem))
+ if (Predicate(GV))
return true;
- }
-
- for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) {
- const Constant *D = dyn_cast<Constant>(C->getOperand(I));
- if (!D)
+ for (const Value *Op : WorkItem->operands()) {
+ const Constant *ConstOp = dyn_cast<Constant>(Op);
+ if (!ConstOp)
continue;
- if (Visited.insert(D))
- WorkList.push_back(D);
+ if (Visited.insert(ConstOp))
+ WorkList.push_back(ConstOp);
}
}
-
return false;
}
-/// isConstantUsed - Return true if the constant has users other than constant
-/// exprs and other dangling things.
+/// Return true if the value can vary between threads.
+bool Constant::isThreadDependent() const {
+ auto ThreadLocalPredicate = [](const GlobalValue *GV) {
+ return GV->isThreadLocal();
+ };
+ return ConstHasGlobalValuePredicate(this, ThreadLocalPredicate);
+}
+
+bool Constant::isDLLImportDependent() const {
+ auto DLLImportPredicate = [](const GlobalValue *GV) {
+ return GV->hasDLLImportStorageClass();
+ };
+ return ConstHasGlobalValuePredicate(this, DLLImportPredicate);
+}
+
+/// Return true if the constant has users other than constant exprs and other
+/// dangling things.
bool Constant::isConstantUsed() const {
for (const User *U : users()) {
const Constant *UC = dyn_cast<Constant>(U);
@@ -1698,6 +1733,19 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) {
assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
"Invalid constantexpr addrspacecast!");
+ // Canonicalize addrspacecasts between different pointer types by first
+ // bitcasting the pointer type and then converting the address space.
+ PointerType *SrcScalarTy = cast<PointerType>(C->getType()->getScalarType());
+ PointerType *DstScalarTy = cast<PointerType>(DstTy->getScalarType());
+ Type *DstElemTy = DstScalarTy->getElementType();
+ if (SrcScalarTy->getElementType() != DstElemTy) {
+ Type *MidTy = PointerType::get(DstElemTy, SrcScalarTy->getAddressSpace());
+ if (VectorType *VT = dyn_cast<VectorType>(DstTy)) {
+ // Handle vectors of pointers.
+ MidTy = VectorType::get(MidTy, VT->getNumElements());
+ }
+ C = getBitCast(C, MidTy);
+ }
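+ // E.g. an addrspacecast from i8 addrspace(1)* to i32* becomes a bitcast
+ // to i32 addrspace(1)* followed by an addrspacecast to i32*.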
return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy);
}
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 27ce503..87099a6 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -17,9 +17,9 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
@@ -35,10 +35,10 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <cassert>
#include <cstdlib>
#include <cstring>
+#include <system_error>
using namespace llvm;
@@ -281,7 +281,11 @@ char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
std::string buf;
raw_string_ostream os(buf);
- unwrap(Ty)->print(os);
+ if (unwrap(Ty))
+ unwrap(Ty)->print(os);
+ else
+ os << "Printing <null> Type";
+
os.flush();
return strdup(buf.c_str());
@@ -531,7 +535,11 @@ char* LLVMPrintValueToString(LLVMValueRef Val) {
std::string buf;
raw_string_ostream os(buf);
- unwrap(Val)->print(os);
+ if (unwrap(Val))
+ unwrap(Val)->print(os);
+ else
+ os << "Printing <null> Value";
+
os.flush();
return strdup(buf.c_str());
@@ -1286,7 +1294,7 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
}
const char *LLVMGetSection(LLVMValueRef Global) {
- return unwrap<GlobalValue>(Global)->getSection().c_str();
+ return unwrap<GlobalValue>(Global)->getSection();
}
void LLVMSetSection(LLVMValueRef Global, const char *Section) {
@@ -2598,28 +2606,24 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile(
LLVMMemoryBufferRef *OutMemBuf,
char **OutMessage) {
- std::unique_ptr<MemoryBuffer> MB;
- error_code ec;
- if (!(ec = MemoryBuffer::getFile(Path, MB))) {
- *OutMemBuf = wrap(MB.release());
- return 0;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getFile(Path);
+ if (std::error_code EC = MBOrErr.getError()) {
+ *OutMessage = strdup(EC.message().c_str());
+ return 1;
}
-
- *OutMessage = strdup(ec.message().c_str());
- return 1;
+ *OutMemBuf = wrap(MBOrErr.get().release());
+ return 0;
}
LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
char **OutMessage) {
- std::unique_ptr<MemoryBuffer> MB;
- error_code ec;
- if (!(ec = MemoryBuffer::getSTDIN(MB))) {
- *OutMemBuf = wrap(MB.release());
- return 0;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getSTDIN();
+ if (std::error_code EC = MBOrErr.getError()) {
+ *OutMessage = strdup(EC.message().c_str());
+ return 1;
}
-
- *OutMessage = strdup(ec.message().c_str());
- return 1;
+ *OutMemBuf = wrap(MBOrErr.get().release());
+ return 0;
}
LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange(
@@ -2700,11 +2704,10 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM) {
/*===-- Threading ------------------------------------------------------===*/
LLVMBool LLVMStartMultithreaded() {
- return llvm_start_multithreaded();
+ return LLVMIsMultithreaded();
}
void LLVMStopMultithreaded() {
- llvm_stop_multithreaded();
}
LLVMBool LLVMIsMultithreaded() {
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 92edacc..218787c 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -102,7 +102,8 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
StringRef Producer, bool isOptimized,
StringRef Flags, unsigned RunTimeVer,
StringRef SplitName,
- DebugEmissionKind Kind) {
+ DebugEmissionKind Kind,
+ bool EmitDebugInfo) {
assert(((Lang <= dwarf::DW_LANG_OCaml && Lang >= dwarf::DW_LANG_C89) ||
(Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -140,8 +141,14 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
MDNode *CUNode = MDNode::get(VMContext, Elts);
// Create a named metadata so that it is easier to find cu in a module.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
- NMD->addOperand(CUNode);
+ // Note that we only generate this when the caller wants to actually
+ // emit debug information. When we are only interested in tracking
+ // source line locations throughout the backend, we prevent codegen from
+ // emitting debug info in the final output by not generating llvm.dbg.cu.
+ if (EmitDebugInfo) {
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->addOperand(CUNode);
+ }
return DICompileUnit(CUNode);
}
@@ -1068,18 +1075,19 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
DITypeRef Ty,
ArrayRef<Value *> Addr,
unsigned ArgNo) {
- SmallVector<Value *, 15> Elts;
- Elts.push_back(GetTagConstant(VMContext, Tag));
- Elts.push_back(getNonCompileUnitScope(Scope)),
- Elts.push_back(MDString::get(VMContext, Name));
- Elts.push_back(F);
- Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext),
- (LineNo | (ArgNo << 24))));
- Elts.push_back(Ty);
- Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
- Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
- Elts.append(Addr.begin(), Addr.end());
-
+ assert(Addr.size() > 0 && "complex address is empty");
+ Value *Elts[] = {
+ GetTagConstant(VMContext, Tag),
+ getNonCompileUnitScope(Scope),
+ MDString::get(VMContext, Name),
+ F,
+ ConstantInt::get(Type::getInt32Ty(VMContext),
+ (LineNo | (ArgNo << 24))),
+ Ty,
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ MDNode::get(VMContext, Addr)
+ };
return DIVariable(MDNode::get(VMContext, Elts));
}
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index db9e56d..5e39b24 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -138,8 +138,14 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
}
}
-unsigned DIVariable::getNumAddrElements() const {
- return DbgNode->getNumOperands() - 8;
+uint64_t DIVariable::getAddrElement(unsigned Idx) const {
+ DIDescriptor ComplexExpr = getDescriptorField(8);
+ if (Idx < ComplexExpr->getNumOperands())
+ if (auto *CI = dyn_cast_or_null<ConstantInt>(ComplexExpr->getOperand(Idx)))
+ return CI->getZExtValue();
+
+ assert(false && "non-existing complex address element requested");
+ return 0;
}
/// getInlinedAt - If this variable is inlined then return inline location.
@@ -566,7 +572,13 @@ bool DIVariable::Verify() const {
// Make sure that type @ field 5 is a DITypeRef.
if (!fieldIsTypeRef(DbgNode, 5))
return false;
- return DbgNode->getNumOperands() >= 8;
+
+ // Variable without a complex expression.
+ if (DbgNode->getNumOperands() == 8)
+ return true;
+
+ // Make sure the complex expression is an MDNode.
+ return (DbgNode->getNumOperands() == 9 && fieldIsMDNode(DbgNode, 8));
}
/// Verify - Verify that a location descriptor is well formed.
@@ -1514,3 +1526,23 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
return 0;
return cast<ConstantInt>(Val)->getZExtValue();
}
+
+llvm::DenseMap<const llvm::Function *, llvm::DISubprogram>
+llvm::makeSubprogramMap(const Module &M) {
+ DenseMap<const Function *, DISubprogram> R;
+
+ NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes)
+ return R;
+
+ for (MDNode *N : CU_Nodes->operands()) {
+ DICompileUnit CUNode(N);
+ DIArray SPs = CUNode.getSubprograms();
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+ DISubprogram SP(SPs.getElement(i));
+ if (Function *F = SP.getFunction())
+ R.insert(std::make_pair(F, SP));
+ }
+ }
+ return R;
+}
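
makeSubprogramMap is a new convenience: one walk over llvm.dbg.cu yields constant-time Function-to-DISubprogram lookups instead of rescanning every compile unit's subprogram list per query. A small usage sketch:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/DebugInfo.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void printSubprogramLines(const Module &M) {
      DenseMap<const Function *, DISubprogram> SPMap = makeSubprogramMap(M);
      for (const Function &F : M) {
        auto I = SPMap.find(&F);
        if (I != SPMap.end())
          errs() << F.getName() << " defined at line "
                 << I->second.getLineNumber() << "\n";
      }
    }
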
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index 43360d3..e8bdcce 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -76,7 +76,7 @@ MDNode *DebugLoc::getScopeNode(const LLVMContext &Ctx) const {
return getScope(Ctx);
}
-DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) {
+DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) const {
const MDNode *Scope = getScopeNode(Ctx);
DISubprogram SP = getDISubprogram(Scope);
if (SP.isSubprogram()) {
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 6eeb162..2727063 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -128,7 +128,7 @@ void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const {
}
bool DiagnosticInfoOptimizationRemarkBase::isLocationAvailable() const {
- return getFunction().getParent()->getNamedMetadata("llvm.dbg.cu") != nullptr;
+ return getDebugLoc().isUnknown() == false;
}
void DiagnosticInfoOptimizationRemarkBase::getLocation(StringRef *Filename,
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index fe32c46..1443571 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -735,6 +735,11 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
#include "llvm/IR/Intrinsics.gen"
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+// This defines the "Intrinsic::getIntrinsicForMSBuiltin()" method.
+#define GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
+
/// hasAddressTaken - returns true if there are any uses of this function
/// other than direct calls or invokes to it.
bool Function::hasAddressTaken(const User* *PutOffender) const {
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index f2099d6..1667401 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -19,8 +19,8 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
+#include <system_error>
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -438,11 +438,15 @@ class LineConsumer {
StringRef Remaining;
public:
LineConsumer(StringRef Filename) {
- if (error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buffer)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = BufferOrErr.getError()) {
errs() << Filename << ": " << EC.message() << "\n";
Remaining = "";
- } else
+ } else {
+ Buffer = std::move(BufferOrErr.get());
Remaining = Buffer->getBuffer();
+ }
}
bool empty() { return Remaining.empty(); }
void printNext(raw_ostream &OS, uint32_t LineNum) {
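
This LineConsumer hunk shows the buffer-API migration repeated throughout the patch: MemoryBuffer out-parameters and llvm::error_code give way to ErrorOr<std::unique_ptr<MemoryBuffer>> and std::error_code. The idiom, extracted as a standalone sketch:

    #include "llvm/Support/ErrorOr.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"
    #include <memory>
    #include <system_error>
    using namespace llvm;

    // Ownership now arrives inside the ErrorOr; on failure the error_code
    // is extracted with getError() rather than returned through the call.
    static std::unique_ptr<MemoryBuffer> readFileOrWarn(StringRef Path) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
          MemoryBuffer::getFileOrSTDIN(Path);
      if (std::error_code EC = BufOrErr.getError()) {
        errs() << Path << ": " << EC.message() << "\n";
        return nullptr;
      }
      return std::move(BufOrErr.get());
    }
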
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index c905cfe..244e3e4 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LeakDetector.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -59,9 +60,16 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
}
unsigned GlobalValue::getAlignment() const {
- if (auto *GA = dyn_cast<GlobalAlias>(this))
- return GA->getAliasee()->getAlignment();
-
+ if (auto *GA = dyn_cast<GlobalAlias>(this)) {
+ // In general we cannot compute this at the IR level, but we try.
+ if (const GlobalObject *GO = GA->getBaseObject())
+ return GO->getAlignment();
+
+ // FIXME: we should also be able to handle:
+ // Alias = Global + Offset
+ // Alias = Absolute
+ return 0;
+ }
return cast<GlobalObject>(this)->getAlignment();
}
@@ -80,12 +88,26 @@ void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
setSection(GV->getSection());
}
-const std::string &GlobalValue::getSection() const {
- if (auto *GA = dyn_cast<GlobalAlias>(this))
- return GA->getAliasee()->getSection();
+const char *GlobalValue::getSection() const {
+ if (auto *GA = dyn_cast<GlobalAlias>(this)) {
+ // In general we cannot compute this at the IR level, but we try.
+ if (const GlobalObject *GO = GA->getBaseObject())
+ return GO->getSection();
+ return "";
+ }
return cast<GlobalObject>(this)->getSection();
}
+Comdat *GlobalValue::getComdat() {
+ if (auto *GA = dyn_cast<GlobalAlias>(this)) {
+ // In general we cannot compute this at the IR level, but we try.
+ if (const GlobalObject *GO = GA->getBaseObject())
+ return const_cast<GlobalObject *>(GO)->getComdat();
+ return nullptr;
+ }
+ return cast<GlobalObject>(this)->getComdat();
+}
+
void GlobalObject::setSection(StringRef S) { Section = S; }
bool GlobalValue::isDeclaration() const {
@@ -113,8 +135,9 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
: GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal,
OperandTraits<GlobalVariable>::op_begin(this),
InitVal != nullptr, Link, Name),
- isConstantGlobal(constant), threadLocalMode(TLMode),
+ isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
+ setThreadLocalMode(TLMode);
if (InitVal) {
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
@@ -132,8 +155,9 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
: GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal,
OperandTraits<GlobalVariable>::op_begin(this),
InitVal != nullptr, Link, Name),
- isConstantGlobal(constant), threadLocalMode(TLMode),
+ isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
+ setThreadLocalMode(TLMode);
if (InitVal) {
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
@@ -214,7 +238,7 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
//===----------------------------------------------------------------------===//
GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
- const Twine &Name, GlobalObject *Aliasee,
+ const Twine &Name, Constant *Aliasee,
Module *ParentModule)
: GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalAliasVal,
&Op<0>(), 1, Link, Name) {
@@ -227,7 +251,7 @@ GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace,
LinkageTypes Link, const Twine &Name,
- GlobalObject *Aliasee, Module *ParentModule) {
+ Constant *Aliasee, Module *ParentModule) {
return new GlobalAlias(Ty, AddressSpace, Link, Name, Aliasee, ParentModule);
}
@@ -239,18 +263,18 @@ GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace,
GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace,
LinkageTypes Linkage, const Twine &Name,
- GlobalObject *Aliasee) {
+ GlobalValue *Aliasee) {
return create(Ty, AddressSpace, Linkage, Name, Aliasee, Aliasee->getParent());
}
GlobalAlias *GlobalAlias::create(LinkageTypes Link, const Twine &Name,
- GlobalObject *Aliasee) {
+ GlobalValue *Aliasee) {
PointerType *PTy = Aliasee->getType();
return create(PTy->getElementType(), PTy->getAddressSpace(), Link, Name,
Aliasee);
}
-GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalObject *Aliasee) {
+GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalValue *Aliasee) {
return create(Aliasee->getLinkage(), Name, Aliasee);
}
@@ -270,4 +294,8 @@ void GlobalAlias::eraseFromParent() {
getParent()->getAliasList().erase(this);
}
-void GlobalAlias::setAliasee(GlobalObject *Aliasee) { setOperand(0, Aliasee); }
+void GlobalAlias::setAliasee(Constant *Aliasee) {
+ assert((!Aliasee || Aliasee->getType() == getType()) &&
+ "Alias and aliasee types should match!");
+ setOperand(0, Aliasee);
+}
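
The Globals.cpp changes above relax GlobalAlias from aliasing a GlobalObject to aliasing any Constant, which is why getAlignment/getSection/getComdat must now tolerate a failed getBaseObject(). A hedged sketch of what the widened create() signature permits (the alias name and types here are arbitrary):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // An aliasee may now be a ConstantExpr such as a bitcast; note the new
    // setAliasee() assert requires the alias and aliasee types to match.
    static GlobalAlias *makeBitcastAlias(Module &M, Function &F,
                                         PointerType *AliasTy) {
      Constant *Aliasee = ConstantExpr::getBitCast(&F, AliasTy);
      return GlobalAlias::create(AliasTy->getElementType(),
                                 AliasTy->getAddressSpace(), F.getLinkage(),
                                 "f_alias", Aliasee, &M);
    }
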
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 28cc4cb..86421c4 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -145,31 +145,31 @@ void Instruction::setFastMathFlags(FastMathFlags FMF) {
/// Determine whether the unsafe-algebra flag is set.
bool Instruction::hasUnsafeAlgebra() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
}
/// Determine whether the no-NaNs flag is set.
bool Instruction::hasNoNaNs() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->hasNoNaNs();
}
/// Determine whether the no-infs flag is set.
bool Instruction::hasNoInfs() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->hasNoInfs();
}
/// Determine whether the no-signed-zeros flag is set.
bool Instruction::hasNoSignedZeros() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->hasNoSignedZeros();
}
/// Determine whether the allow-reciprocal flag is set.
bool Instruction::hasAllowReciprocal() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->hasAllowReciprocal();
}
@@ -177,7 +177,7 @@ bool Instruction::hasAllowReciprocal() const {
/// operator which supports these flags. See LangRef.html for the meaning of
/// these flags.
FastMathFlags Instruction::getFastMathFlags() const {
- assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->getFastMathFlags();
}
@@ -300,6 +300,7 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope();
if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1))
return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() &&
+ CXI->isWeak() == cast<AtomicCmpXchgInst>(I2)->isWeak() &&
CXI->getSuccessOrdering() ==
cast<AtomicCmpXchgInst>(I2)->getSuccessOrdering() &&
CXI->getFailureOrdering() ==
@@ -331,6 +332,10 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
getType() != I->getType())
return false;
+ // If both instructions have no operands, they are identical.
+ if (getNumOperands() == 0 && I->getNumOperands() == 0)
+ return haveSameSpecialState(this, I);
+
// We have two instructions of identical opcode and #operands. Check to see
// if all operands are the same.
if (!std::equal(op_begin(), op_end(), I->op_begin()))
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 13c51b8..a5ceacb 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -248,7 +248,7 @@ void LandingPadInst::growOperands(unsigned Size) {
Use::zap(OldOps, OldOps + e, true);
}
-void LandingPadInst::addClause(Value *Val) {
+void LandingPadInst::addClause(Constant *Val) {
unsigned OpNo = getNumOperands();
growOperands(1);
assert(OpNo < ReservedSpace && "Growing didn't work!");
@@ -1251,10 +1251,11 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering FailureOrdering,
SynchronizationScope SynchScope,
Instruction *InsertBefore)
- : Instruction(Cmp->getType(), AtomicCmpXchg,
- OperandTraits<AtomicCmpXchgInst>::op_begin(this),
- OperandTraits<AtomicCmpXchgInst>::operands(this),
- InsertBefore) {
+ : Instruction(
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
+ nullptr),
+ AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+ OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
}
@@ -1263,13 +1264,14 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering FailureOrdering,
SynchronizationScope SynchScope,
BasicBlock *InsertAtEnd)
- : Instruction(Cmp->getType(), AtomicCmpXchg,
- OperandTraits<AtomicCmpXchgInst>::op_begin(this),
- OperandTraits<AtomicCmpXchgInst>::operands(this),
- InsertAtEnd) {
+ : Instruction(
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
+ nullptr),
+ AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+ OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
}
-
+
//===----------------------------------------------------------------------===//
// AtomicRMWInst Implementation
//===----------------------------------------------------------------------===//
@@ -2331,18 +2333,12 @@ unsigned CastInst::isEliminableCastPair(
// Allowed, use first cast's opcode
return firstOp;
case 14:
- // FIXME: this state can be merged with (2), but the following assert
- // is useful to check the correcteness of the sequence due to semantic
- // change of bitcast.
- assert(
- SrcTy->isPtrOrPtrVectorTy() &&
- MidTy->isPtrOrPtrVectorTy() &&
- DstTy->isPtrOrPtrVectorTy() &&
- SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
- MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
- "Illegal bitcast, addrspacecast sequence!");
- // Allowed, use second cast's opcode
- return secondOp;
+ // bitcast, addrspacecast -> addrspacecast if the element type of
+ // bitcast's source is the same as that of addrspacecast's destination.
+ if (SrcTy->getPointerElementType() == DstTy->getPointerElementType())
+ return Instruction::AddrSpaceCast;
+ return 0;
+
case 15:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -3610,6 +3606,7 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::clone_impl() const {
getSuccessOrdering(), getFailureOrdering(),
getSynchScope());
Result->setVolatile(isVolatile());
+ Result->setWeak(isWeak());
return Result;
}
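
These Instructions.cpp hunks retype cmpxchg: the instruction now produces { ValueTy, i1 } — the old memory value plus a success bit — and carries a weak flag that cloning and identity comparison must preserve. A builder-level sketch of consuming the new result pair:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Both results of the compare-and-swap come from one instruction now:
    // element 0 is the value that was in memory, element 1 is an i1 that
    // is true iff the store happened.
    static Value *emitCmpXchgLoaded(IRBuilder<> &B, Value *Ptr, Value *Cmp,
                                    Value *New) {
      AtomicCmpXchgInst *CX = B.CreateAtomicCmpXchg(
          Ptr, Cmp, New, SequentiallyConsistent, SequentiallyConsistent);
      Value *Loaded = B.CreateExtractValue(CX, 0);
      Value *Succeeded = B.CreateExtractValue(CX, 1);
      (void)Succeeded; // e.g. branch on this to retry a weak cmpxchg
      return Loaded;
    }
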
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 4d932d0..59137e4 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -663,7 +663,7 @@ void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
// Otherwise, we're removing metadata from an instruction.
assert((hasMetadataHashEntry() ==
- getContext().pImpl->MetadataStore.count(this)) &&
+ (getContext().pImpl->MetadataStore.count(this) > 0)) &&
"HasMetadata bit out of date!");
if (!hasMetadataHashEntry())
return; // Nothing to remove!
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 5dbed69..f1b1f9a 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -24,6 +24,8 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LeakDetector.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/RandomNumberGenerator.h"
#include <algorithm>
#include <cstdarg>
#include <cstdlib>
@@ -44,7 +46,7 @@ template class llvm::SymbolTableListTraits<GlobalAlias, Module>;
//
Module::Module(StringRef MID, LLVMContext &C)
- : Context(C), Materializer(), ModuleID(MID), DL("") {
+ : Context(C), Materializer(), ModuleID(MID), RNG(nullptr), DL("") {
ValSymTab = new ValueSymbolTable();
NamedMDSymTab = new StringMap<NamedMDNode *>();
Context.addModule(this);
@@ -59,6 +61,7 @@ Module::~Module() {
NamedMDList.clear();
delete ValSymTab;
delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
+ delete RNG;
}
/// getNamedValue - Return the first global value in the module with
@@ -355,6 +358,16 @@ const DataLayout *Module::getDataLayout() const {
return &DL;
}
+// We want reproducible builds, but ModuleID may be a full path so we just use
+// the filename to salt the RNG (although it is not guaranteed to be unique).
+RandomNumberGenerator &Module::getRNG() const {
+ if (RNG == nullptr) {
+ StringRef Salt = sys::path::filename(ModuleID);
+ RNG = new RandomNumberGenerator(Salt);
+ }
+ return *RNG;
+}
+
//===----------------------------------------------------------------------===//
// Methods to control the materialization of GlobalValues in the Module.
//
@@ -381,7 +394,7 @@ bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
if (!Materializer)
return false;
- error_code EC = Materializer->Materialize(GV);
+ std::error_code EC = Materializer->Materialize(GV);
if (!EC)
return false;
if (ErrInfo)
@@ -394,18 +407,21 @@ void Module::Dematerialize(GlobalValue *GV) {
return Materializer->Dematerialize(GV);
}
-error_code Module::materializeAll() {
+std::error_code Module::materializeAll() {
if (!Materializer)
- return error_code::success();
+ return std::error_code();
return Materializer->MaterializeModule(this);
}
-error_code Module::materializeAllPermanently() {
- if (error_code EC = materializeAll())
+std::error_code Module::materializeAllPermanently(bool ReleaseBuffer) {
+ if (std::error_code EC = materializeAll())
return EC;
+ if (ReleaseBuffer)
+ Materializer->releaseBuffer();
+
Materializer.reset();
- return error_code::success();
+ return std::error_code();
}
//===----------------------------------------------------------------------===//
@@ -421,14 +437,14 @@ error_code Module::materializeAllPermanently() {
// has "dropped all references", except operator delete.
//
void Module::dropAllReferences() {
- for(Module::iterator I = begin(), E = end(); I != E; ++I)
- I->dropAllReferences();
+ for (Function &F : *this)
+ F.dropAllReferences();
- for(Module::global_iterator I = global_begin(), E = global_end(); I != E; ++I)
- I->dropAllReferences();
+ for (GlobalVariable &GV : globals())
+ GV.dropAllReferences();
- for(Module::alias_iterator I = alias_begin(), E = alias_end(); I != E; ++I)
- I->dropAllReferences();
+ for (GlobalAlias &GA : aliases())
+ GA.dropAllReferences();
}
unsigned Module::getDwarfVersion() const {
@@ -437,3 +453,11 @@ unsigned Module::getDwarfVersion() const {
return dwarf::DWARF_VERSION;
return cast<ConstantInt>(Val)->getZExtValue();
}
+
+Comdat *Module::getOrInsertComdat(StringRef Name) {
+ Comdat C;
+ StringMapEntry<Comdat> &Entry =
+ ComdatSymTab.GetOrCreateValue(Name, std::move(C));
+ Entry.second.Name = &Entry;
+ return &Entry.second;
+}
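
getOrInsertComdat gives each module a name-keyed comdat symbol table; globals then point at the entries. A sketch, assuming the GlobalObject::setComdat hook that this patch set introduces alongside the getComdat accessor above:

    #include "llvm/IR/Comdat.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Comdat::Any needs no matching global name; other selection kinds do,
    // as the Verifier change later in this patch enforces.
    static void putInComdat(Module &M, GlobalVariable &GV) {
      Comdat *C = M.getOrInsertComdat(GV.getName());
      C->setSelectionKind(Comdat::Any);
      GV.setComdat(C); // assumed companion setter on GlobalObject
    }
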
diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp
index bb55d2a..91d86ae 100644
--- a/lib/IR/Pass.cpp
+++ b/lib/IR/Pass.cpp
@@ -199,14 +199,6 @@ Pass *Pass::createPass(AnalysisID ID) {
return PI->createPass();
}
-Pass *PassInfo::createPass() const {
- assert((!isAnalysisGroup() || NormalCtor) &&
- "No default implementation found for analysis group!");
- assert(NormalCtor &&
- "Cannot call createPass on PassInfo without default ctor!");
- return NormalCtor();
-}
-
//===----------------------------------------------------------------------===//
// Analysis Group Implementation Code
//===----------------------------------------------------------------------===//
@@ -224,17 +216,6 @@ RegisterAGBase::RegisterAGBase(const char *Name, const void *InterfaceID,
// PassRegistrationListener implementation
//
-// PassRegistrationListener ctor - Add the current object to the list of
-// PassRegistrationListeners...
-PassRegistrationListener::PassRegistrationListener() {
- PassRegistry::getPassRegistry()->addRegistrationListener(this);
-}
-
-// dtor - Remove object from list of listeners...
-PassRegistrationListener::~PassRegistrationListener() {
- PassRegistry::getPassRegistry()->removeRegistrationListener(this);
-}
-
// enumeratePasses - Iterate over the registered passes, calling the
// passEnumerate callback on each PassInfo object.
//
@@ -242,7 +223,16 @@ void PassRegistrationListener::enumeratePasses() {
PassRegistry::getPassRegistry()->enumerateWith(this);
}
-PassNameParser::~PassNameParser() {}
+PassNameParser::PassNameParser()
+ : Opt(nullptr) {
+ PassRegistry::getPassRegistry()->addRegistrationListener(this);
+}
+
+PassNameParser::~PassNameParser() {
+ // This only gets called during static destruction, in which case the
+ // PassRegistry will have already been destroyed by llvm_shutdown(). So
+ // attempting to remove the registration listener is an error.
+}
//===----------------------------------------------------------------------===//
// AnalysisUsage Class Implementation
diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp
index 6a5bee2..91940a9 100644
--- a/lib/IR/PassRegistry.cpp
+++ b/lib/IR/PassRegistry.cpp
@@ -13,14 +13,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/PassRegistry.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Function.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
#include "llvm/Support/RWMutex.h"
#include <vector>
@@ -36,62 +32,23 @@ PassRegistry *PassRegistry::getPassRegistry() {
return &*PassRegistryObj;
}
-static ManagedStatic<sys::SmartRWMutex<true> > Lock;
-
-//===----------------------------------------------------------------------===//
-// PassRegistryImpl
-//
-
-namespace {
-struct PassRegistryImpl {
- /// PassInfoMap - Keep track of the PassInfo object for each registered pass.
- typedef DenseMap<const void*, const PassInfo*> MapType;
- MapType PassInfoMap;
-
- typedef StringMap<const PassInfo*> StringMapType;
- StringMapType PassInfoStringMap;
-
- /// AnalysisGroupInfo - Keep track of information for each analysis group.
- struct AnalysisGroupInfo {
- SmallPtrSet<const PassInfo *, 8> Implementations;
- };
- DenseMap<const PassInfo*, AnalysisGroupInfo> AnalysisGroupInfoMap;
-
- std::vector<std::unique_ptr<const PassInfo>> ToFree;
- std::vector<PassRegistrationListener*> Listeners;
-};
-} // end anonymous namespace
-
-void *PassRegistry::getImpl() const {
- if (!pImpl)
- pImpl = new PassRegistryImpl();
- return pImpl;
-}
-
//===----------------------------------------------------------------------===//
// Accessors
//
PassRegistry::~PassRegistry() {
- sys::SmartScopedWriter<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(pImpl);
- delete Impl;
- pImpl = nullptr;
}
const PassInfo *PassRegistry::getPassInfo(const void *TI) const {
- sys::SmartScopedReader<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.find(TI);
- return I != Impl->PassInfoMap.end() ? I->second : nullptr;
+ sys::SmartScopedReader<true> Guard(Lock);
+ MapType::const_iterator I = PassInfoMap.find(TI);
+ return I != PassInfoMap.end() ? I->second : nullptr;
}
const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
- sys::SmartScopedReader<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- PassRegistryImpl::StringMapType::const_iterator
- I = Impl->PassInfoStringMap.find(Arg);
- return I != Impl->PassInfoStringMap.end() ? I->second : nullptr;
+ sys::SmartScopedReader<true> Guard(Lock);
+ StringMapType::const_iterator I = PassInfoStringMap.find(Arg);
+ return I != PassInfoStringMap.end() ? I->second : nullptr;
}
//===----------------------------------------------------------------------===//
@@ -99,39 +56,34 @@ const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
//
void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) {
- sys::SmartScopedWriter<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+ sys::SmartScopedWriter<true> Guard(Lock);
bool Inserted =
- Impl->PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
+ PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
assert(Inserted && "Pass registered multiple times!");
(void)Inserted;
- Impl->PassInfoStringMap[PI.getPassArgument()] = &PI;
+ PassInfoStringMap[PI.getPassArgument()] = &PI;
// Notify any listeners.
for (std::vector<PassRegistrationListener*>::iterator
- I = Impl->Listeners.begin(), E = Impl->Listeners.end(); I != E; ++I)
+ I = Listeners.begin(), E = Listeners.end(); I != E; ++I)
(*I)->passRegistered(&PI);
- if (ShouldFree) Impl->ToFree.push_back(std::unique_ptr<const PassInfo>(&PI));
+ if (ShouldFree) ToFree.push_back(std::unique_ptr<const PassInfo>(&PI));
}
void PassRegistry::unregisterPass(const PassInfo &PI) {
- sys::SmartScopedWriter<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- PassRegistryImpl::MapType::iterator I =
- Impl->PassInfoMap.find(PI.getTypeInfo());
- assert(I != Impl->PassInfoMap.end() && "Pass registered but not in map!");
+ sys::SmartScopedWriter<true> Guard(Lock);
+ MapType::iterator I = PassInfoMap.find(PI.getTypeInfo());
+ assert(I != PassInfoMap.end() && "Pass registered but not in map!");
// Remove pass from the map.
- Impl->PassInfoMap.erase(I);
- Impl->PassInfoStringMap.erase(PI.getPassArgument());
+ PassInfoMap.erase(I);
+ PassInfoStringMap.erase(PI.getPassArgument());
}
void PassRegistry::enumerateWith(PassRegistrationListener *L) {
- sys::SmartScopedReader<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- for (PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.begin(),
- E = Impl->PassInfoMap.end(); I != E; ++I)
+ sys::SmartScopedReader<true> Guard(Lock);
+ for (auto I = PassInfoMap.begin(), E = PassInfoMap.end(); I != E; ++I)
L->passEnumerate(I->second);
}
@@ -156,15 +108,13 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
assert(ImplementationInfo &&
"Must register pass before adding to AnalysisGroup!");
- sys::SmartScopedWriter<true> Guard(*Lock);
+ sys::SmartScopedWriter<true> Guard(Lock);
// Make sure we keep track of the fact that the implementation implements
// the interface.
ImplementationInfo->addInterfaceImplemented(InterfaceInfo);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- PassRegistryImpl::AnalysisGroupInfo &AGI =
- Impl->AnalysisGroupInfoMap[InterfaceInfo];
+ AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo];
assert(AGI.Implementations.count(ImplementationInfo) == 0 &&
"Cannot add a pass to the same analysis group more than once!");
AGI.Implementations.insert(ImplementationInfo);
@@ -179,30 +129,18 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
}
}
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
if (ShouldFree)
- Impl->ToFree.push_back(std::unique_ptr<const PassInfo>(&Registeree));
+ ToFree.push_back(std::unique_ptr<const PassInfo>(&Registeree));
}
void PassRegistry::addRegistrationListener(PassRegistrationListener *L) {
- sys::SmartScopedWriter<true> Guard(*Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- Impl->Listeners.push_back(L);
+ sys::SmartScopedWriter<true> Guard(Lock);
+ Listeners.push_back(L);
}
void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) {
- sys::SmartScopedWriter<true> Guard(*Lock);
-
- // NOTE: This is necessary, because removeRegistrationListener() can be called
- // as part of the llvm_shutdown sequence. Since we have no control over the
- // order of that sequence, we need to gracefully handle the case where the
- // PassRegistry is destructed before the object that triggers this call.
- if (!pImpl) return;
+ sys::SmartScopedWriter<true> Guard(Lock);
- PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
- std::vector<PassRegistrationListener*>::iterator I =
- std::find(Impl->Listeners.begin(), Impl->Listeners.end(), L);
- assert(I != Impl->Listeners.end() &&
- "PassRegistrationListener not registered!");
- Impl->Listeners.erase(I);
+ auto I = std::find(Listeners.begin(), Listeners.end(), L);
+ Listeners.erase(I);
}
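
Taken together, the Pass.cpp and PassRegistry.cpp changes move listener registration into PassNameParser's constructor, drop the deregistration (the registry is already gone during static destruction after llvm_shutdown()), and fold PassRegistryImpl into PassRegistry itself behind a member lock. The usual client is a command-line pass list in the style of opt; a sketch under that assumption:

    #include "llvm/IR/LegacyPassNameParser.h"
    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Every PassInfo registered in the process becomes a -<pass-arg>
    // option; the parser learns about registrations through the listener
    // callbacks it now signs up for in its constructor.
    static cl::list<const PassInfo *, bool, PassNameParser>
        PassList(cl::desc("Optimizations available:"));
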
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index d734e4e..35c241a 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InstrTypes.h"
@@ -38,13 +39,12 @@ using namespace llvm;
static inline Type *checkType(Type *Ty) {
assert(Ty && "Value defined with a null type: Error!");
- return const_cast<Type*>(Ty);
+ return Ty;
}
Value::Value(Type *ty, unsigned scid)
- : SubclassID(scid), HasValueHandle(0),
- SubclassOptionalData(0), SubclassData(0), VTy((Type*)checkType(ty)),
- UseList(nullptr), Name(nullptr) {
+ : VTy(checkType(ty)), UseList(nullptr), Name(nullptr), SubclassID(scid),
+ HasValueHandle(0), SubclassOptionalData(0), SubclassData(0) {
// FIXME: Why isn't this in the subclass gunk??
// Note, we cannot call isa<CallInst> before the CallInst has been
// constructed.
@@ -214,7 +214,7 @@ void Value::setName(const Twine &NewName) {
// then reallocated.
// Create the new name.
- Name = ValueName::Create(NameRef.begin(), NameRef.end());
+ Name = ValueName::Create(NameRef);
Name->setValue(this);
return;
}
@@ -301,27 +301,6 @@ void Value::takeName(Value *V) {
ST->reinsertValue(this);
}
-static GlobalObject &findReplacementForAliasUse(Value &C) {
- if (auto *GO = dyn_cast<GlobalObject>(&C))
- return *GO;
- if (auto *GA = dyn_cast<GlobalAlias>(&C))
- return *GA->getAliasee();
- auto *CE = cast<ConstantExpr>(&C);
- assert(CE->getOpcode() == Instruction::BitCast ||
- CE->getOpcode() == Instruction::GetElementPtr ||
- CE->getOpcode() == Instruction::AddrSpaceCast);
- if (CE->getOpcode() == Instruction::GetElementPtr)
- assert(cast<GEPOperator>(CE)->hasAllZeroIndices());
- return findReplacementForAliasUse(*CE->getOperand(0));
-}
-
-static void replaceAliasUseWith(Use &U, Value *New) {
- GlobalObject &Replacement = findReplacementForAliasUse(*New);
- assert(&cast<GlobalObject>(*U) != &Replacement &&
- "replaceAliasUseWith cannot form an alias cycle");
- U.set(&Replacement);
-}
-
#ifndef NDEBUG
static bool contains(SmallPtrSet<ConstantExpr *, 4> &Cache, ConstantExpr *Expr,
Constant *C) {
@@ -373,10 +352,6 @@ void Value::replaceAllUsesWith(Value *New) {
// Must handle Constants specially, we cannot call replaceUsesOfWith on a
// constant because they are uniqued.
if (auto *C = dyn_cast<Constant>(U.getUser())) {
- if (isa<GlobalAlias>(C)) {
- replaceAliasUseWith(U, New);
- continue;
- }
if (!isa<GlobalValue>(C)) {
C->replaceUsesOfWithOnConstant(this, New, &U);
continue;
@@ -498,18 +473,33 @@ Value *Value::stripInBoundsOffsets() {
/// isDereferenceablePointer - Test if this value is always a pointer to
/// allocated and suitably aligned memory for a simple load or store.
-static bool isDereferenceablePointer(const Value *V,
+static bool isDereferenceablePointer(const Value *V, const DataLayout *DL,
SmallPtrSet<const Value *, 32> &Visited) {
// Note that it is not safe to speculate into a malloc'd region because
// malloc may return null.
- // It's also not always safe to follow a bitcast, for example:
- // bitcast i8* (alloca i8) to i32*
- // would result in a 4-byte load from a 1-byte alloca. Some cases could
- // be handled using DataLayout to check sizes and alignments though.
// These are obviously ok.
if (isa<AllocaInst>(V)) return true;
+ // It's not always safe to follow a bitcast, for example:
+ // bitcast i8* (alloca i8) to i32*
+ // would result in a 4-byte load from a 1-byte alloca. However,
+ // if we're casting from a pointer to a type of larger size
+ // to a type of smaller size (or the same size), and the alignment
+ // is at least as large as for the resulting pointer type, then
+ // we can look through the bitcast.
+ if (DL)
+ if (const BitCastInst* BC = dyn_cast<BitCastInst>(V)) {
+ Type *STy = BC->getSrcTy()->getPointerElementType(),
+ *DTy = BC->getDestTy()->getPointerElementType();
+ if (STy->isSized() && DTy->isSized() &&
+ (DL->getTypeStoreSize(STy) >=
+ DL->getTypeStoreSize(DTy)) &&
+ (DL->getABITypeAlignment(STy) >=
+ DL->getABITypeAlignment(DTy)))
+ return isDereferenceablePointer(BC->getOperand(0), DL, Visited);
+ }
+
// Global variables which can't collapse to null are ok.
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
return !GV->hasExternalWeakLinkage();
@@ -523,7 +513,7 @@ static bool isDereferenceablePointer(const Value *V,
// Conservatively require that the base pointer be fully dereferenceable.
if (!Visited.insert(GEP->getOperand(0)))
return false;
- if (!isDereferenceablePointer(GEP->getOperand(0), Visited))
+ if (!isDereferenceablePointer(GEP->getOperand(0), DL, Visited))
return false;
// Check the indices.
gep_type_iterator GTI = gep_type_begin(GEP);
@@ -559,9 +549,9 @@ static bool isDereferenceablePointer(const Value *V,
/// isDereferenceablePointer - Test if this value is always a pointer to
/// allocated and suitably aligned memory for a simple load or store.
-bool Value::isDereferenceablePointer() const {
+bool Value::isDereferenceablePointer(const DataLayout *DL) const {
SmallPtrSet<const Value *, 32> Visited;
- return ::isDereferenceablePointer(this, Visited);
+ return ::isDereferenceablePointer(this, DL, Visited);
}
/// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
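
With a DataLayout in hand, isDereferenceablePointer can now look through size- and alignment-shrinking bitcasts instead of rejecting all of them: the i8*-to-i32* case from the old comment stays unsafe, while e.g. an i64* alloca viewed as i32* (store size 8 >= 4 bytes, ABI alignment 8 >= 4) is accepted. Sketch of the new entry point:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Pass the module's DataLayout when available; with DL == nullptr the
    // analysis falls back to the old, bitcast-rejecting behavior.
    static bool safeToSpeculateLoad(const Value *V, const DataLayout *DL) {
      return V->isDereferenceablePointer(DL);
    }
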
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index bcc38c1..314bad3 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -107,6 +107,12 @@ struct VerifierSupport {
OS << ' ' << *T;
}
+ void WriteComdat(const Comdat *C) {
+ if (!C)
+ return;
+ OS << *C;
+ }
+
// CheckFailed - A check failed, so print out the condition and the message
// that failed. This provides a nice place to put a breakpoint if you want
// to see why something is not correct.
@@ -138,6 +144,12 @@ struct VerifierSupport {
WriteType(T3);
Broken = true;
}
+
+ void CheckFailed(const Twine &Message, const Comdat *C) {
+ OS << Message.str() << "\n";
+ WriteComdat(C);
+ Broken = true;
+ }
};
class Verifier : public InstVisitor<Verifier>, VerifierSupport {
friend class InstVisitor<Verifier>;
@@ -230,6 +242,9 @@ public:
I != E; ++I)
visitNamedMDNode(*I);
+ for (const StringMapEntry<Comdat> &SMEC : M.getComdatSymbolTable())
+ visitComdat(SMEC.getValue());
+
visitModuleFlags(M);
visitModuleIdents(M);
@@ -241,8 +256,12 @@ private:
void visitGlobalValue(const GlobalValue &GV);
void visitGlobalVariable(const GlobalVariable &GV);
void visitGlobalAlias(const GlobalAlias &GA);
+ void visitAliaseeSubExpr(const GlobalAlias &A, const Constant &C);
+ void visitAliaseeSubExpr(SmallPtrSet<const GlobalAlias *, 4> &Visited,
+ const GlobalAlias &A, const Constant &C);
void visitNamedMDNode(const NamedMDNode &NMD);
void visitMDNode(MDNode &MD, Function *F);
+ void visitComdat(const Comdat &C);
void visitModuleIdents(const Module &M);
void visitModuleFlags(const Module &M);
void visitModuleFlag(const MDNode *Op,
@@ -384,6 +403,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
"'common' global must have a zero initializer!", &GV);
Assert1(!GV.isConstant(), "'common' global may not be marked constant!",
&GV);
+ Assert1(!GV.hasComdat(), "'common' global may not be in a Comdat!", &GV);
}
} else {
Assert1(GV.hasExternalLinkage() || GV.hasExternalWeakLinkage(),
@@ -474,36 +494,57 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
visitGlobalValue(GV);
}
+void Verifier::visitAliaseeSubExpr(const GlobalAlias &GA, const Constant &C) {
+ SmallPtrSet<const GlobalAlias*, 4> Visited;
+ Visited.insert(&GA);
+ visitAliaseeSubExpr(Visited, GA, C);
+}
+
+void Verifier::visitAliaseeSubExpr(SmallPtrSet<const GlobalAlias *, 4> &Visited,
+ const GlobalAlias &GA, const Constant &C) {
+ if (const auto *GV = dyn_cast<GlobalValue>(&C)) {
+ Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA);
+
+ if (const auto *GA2 = dyn_cast<GlobalAlias>(GV)) {
+ Assert1(Visited.insert(GA2), "Aliases cannot form a cycle", &GA);
+
+ Assert1(!GA2->mayBeOverridden(), "Alias cannot point to a weak alias",
+ &GA);
+ } else {
+ // Only continue verifying subexpressions of GlobalAliases.
+ // Do not recurse into global initializers.
+ return;
+ }
+ }
+
+ if (const auto *CE = dyn_cast<ConstantExpr>(&C))
+ VerifyConstantExprBitcastType(CE);
+
+ for (const Use &U : C.operands()) {
+ Value *V = &*U;
+ if (const auto *GA2 = dyn_cast<GlobalAlias>(V))
+ visitAliaseeSubExpr(Visited, GA, *GA2->getAliasee());
+ else if (const auto *C2 = dyn_cast<Constant>(V))
+ visitAliaseeSubExpr(Visited, GA, *C2);
+ }
+}
+
void Verifier::visitGlobalAlias(const GlobalAlias &GA) {
Assert1(!GA.getName().empty(),
"Alias name cannot be empty!", &GA);
Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()),
- "Alias should have external or external weak linkage!", &GA);
- Assert1(GA.getAliasee(),
- "Aliasee cannot be NULL!", &GA);
- Assert1(!GA.hasUnnamedAddr(), "Alias cannot have unnamed_addr!", &GA);
-
+ "Alias should have private, internal, linkonce, weak, linkonce_odr, "
+ "weak_odr, or external linkage!",
+ &GA);
const Constant *Aliasee = GA.getAliasee();
- const GlobalValue *GV = dyn_cast<GlobalValue>(Aliasee);
-
- if (!GV) {
- const ConstantExpr *CE = dyn_cast<ConstantExpr>(Aliasee);
- if (CE && (CE->getOpcode() == Instruction::BitCast ||
- CE->getOpcode() == Instruction::AddrSpaceCast ||
- CE->getOpcode() == Instruction::GetElementPtr))
- GV = dyn_cast<GlobalValue>(CE->getOperand(0));
+ Assert1(Aliasee, "Aliasee cannot be NULL!", &GA);
+ Assert1(GA.getType() == Aliasee->getType(),
+ "Alias and aliasee types should match!", &GA);
- Assert1(GV, "Aliasee should be either GlobalValue, bitcast or "
- "addrspacecast of GlobalValue",
- &GA);
+ Assert1(isa<GlobalValue>(Aliasee) || isa<ConstantExpr>(Aliasee),
+ "Aliasee should be either GlobalValue or ConstantExpr", &GA);
- VerifyConstantExprBitcastType(CE);
- }
- Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA);
- if (const GlobalAlias *GAAliasee = dyn_cast<GlobalAlias>(GV)) {
- Assert1(!GAAliasee->mayBeOverridden(), "Alias cannot point to a weak alias",
- &GA);
- }
+ visitAliaseeSubExpr(GA, *Aliasee);
visitGlobalValue(GA);
}
@@ -556,6 +597,22 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
}
}
+void Verifier::visitComdat(const Comdat &C) {
+ // All Comdat::SelectionKind values other than Comdat::Any require a
+ // GlobalValue with the same name as the Comdat.
+ const GlobalValue *GV = M->getNamedValue(C.getName());
+ if (C.getSelectionKind() != Comdat::Any)
+ Assert1(GV,
+ "comdat selection kind requires a global value with the same name",
+ &C);
+ // The Module is invalid if the GlobalValue has local linkage. Allowing
+ // otherwise opens us up to seeing the underlying global value get renamed if
+ // collisions occur.
+ if (GV)
+ Assert1(!GV->hasLocalLinkage(), "comdat global value has local linkage",
+ GV);
+}
+
void Verifier::visitModuleIdents(const Module &M) {
const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
if (!Idents)
@@ -716,7 +773,8 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
I->getKindAsEnum() == Attribute::Builtin ||
I->getKindAsEnum() == Attribute::NoBuiltin ||
I->getKindAsEnum() == Attribute::Cold ||
- I->getKindAsEnum() == Attribute::OptimizeNone) {
+ I->getKindAsEnum() == Attribute::OptimizeNone ||
+ I->getKindAsEnum() == Attribute::JumpTable) {
if (!isFunction) {
CheckFailed("Attribute '" + I->getAsString() +
"' only applies to functions!", V);
@@ -890,6 +948,14 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
Attribute::MinSize),
"Attributes 'minsize and optnone' are incompatible!", V);
}
+
+ if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::JumpTable)) {
+ const GlobalValue *GV = cast<GlobalValue>(V);
+ Assert1(GV->hasUnnamedAddr(),
+ "Attribute 'jumptable' requires 'unnamed_addr'", V);
+
+ }
}
void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
@@ -2058,8 +2124,7 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
Assert1(isa<Constant>(PersonalityFn), "Personality function is not constant!",
&LPI);
for (unsigned i = 0, e = LPI.getNumClauses(); i < e; ++i) {
- Value *Clause = LPI.getClause(i);
- Assert1(isa<Constant>(Clause), "Clause is not constant!", &LPI);
+ Constant *Clause = LPI.getClause(i);
if (LPI.isCatch(i)) {
Assert1(isa<PointerType>(Clause->getType()),
"Catch operand does not have pointer type!", &LPI);
@@ -2203,7 +2268,8 @@ void Verifier::visitInstruction(Instruction &I) {
}
MDNode *MD = I.getMetadata(LLVMContext::MD_range);
- Assert1(!MD || isa<LoadInst>(I), "Ranges are only for loads!", &I);
+ Assert1(!MD || isa<LoadInst>(I) || isa<CallInst>(I) || isa<InvokeInst>(I),
+ "Ranges are only for loads, calls and invokes!", &I);
InstsInThisBlock.insert(&I);
}
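
The rewritten alias verification walks the whole aliasee expression, so it now rejects cycles that the old single-step check missed. A sketch that deliberately builds such invalid IR to surface the new diagnostic (Target must already be a definition inside M):

    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Returns true (module broken) and prints
    // "Aliases cannot form a cycle" for the A -> B -> A loop.
    static bool demoAliasCycle(Module &M, GlobalValue *Target) {
      GlobalAlias *A = GlobalAlias::create("A", Target);
      GlobalAlias *B = GlobalAlias::create("B", A);
      A->setAliasee(B);
      return verifyModule(M, &errs());
    }
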
diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp
index f4ed437..f8d2f5a 100644
--- a/lib/IRReader/IRReader.cpp
+++ b/lib/IRReader/IRReader.cpp
@@ -18,7 +18,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
@@ -29,17 +29,16 @@ namespace llvm {
static const char *const TimeIRParsingGroupName = "LLVM IR Parsing";
static const char *const TimeIRParsingName = "Parse IR";
-
-Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
- LLVMContext &Context) {
+static Module *getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
+ LLVMContext &Context) {
if (isBitcode((const unsigned char *)Buffer->getBufferStart(),
(const unsigned char *)Buffer->getBufferEnd())) {
std::string ErrMsg;
ErrorOr<Module *> ModuleOrErr = getLazyBitcodeModule(Buffer, Context);
- if (error_code EC = ModuleOrErr.getError()) {
+ if (std::error_code EC = ModuleOrErr.getError()) {
Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
EC.message());
- // ParseBitcodeFile does not take ownership of the Buffer in the
+ // getLazyBitcodeModule does not take ownership of the Buffer in the
// case of an error.
delete Buffer;
return nullptr;
@@ -52,14 +51,15 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
LLVMContext &Context) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
- "Could not open input file: " + ec.message());
+ "Could not open input file: " + EC.message());
return nullptr;
}
- return getLazyIRModule(File.release(), Err, Context);
+ return getLazyIRModule(FileOrErr.get().release(), Err, Context);
}
Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err,
@@ -70,29 +70,31 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err,
(const unsigned char *)Buffer->getBufferEnd())) {
ErrorOr<Module *> ModuleOrErr = parseBitcodeFile(Buffer, Context);
Module *M = nullptr;
- if (error_code EC = ModuleOrErr.getError())
+ if (std::error_code EC = ModuleOrErr.getError())
Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
EC.message());
else
M = ModuleOrErr.get();
// parseBitcodeFile does not take ownership of the Buffer.
- delete Buffer;
return M;
}
- return ParseAssembly(Buffer, nullptr, Err, Context);
+ return ParseAssembly(MemoryBuffer::getMemBuffer(
+ Buffer->getBuffer(), Buffer->getBufferIdentifier()),
+ nullptr, Err, Context);
}
Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
LLVMContext &Context) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
- "Could not open input file: " + ec.message());
+ "Could not open input file: " + EC.message());
return nullptr;
}
- return ParseIR(File.release(), Err, Context);
+ return ParseIR(FileOrErr.get().get(), Err, Context);
}
//===----------------------------------------------------------------------===//
@@ -104,7 +106,8 @@ LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
char **OutMessage) {
SMDiagnostic Diag;
- *OutM = wrap(ParseIR(unwrap(MemBuf), Diag, *unwrap(ContextRef)));
+ std::unique_ptr<MemoryBuffer> MB(unwrap(MemBuf));
+ *OutM = wrap(ParseIR(MB.get(), Diag, *unwrap(ContextRef)));
if(!*OutM) {
if (OutMessage) {
diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
index c9b5212..29ed92c 100644
--- a/lib/LTO/LLVMBuild.txt
+++ b/lib/LTO/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = LTO
parent = Libraries
-required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC MCParser ObjCARC Scalar Support Target TransformUtils
+required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC MCParser ObjCARC Object Scalar Support Target TransformUtils
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 99236bd..335197a 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -44,7 +44,6 @@
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
@@ -52,6 +51,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/ObjCARC.h"
+#include <system_error>
using namespace llvm;
const char* LTOCodeGenerator::getVersionString() {
@@ -114,7 +114,7 @@ void LTOCodeGenerator::initializeLTOPasses() {
}
bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
- bool ret = IRLinker.linkInModule(mod->getLLVVMModule(), &errMsg);
+ bool ret = IRLinker.linkInModule(&mod->getModule(), &errMsg);
const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
for (int i = 0, e = undefs.size(); i != e; ++i)
@@ -124,23 +124,7 @@ bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
}
void LTOCodeGenerator::setTargetOptions(TargetOptions options) {
- Options.LessPreciseFPMADOption = options.LessPreciseFPMADOption;
- Options.NoFramePointerElim = options.NoFramePointerElim;
- Options.AllowFPOpFusion = options.AllowFPOpFusion;
- Options.UnsafeFPMath = options.UnsafeFPMath;
- Options.NoInfsFPMath = options.NoInfsFPMath;
- Options.NoNaNsFPMath = options.NoNaNsFPMath;
- Options.HonorSignDependentRoundingFPMathOption =
- options.HonorSignDependentRoundingFPMathOption;
- Options.UseSoftFloat = options.UseSoftFloat;
- Options.FloatABIType = options.FloatABIType;
- Options.NoZerosInBSS = options.NoZerosInBSS;
- Options.GuaranteedTailCallOpt = options.GuaranteedTailCallOpt;
- Options.DisableTailCalls = options.DisableTailCalls;
- Options.StackAlignmentOverride = options.StackAlignmentOverride;
- Options.TrapFuncName = options.TrapFuncName;
- Options.PositionIndependentExecutable = options.PositionIndependentExecutable;
- Options.UseInitArray = options.UseInitArray;
+ Options = options;
}
void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) {
@@ -208,7 +192,8 @@ bool LTOCodeGenerator::compile_to_file(const char** name,
// make unique temp .o file to put generated object file
SmallString<128> Filename;
int FD;
- error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
+ std::error_code EC =
+ sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
if (EC) {
errMsg = EC.message();
return false;
@@ -251,13 +236,14 @@ const void* LTOCodeGenerator::compile(size_t* length,
delete NativeObjectFile;
// read .o file into memory buffer
- std::unique_ptr<MemoryBuffer> BuffPtr;
- if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
- errMsg = ec.message();
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFile(name, -1, false);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ errMsg = EC.message();
sys::fs::remove(NativeObjectPath);
return nullptr;
}
- NativeObjectFile = BuffPtr.release();
+ NativeObjectFile = BufferOrErr.get().release();
// remove temp files
sys::fs::remove(NativeObjectPath);
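
The LTOCodeGenerator hunks are the same std::error_code migration seen earlier, plus the wholesale TargetOptions copy. The temp-file idiom used by compile_to_file, as a standalone sketch:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/FileSystem.h"
    #include <string>
    #include <system_error>
    using namespace llvm;

    // createTemporaryFile now reports failure via std::error_code; the
    // open descriptor and the generated path both come back by reference.
    static bool makeTempObjectFile(SmallString<128> &Path, int &FD,
                                   std::string &ErrMsg) {
      if (std::error_code EC =
              sys::fs::createTemporaryFile("lto-llvm", "o", FD, Path)) {
        ErrMsg = EC.message();
        return false;
      }
      return true;
    }
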
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index d117514..844c0f2 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -24,7 +24,6 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
@@ -37,21 +36,16 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/system_error.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include <system_error>
using namespace llvm;
-LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
- : _module(m), _target(t),
- _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), &ObjFileInfo),
- _mangler(t->getDataLayout()) {
- ObjFileInfo.InitMCObjectFileInfo(t->getTargetTriple(),
- t->getRelocationModel(), t->getCodeModel(),
- _context);
-}
+LTOModule::LTOModule(std::unique_ptr<object::IRObjectFile> Obj,
+ llvm::TargetMachine *TM)
+ : IRFile(std::move(Obj)), _target(TM) {}
/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
/// bitcode.
@@ -67,87 +61,63 @@ bool LTOModule::isBitcodeFile(const char *path) {
return type == sys::fs::file_magic::bitcode;
}
-/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is
-/// LLVM bitcode for the specified triple.
-bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length,
- const char *triplePrefix) {
- MemoryBuffer *buffer = makeBuffer(mem, length);
- if (!buffer)
- return false;
- return isTargetMatch(buffer, triplePrefix);
-}
-
-bool LTOModule::isBitcodeFileForTarget(const char *path,
- const char *triplePrefix) {
- std::unique_ptr<MemoryBuffer> buffer;
- if (MemoryBuffer::getFile(path, buffer))
- return false;
- return isTargetMatch(buffer.release(), triplePrefix);
-}
-
-/// isTargetMatch - Returns 'true' if the memory buffer is for the specified
-/// target triple.
-bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) {
+bool LTOModule::isBitcodeForTarget(MemoryBuffer *buffer,
+ StringRef triplePrefix) {
std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
- delete buffer;
- return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0;
+ return StringRef(Triple).startswith(triplePrefix);
}
-/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of
-/// the buffer.
-LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options,
- std::string &errMsg) {
- std::unique_ptr<MemoryBuffer> buffer;
- if (error_code ec = MemoryBuffer::getFile(path, buffer)) {
- errMsg = ec.message();
+LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options,
+ std::string &errMsg) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFile(path);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ errMsg = EC.message();
return nullptr;
}
- return makeLTOModule(buffer.release(), options, errMsg);
+ return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg);
}
-LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
- size_t size, TargetOptions options,
- std::string &errMsg) {
- return makeLTOModule(fd, path, size, 0, options, errMsg);
+LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size,
+ TargetOptions options,
+ std::string &errMsg) {
+ return createFromOpenFileSlice(fd, path, size, 0, options, errMsg);
}
-LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
- size_t map_size,
- off_t offset,
- TargetOptions options,
- std::string &errMsg) {
- std::unique_ptr<MemoryBuffer> buffer;
- if (error_code ec =
- MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) {
- errMsg = ec.message();
+LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path,
+ size_t map_size, off_t offset,
+ TargetOptions options,
+ std::string &errMsg) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ errMsg = EC.message();
return nullptr;
}
- return makeLTOModule(buffer.release(), options, errMsg);
+ return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg);
}
-LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
- TargetOptions options,
- std::string &errMsg, StringRef path) {
+LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length,
+ TargetOptions options,
+ std::string &errMsg, StringRef path) {
std::unique_ptr<MemoryBuffer> buffer(makeBuffer(mem, length, path));
if (!buffer)
return nullptr;
- return makeLTOModule(buffer.release(), options, errMsg);
+ return makeLTOModule(std::move(buffer), options, errMsg);
}
-LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
+LTOModule *LTOModule::makeLTOModule(std::unique_ptr<MemoryBuffer> Buffer,
TargetOptions options,
std::string &errMsg) {
- // parse bitcode buffer
- ErrorOr<Module *> ModuleOrErr =
- getLazyBitcodeModule(buffer, getGlobalContext());
- if (error_code EC = ModuleOrErr.getError()) {
+ ErrorOr<Module *> MOrErr =
+ getLazyBitcodeModule(Buffer.get(), getGlobalContext());
+ if (std::error_code EC = MOrErr.getError()) {
errMsg = EC.message();
- delete buffer;
return nullptr;
}
- std::unique_ptr<Module> m(ModuleOrErr.get());
+ std::unique_ptr<Module> M(MOrErr.get());
- std::string TripleStr = m->getTargetTriple();
+ std::string TripleStr = M->getTargetTriple();
if (TripleStr.empty())
TripleStr = sys::getDefaultTargetTriple();
llvm::Triple Triple(TripleStr);
@@ -175,18 +145,13 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
options);
- m->materializeAllPermanently();
+ M->materializeAllPermanently(true);
+ M->setDataLayout(target->getDataLayout());
- LTOModule *Ret = new LTOModule(m.release(), target);
+ std::unique_ptr<object::IRObjectFile> IRObj(
+ new object::IRObjectFile(std::move(Buffer), std::move(M)));
- // We need a MCContext set up in order to get mangled names of private
- // symbols. It is a bit odd that we need to report uses and definitions
- // of private symbols, but it does look like ld64 expects to be informed
- // of at least the ones with an 'l' prefix.
- MCContext &Context = Ret->_context;
- const TargetLoweringObjectFile &TLOF =
- target->getTargetLowering()->getObjFileLowering();
- const_cast<TargetLoweringObjectFile &>(TLOF).Initialize(Context, *target);
+ LTOModule *Ret = new LTOModule(std::move(IRObj), target);
if (Ret->parseSymbols(errMsg)) {
delete Ret;
@@ -305,10 +270,20 @@ void LTOModule::addObjCClassRef(const GlobalVariable *clgv) {
entry.setValue(info);
}
-/// addDefinedDataSymbol - Add a data symbol as defined to the list.
-void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
+void LTOModule::addDefinedDataSymbol(const object::BasicSymbolRef &Sym) {
+ SmallString<64> Buffer;
+ {
+ raw_svector_ostream OS(Buffer);
+ Sym.printName(OS);
+ }
+
+ const GlobalValue *V = IRFile->getSymbolGV(Sym.getRawDataRefImpl());
+ addDefinedDataSymbol(Buffer.c_str(), V);
+}
+
+void LTOModule::addDefinedDataSymbol(const char *Name, const GlobalValue *v) {
// Add to list of defined symbols.
- addDefinedSymbol(v, false);
+ addDefinedSymbol(Name, v, false);
if (!v->hasSection() /* || !isTargetDarwin */)
return;
@@ -334,31 +309,43 @@ void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
// from the ObjC data structures generated by the front end.
// special case if this data blob is an ObjC class definition
- if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
+ std::string Section = v->getSection();
+ if (Section.compare(0, 15, "__OBJC,__class,") == 0) {
if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
addObjCClass(gv);
}
}
// special case if this data blob is an ObjC category definition
- else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
+ else if (Section.compare(0, 18, "__OBJC,__category,") == 0) {
if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
addObjCCategory(gv);
}
}
// special case if this data blob is the list of referenced classes
- else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
+ else if (Section.compare(0, 18, "__OBJC,__cls_refs,") == 0) {
if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
addObjCClassRef(gv);
}
}
}
-/// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
-void LTOModule::addDefinedFunctionSymbol(const Function *f) {
+void LTOModule::addDefinedFunctionSymbol(const object::BasicSymbolRef &Sym) {
+ SmallString<64> Buffer;
+ {
+ raw_svector_ostream OS(Buffer);
+ Sym.printName(OS);
+ }
+
+ const Function *F =
+ cast<Function>(IRFile->getSymbolGV(Sym.getRawDataRefImpl()));
+ addDefinedFunctionSymbol(Buffer.c_str(), F);
+}
+
+void LTOModule::addDefinedFunctionSymbol(const char *Name, const Function *F) {
// add to list of defined symbols
- addDefinedSymbol(f, true);
+ addDefinedSymbol(Name, F, true);
}
static bool canBeHidden(const GlobalValue *GV) {
@@ -385,16 +372,8 @@ static bool canBeHidden(const GlobalValue *GV) {
return !GS.IsCompared;
}
-/// addDefinedSymbol - Add a defined symbol to the list.
-void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
- // ignore all llvm.* symbols
- if (def->getName().startswith("llvm."))
- return;
-
- // string is owned by _defines
- SmallString<64> Buffer;
- _target->getNameWithPrefix(Buffer, def, _mangler);
-
+void LTOModule::addDefinedSymbol(const char *Name, const GlobalValue *def,
+ bool isFunction) {
// set alignment part; log2() can have rounding errors
uint32_t align = def->getAlignment();
uint32_t attr = align ? countTrailingZeros(align) : 0;
@@ -431,14 +410,14 @@ void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
else
attr |= LTO_SYMBOL_SCOPE_DEFAULT;
- StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer);
+ StringSet::value_type &entry = _defines.GetOrCreateValue(Name);
entry.setValue(1);
// fill information structure
NameAndAttributes info;
- StringRef Name = entry.getKey();
- info.name = Name.data();
- assert(info.name[Name.size()] == '\0');
+ StringRef NameRef = entry.getKey();
+ info.name = NameRef.data();
+ assert(info.name[NameRef.size()] == '\0');
info.attributes = attr;
info.isFunction = isFunction;
info.symbol = def;
@@ -483,9 +462,9 @@ void LTOModule::addAsmGlobalSymbol(const char *name,
}
if (info.isFunction)
- addDefinedFunctionSymbol(cast<Function>(info.symbol));
+ addDefinedFunctionSymbol(info.name, cast<Function>(info.symbol));
else
- addDefinedDataSymbol(info.symbol);
+ addDefinedDataSymbol(info.name, info.symbol);
_symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK;
_symbols.back().attributes |= scope;
@@ -514,20 +493,14 @@ void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
entry.setValue(info);
}
-/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a
-/// list to be resolved later.
-void
-LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
- // ignore all llvm.* symbols
- if (decl->getName().startswith("llvm."))
- return;
-
- // ignore all aliases
- if (isa<GlobalAlias>(decl))
- return;
-
+/// Add a symbol which isn't defined just yet to a list to be resolved later.
+void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym,
+ bool isFunc) {
SmallString<64> name;
- _target->getNameWithPrefix(name, decl, _mangler);
+ {
+ raw_svector_ostream OS(name);
+ Sym.printName(OS);
+ }
StringMap<NameAndAttributes>::value_type &entry =
_undefines.GetOrCreateValue(name);
@@ -540,6 +513,8 @@ LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
info.name = entry.getKey().data();
+ const GlobalValue *decl = IRFile->getSymbolGV(Sym.getRawDataRefImpl());
+
if (decl->hasExternalWeakLinkage())
info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF;
else
@@ -551,259 +526,54 @@ LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
entry.setValue(info);
}
-namespace {
-
- class RecordStreamer : public MCStreamer {
- public:
- enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
-
- private:
- StringMap<State> Symbols;
-
- void markDefined(const MCSymbol &Symbol) {
- State &S = Symbols[Symbol.getName()];
- switch (S) {
- case DefinedGlobal:
- case Global:
- S = DefinedGlobal;
- break;
- case NeverSeen:
- case Defined:
- case Used:
- S = Defined;
- break;
- }
- }
- void markGlobal(const MCSymbol &Symbol) {
- State &S = Symbols[Symbol.getName()];
- switch (S) {
- case DefinedGlobal:
- case Defined:
- S = DefinedGlobal;
- break;
-
- case NeverSeen:
- case Global:
- case Used:
- S = Global;
- break;
- }
- }
- void markUsed(const MCSymbol &Symbol) {
- State &S = Symbols[Symbol.getName()];
- switch (S) {
- case DefinedGlobal:
- case Defined:
- case Global:
- break;
-
- case NeverSeen:
- case Used:
- S = Used;
- break;
- }
- }
-
- // FIXME: mostly copied for the obj streamer.
- void AddValueSymbols(const MCExpr *Value) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- // FIXME: What should we do in here?
- break;
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbols(BE->getLHS());
- AddValueSymbols(BE->getRHS());
- break;
- }
-
- case MCExpr::SymbolRef:
- markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
- break;
+/// parseSymbols - Parse the symbols from the module and module-level ASM and add
+/// them to either the defined or undefined lists.
+bool LTOModule::parseSymbols(std::string &errMsg) {
+ for (auto &Sym : IRFile->symbols()) {
+ const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl());
+ uint32_t Flags = Sym.getFlags();
+ if (Flags & object::BasicSymbolRef::SF_FormatSpecific)
+ continue;
+
+ bool IsUndefined = Flags & object::BasicSymbolRef::SF_Undefined;
+
+ if (!GV) {
+ SmallString<64> Buffer;
+ {
+ raw_svector_ostream OS(Buffer);
+ Sym.printName(OS);
}
+ const char *Name = Buffer.c_str();
+
+ if (IsUndefined)
+ addAsmGlobalSymbolUndef(Name);
+ else if (Flags & object::BasicSymbolRef::SF_Global)
+ addAsmGlobalSymbol(Name, LTO_SYMBOL_SCOPE_DEFAULT);
+ else
+ addAsmGlobalSymbol(Name, LTO_SYMBOL_SCOPE_INTERNAL);
+ continue;
}
- public:
- typedef StringMap<State>::const_iterator const_iterator;
-
- const_iterator begin() {
- return Symbols.begin();
+ auto *F = dyn_cast<Function>(GV);
+ if (IsUndefined) {
+ addPotentialUndefinedSymbol(Sym, F != nullptr);
+ continue;
}
- const_iterator end() {
- return Symbols.end();
+ if (F) {
+ addDefinedFunctionSymbol(Sym);
+ continue;
}
- RecordStreamer(MCContext &Context) : MCStreamer(Context) {}
-
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
- // Scan for values.
- for (unsigned i = Inst.getNumOperands(); i--; )
- if (Inst.getOperand(i).isExpr())
- AddValueSymbols(Inst.getOperand(i).getExpr());
- }
- void EmitLabel(MCSymbol *Symbol) override {
- Symbol->setSection(*getCurrentSection().first);
- markDefined(*Symbol);
- }
- void EmitDebugLabel(MCSymbol *Symbol) override {
- EmitLabel(Symbol);
- }
- void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {
- // FIXME: should we handle aliases?
- markDefined(*Symbol);
- AddValueSymbols(Value);
- }
- bool EmitSymbolAttribute(MCSymbol *Symbol,
- MCSymbolAttr Attribute) override {
- if (Attribute == MCSA_Global)
- markGlobal(*Symbol);
- return true;
+ if (isa<GlobalVariable>(GV)) {
+ addDefinedDataSymbol(Sym);
+ continue;
}
- void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
- uint64_t Size , unsigned ByteAlignment) override {
- markDefined(*Symbol);
- }
- void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) override {
- markDefined(*Symbol);
- }
-
- void EmitBundleAlignMode(unsigned AlignPow2) override {}
- void EmitBundleLock(bool AlignToEnd) override {}
- void EmitBundleUnlock() override {}
-
- // Noop calls.
- void ChangeSection(const MCSection *Section,
- const MCExpr *Subsection) override {}
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override {}
- void EmitThumbFunc(MCSymbol *Func) override {}
- void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override {}
- void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override {}
- void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
- void EmitCOFFSymbolStorageClass(int StorageClass) override {}
- void EmitCOFFSymbolType(int Type) override {}
- void EndCOFFSymbolDef() override {}
- void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override {}
- void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) override {}
- void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) override {}
- void EmitBytes(StringRef Data) override {}
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) override {}
- void EmitULEB128Value(const MCExpr *Value) override {}
- void EmitSLEB128Value(const MCExpr *Value) override {}
- void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
- unsigned ValueSize,
- unsigned MaxBytesToEmit) override {}
- void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit) override {}
- bool EmitValueToOffset(const MCExpr *Offset,
- unsigned char Value) override { return false; }
- void EmitFileDirective(StringRef Filename) override {}
- void FinishImpl() override {}
- void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override {
- RecordProcEnd(Frame);
- }
- };
-} // end anonymous namespace
-
-/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
-/// defined or undefined lists.
-bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
- const std::string &inlineAsm = _module->getModuleInlineAsm();
- if (inlineAsm.empty())
- return false;
-
- std::unique_ptr<RecordStreamer> Streamer(new RecordStreamer(_context));
- MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm);
- SourceMgr SrcMgr;
- SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
- std::unique_ptr<MCAsmParser> Parser(
- createMCAsmParser(SrcMgr, _context, *Streamer, *_target->getMCAsmInfo()));
- const Target &T = _target->getTarget();
- std::unique_ptr<MCInstrInfo> MCII(T.createMCInstrInfo());
- std::unique_ptr<MCSubtargetInfo> STI(T.createMCSubtargetInfo(
- _target->getTargetTriple(), _target->getTargetCPU(),
- _target->getTargetFeatureString()));
- std::unique_ptr<MCTargetAsmParser> TAP(
- T.createMCAsmParser(*STI, *Parser.get(), *MCII,
- _target->Options.MCOptions));
- if (!TAP) {
- errMsg = "target " + std::string(T.getName()) +
- " does not define AsmParser.";
- return true;
- }
-
- Parser->setTargetParser(*TAP);
- if (Parser->Run(false))
- return true;
- for (RecordStreamer::const_iterator i = Streamer->begin(),
- e = Streamer->end(); i != e; ++i) {
- StringRef Key = i->first();
- RecordStreamer::State Value = i->second;
- if (Value == RecordStreamer::DefinedGlobal)
- addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT);
- else if (Value == RecordStreamer::Defined)
- addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL);
- else if (Value == RecordStreamer::Global ||
- Value == RecordStreamer::Used)
- addAsmGlobalSymbolUndef(Key.data());
+ assert(isa<GlobalAlias>(GV));
+ addDefinedDataSymbol(Sym);
}
- return false;
-}
-
-/// isDeclaration - Return 'true' if the global value is a declaration.
-static bool isDeclaration(const GlobalValue &V) {
- if (V.hasAvailableExternallyLinkage())
- return true;
-
- if (V.isMaterializable())
- return false;
-
- return V.isDeclaration();
-}
-
-/// parseSymbols - Parse the symbols from the module and model-level ASM and add
-/// them to either the defined or undefined lists.
-bool LTOModule::parseSymbols(std::string &errMsg) {
- // add functions
- for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) {
- if (isDeclaration(*f))
- addPotentialUndefinedSymbol(f, true);
- else
- addDefinedFunctionSymbol(f);
- }
-
- // add data
- for (Module::global_iterator v = _module->global_begin(),
- e = _module->global_end(); v != e; ++v) {
- if (isDeclaration(*v))
- addPotentialUndefinedSymbol(v, false);
- else
- addDefinedDataSymbol(v);
- }
-
- // add asm globals
- if (addAsmGlobalSymbols(errMsg))
- return true;
-
- // add aliases
- for (const auto &Alias : _module->aliases())
- addDefinedDataSymbol(&Alias);
-
// make symbols for all undefines
for (StringMap<NameAndAttributes>::iterator u =_undefines.begin(),
e = _undefines.end(); u != e; ++u) {
@@ -820,7 +590,7 @@ bool LTOModule::parseSymbols(std::string &errMsg) {
/// parseMetadata - Parse metadata from the module
void LTOModule::parseMetadata() {
// Linker Options
- if (Value *Val = _module->getModuleFlag("Linker Options")) {
+ if (Value *Val = getModule().getModuleFlag("Linker Options")) {
MDNode *LinkerOptions = cast<MDNode>(Val);
for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
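The rewritten LTOModule above stops walking functions, globals, and aliases separately and instead drives everything off the symbol table of an object::IRObjectFile. A minimal sketch of a client of that interface, assuming the post-patch headers; dumpSymbols and its output format are illustrative, not part of this commit:

#include "llvm/ADT/SmallString.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Support/raw_ostream.h"

static void dumpSymbols(llvm::object::IRObjectFile &IRFile) {
  using llvm::object::BasicSymbolRef;
  for (const BasicSymbolRef &Sym : IRFile.symbols()) {
    uint32_t Flags = Sym.getFlags();
    // Skip format-internal symbols, exactly as the new parseSymbols() does.
    if (Flags & BasicSymbolRef::SF_FormatSpecific)
      continue;
    // printName() writes into a small buffer; same pattern as above.
    llvm::SmallString<64> Name;
    {
      llvm::raw_svector_ostream OS(Name);
      Sym.printName(OS);
    }
    bool Undef = Flags & BasicSymbolRef::SF_Undefined;
    llvm::outs() << (Undef ? "U " : "D ") << Name << "\n";
  }
}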
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 45f2d4e..5bb2862 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -24,6 +24,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cctype>
+#include <tuple>
using namespace llvm;
@@ -389,8 +390,6 @@ namespace {
/// actually need, but this allows us to reuse the ValueMapper code.
ValueToValueMapTy ValueMap;
- std::vector<std::pair<GlobalValue *, GlobalAlias *>> ReplaceWithAlias;
-
struct AppendingVarInfo {
GlobalVariable *NewGV; // New aggregate global in dest module.
Constant *DstInit; // Old initializer from dest module.
@@ -428,6 +427,18 @@ namespace {
return true;
}
+ bool getComdatLeader(Module *M, StringRef ComdatName,
+ const GlobalVariable *&GVar);
+ bool computeResultingSelectionKind(StringRef ComdatName,
+ Comdat::SelectionKind Src,
+ Comdat::SelectionKind Dst,
+ Comdat::SelectionKind &Result,
+ bool &LinkFromSrc);
+ std::map<const Comdat *, std::pair<Comdat::SelectionKind, bool>>
+ ComdatsChosen;
+ bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
+ bool &LinkFromSrc);
+
/// getLinkageResult - This analyzes the two global values and determines
/// what the result will look like in the destination module.
bool getLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
@@ -536,6 +547,115 @@ Value *ValueMaterializerTy::materializeValueFor(Value *V) {
return DF;
}
+bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName,
+ const GlobalVariable *&GVar) {
+ const GlobalValue *GVal = M->getNamedValue(ComdatName);
+ if (const auto *GA = dyn_cast_or_null<GlobalAlias>(GVal)) {
+ GVal = GA->getBaseObject();
+ if (!GVal)
+ // We cannot resolve the size of the aliasee yet.
+ return emitError("Linking COMDATs named '" + ComdatName +
+ "': COMDAT key involves incomputable alias size.");
+ }
+
+ GVar = dyn_cast_or_null<GlobalVariable>(GVal);
+ if (!GVar)
+ return emitError(
+ "Linking COMDATs named '" + ComdatName +
+ "': GlobalVariable required for data dependent selection!");
+
+ return false;
+}
+
+bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
+ Comdat::SelectionKind Src,
+ Comdat::SelectionKind Dst,
+ Comdat::SelectionKind &Result,
+ bool &LinkFromSrc) {
+ // The ability to mix Comdat::SelectionKind::Any with
+ // Comdat::SelectionKind::Largest is a behavior that comes from COFF.
+ bool DstAnyOrLargest = Dst == Comdat::SelectionKind::Any ||
+ Dst == Comdat::SelectionKind::Largest;
+ bool SrcAnyOrLargest = Src == Comdat::SelectionKind::Any ||
+ Src == Comdat::SelectionKind::Largest;
+ if (DstAnyOrLargest && SrcAnyOrLargest) {
+ if (Dst == Comdat::SelectionKind::Largest ||
+ Src == Comdat::SelectionKind::Largest)
+ Result = Comdat::SelectionKind::Largest;
+ else
+ Result = Comdat::SelectionKind::Any;
+ } else if (Src == Dst) {
+ Result = Dst;
+ } else {
+ return emitError("Linking COMDATs named '" + ComdatName +
+ "': invalid selection kinds!");
+ }
+
+ switch (Result) {
+ case Comdat::SelectionKind::Any:
+ // Go with Dst.
+ LinkFromSrc = false;
+ break;
+ case Comdat::SelectionKind::NoDuplicates:
+ return emitError("Linking COMDATs named '" + ComdatName +
+ "': noduplicates has been violated!");
+ case Comdat::SelectionKind::ExactMatch:
+ case Comdat::SelectionKind::Largest:
+ case Comdat::SelectionKind::SameSize: {
+ const GlobalVariable *DstGV;
+ const GlobalVariable *SrcGV;
+ if (getComdatLeader(DstM, ComdatName, DstGV) ||
+ getComdatLeader(SrcM, ComdatName, SrcGV))
+ return true;
+
+ const DataLayout *DstDL = DstM->getDataLayout();
+ const DataLayout *SrcDL = SrcM->getDataLayout();
+ if (!DstDL || !SrcDL) {
+ return emitError(
+ "Linking COMDATs named '" + ComdatName +
+ "': can't do size dependent selection without DataLayout!");
+ }
+ uint64_t DstSize =
+ DstDL->getTypeAllocSize(DstGV->getType()->getPointerElementType());
+ uint64_t SrcSize =
+ SrcDL->getTypeAllocSize(SrcGV->getType()->getPointerElementType());
+ if (Result == Comdat::SelectionKind::ExactMatch) {
+ if (SrcGV->getInitializer() != DstGV->getInitializer())
+ return emitError("Linking COMDATs named '" + ComdatName +
+ "': ExactMatch violated!");
+ LinkFromSrc = false;
+ } else if (Result == Comdat::SelectionKind::Largest) {
+ LinkFromSrc = SrcSize > DstSize;
+ } else if (Result == Comdat::SelectionKind::SameSize) {
+ if (SrcSize != DstSize)
+ return emitError("Linking COMDATs named '" + ComdatName +
+ "': SameSize violated!");
+ LinkFromSrc = false;
+ } else {
+ llvm_unreachable("unknown selection kind");
+ }
+ break;
+ }
+ }
+
+ return false;
+}
+
+bool ModuleLinker::getComdatResult(const Comdat *SrcC,
+ Comdat::SelectionKind &Result,
+ bool &LinkFromSrc) {
+ StringRef ComdatName = SrcC->getName();
+ Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable();
+ Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName);
+ if (DstCI != ComdatSymTab.end()) {
+ const Comdat *DstC = &DstCI->second;
+ Comdat::SelectionKind SSK = SrcC->getSelectionKind();
+ Comdat::SelectionKind DSK = DstC->getSelectionKind();
+ if (computeResultingSelectionKind(ComdatName, SSK, DSK, Result, LinkFromSrc))
+ return true;
+ }
+ return false;
+}
/// getLinkageResult - This analyzes the two global values and determines what
/// the result will look like in the destination module. In particular, it
@@ -723,7 +843,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
return emitError(
"Appending variables with different unnamed_addr need to be linked!");
- if (DstGV->getSection() != SrcGV->getSection())
+ if (StringRef(DstGV->getSection()) != SrcGV->getSection())
return emitError(
"Appending variables with different section name need to be linked!");
@@ -766,34 +886,47 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
bool HasUnnamedAddr = SGV->hasUnnamedAddr();
+ bool LinkFromSrc = false;
+ Comdat *DC = nullptr;
+ if (const Comdat *SC = SGV->getComdat()) {
+ Comdat::SelectionKind SK;
+ std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
+ DC = DstM->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SK);
+ }
+
if (DGV) {
- // Concatenation of appending linkage variables is magic and handled later.
- if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage())
- return linkAppendingVarProto(cast<GlobalVariable>(DGV), SGV);
-
- // Determine whether linkage of these two globals follows the source
- // module's definition or the destination module's definition.
- GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
- GlobalValue::VisibilityTypes NV;
- bool LinkFromSrc = false;
- if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
- return true;
- NewVisibility = NV;
- HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+ if (!DC) {
+ // Concatenation of appending linkage variables is magic and handled later.
+ if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage())
+ return linkAppendingVarProto(cast<GlobalVariable>(DGV), SGV);
+
+ // Determine whether linkage of these two globals follows the source
+ // module's definition or the destination module's definition.
+ GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+ GlobalValue::VisibilityTypes NV;
+ if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
+ return true;
+ NewVisibility = NV;
+ HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+
+ // If we're not linking from the source, then keep the definition that we
+ // have.
+ if (!LinkFromSrc) {
+ // Special case for const propagation.
+ if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
+ if (DGVar->isDeclaration() && SGV->isConstant() &&
+ !DGVar->isConstant())
+ DGVar->setConstant(true);
+
+ // Set calculated linkage, visibility and unnamed_addr.
+ DGV->setLinkage(NewLinkage);
+ DGV->setVisibility(*NewVisibility);
+ DGV->setUnnamedAddr(HasUnnamedAddr);
+ }
+ }
- // If we're not linking from the source, then keep the definition that we
- // have.
if (!LinkFromSrc) {
- // Special case for const propagation.
- if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
- if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
- DGVar->setConstant(true);
-
- // Set calculated linkage, visibility and unnamed_addr.
- DGV->setLinkage(NewLinkage);
- DGV->setVisibility(*NewVisibility);
- DGV->setUnnamedAddr(HasUnnamedAddr);
-
// Make sure to remember this mapping.
ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
@@ -805,6 +938,12 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
}
}
+  // If the Comdat this variable belongs to wasn't selected, skip it.
+ if (DC && !DGV && !LinkFromSrc) {
+ DoNotLinkFromSource.insert(SGV);
+ return false;
+ }
+
// No linking to be performed or linking from the source: simply create an
// identical version of the symbol over in the dest module... the
// initializer will be filled in later by LinkGlobalInits.
@@ -820,6 +959,9 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
NewDGV->setVisibility(*NewVisibility);
NewDGV->setUnnamedAddr(HasUnnamedAddr);
+ if (DC)
+ NewDGV->setComdat(DC);
+
if (DGV) {
DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
DGV->eraseFromParent();
@@ -837,21 +979,33 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
bool HasUnnamedAddr = SF->hasUnnamedAddr();
+ bool LinkFromSrc = false;
+ Comdat *DC = nullptr;
+ if (const Comdat *SC = SF->getComdat()) {
+ Comdat::SelectionKind SK;
+ std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
+ DC = DstM->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SK);
+ }
+
if (DGV) {
- GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
- bool LinkFromSrc = false;
- GlobalValue::VisibilityTypes NV;
- if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
- return true;
- NewVisibility = NV;
- HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+ if (!DC) {
+ GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+ GlobalValue::VisibilityTypes NV;
+ if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
+ return true;
+ NewVisibility = NV;
+ HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+
+ if (!LinkFromSrc) {
+ // Set calculated linkage
+ DGV->setLinkage(NewLinkage);
+ DGV->setVisibility(*NewVisibility);
+ DGV->setUnnamedAddr(HasUnnamedAddr);
+ }
+ }
if (!LinkFromSrc) {
- // Set calculated linkage
- DGV->setLinkage(NewLinkage);
- DGV->setVisibility(*NewVisibility);
- DGV->setUnnamedAddr(HasUnnamedAddr);
-
// Make sure to remember this mapping.
ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
@@ -871,6 +1025,12 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
return false;
}
+  // If the Comdat this function belongs to wasn't selected, skip it.
+ if (DC && !DGV && !LinkFromSrc) {
+ DoNotLinkFromSource.insert(SF);
+ return false;
+ }
+
// If there is no linkage to be performed or we are linking from the source,
// bring SF over.
Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()),
@@ -880,6 +1040,9 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
NewDF->setVisibility(*NewVisibility);
NewDF->setUnnamedAddr(HasUnnamedAddr);
+ if (DC)
+ NewDF->setComdat(DC);
+
if (DGV) {
// Any uses of DF need to change to NewDF, with cast.
DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType()));
@@ -895,20 +1058,35 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
GlobalValue *DGV = getLinkedToGlobal(SGA);
llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+ bool HasUnnamedAddr = SGA->hasUnnamedAddr();
+
+ bool LinkFromSrc = false;
+ Comdat *DC = nullptr;
+ if (const Comdat *SC = SGA->getComdat()) {
+ Comdat::SelectionKind SK;
+ std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
+ DC = DstM->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SK);
+ }
if (DGV) {
- GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
- GlobalValue::VisibilityTypes NV;
- bool LinkFromSrc = false;
- if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc))
- return true;
- NewVisibility = NV;
+ if (!DC) {
+ GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+ GlobalValue::VisibilityTypes NV;
+ if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc))
+ return true;
+ NewVisibility = NV;
+ HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+
+ if (!LinkFromSrc) {
+ // Set calculated linkage.
+ DGV->setLinkage(NewLinkage);
+ DGV->setVisibility(*NewVisibility);
+ DGV->setUnnamedAddr(HasUnnamedAddr);
+ }
+ }
if (!LinkFromSrc) {
- // Set calculated linkage.
- DGV->setLinkage(NewLinkage);
- DGV->setVisibility(*NewVisibility);
-
// Make sure to remember this mapping.
ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType()));
@@ -919,6 +1097,12 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
}
}
+  // If the Comdat this alias belongs to wasn't selected, skip it.
+ if (DC && !DGV && !LinkFromSrc) {
+ DoNotLinkFromSource.insert(SGA);
+ return false;
+ }
+
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType()));
@@ -928,9 +1112,13 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
copyGVAttributes(NewDA, SGA);
if (NewVisibility)
NewDA->setVisibility(*NewVisibility);
+ NewDA->setUnnamedAddr(HasUnnamedAddr);
- if (DGV)
- ReplaceWithAlias.push_back(std::make_pair(DGV, NewDA));
+ if (DGV) {
+ // Any uses of DGV need to change to NewDA, with cast.
+ DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType()));
+ DGV->eraseFromParent();
+ }
ValueMap[SGA] = NewDA;
return false;
@@ -1016,19 +1204,6 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
}
-static GlobalObject &getGlobalObjectInExpr(Constant &C) {
- auto *GO = dyn_cast<GlobalObject>(&C);
- if (GO)
- return *GO;
- auto *GA = dyn_cast<GlobalAlias>(&C);
- if (GA)
- return *GA->getAliasee();
- auto &CE = cast<ConstantExpr>(C);
- assert(CE.getOpcode() == Instruction::BitCast ||
- CE.getOpcode() == Instruction::AddrSpaceCast);
- return getGlobalObjectInExpr(*CE.getOperand(0));
-}
-
/// linkAliasBodies - Insert all of the aliases in Src into the Dest module.
void ModuleLinker::linkAliasBodies() {
for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end();
@@ -1039,24 +1214,8 @@ void ModuleLinker::linkAliasBodies() {
GlobalAlias *DA = cast<GlobalAlias>(ValueMap[I]);
Constant *Val =
MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer);
- DA->setAliasee(&getGlobalObjectInExpr(*Val));
- }
- }
-
- // Any uses of DGV need to change to NewDA, with cast.
- for (auto &Pair : ReplaceWithAlias) {
- GlobalValue *DGV = Pair.first;
- GlobalAlias *NewDA = Pair.second;
-
- for (auto *User : DGV->users()) {
- if (auto *GA = dyn_cast<GlobalAlias>(User)) {
- if (GA == NewDA)
- report_fatal_error("Linking these modules creates an alias cycle.");
- }
+ DA->setAliasee(Val);
}
-
- DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType()));
- DGV->eraseFromParent();
}
}
@@ -1165,7 +1324,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
// Perform the merge for standard behavior types.
switch (SrcBehaviorValue) {
case Module::Require:
- case Module::Override: assert(0 && "not possible"); break;
+ case Module::Override: llvm_unreachable("not possible");
case Module::Error: {
// Emit an error if the values differ.
if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
@@ -1278,6 +1437,18 @@ bool ModuleLinker::run() {
// Loop over all of the linked values to compute type mappings.
computeTypeMapping();
+ ComdatsChosen.clear();
+ for (const StringMapEntry<llvm::Comdat> &SMEC : SrcM->getComdatSymbolTable()) {
+ const Comdat &C = SMEC.getValue();
+ if (ComdatsChosen.count(&C))
+ continue;
+ Comdat::SelectionKind SK;
+ bool LinkFromSrc;
+ if (getComdatResult(&C, SK, LinkFromSrc))
+ return true;
+ ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc);
+ }
+
// Insert all of the globals in src into the DstM module... without linking
// initializers (which could refer to functions not yet mapped over).
for (Module::global_iterator I = SrcM->global_begin(),
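The new <tuple> include exists for the ComdatsChosen cache: each source Comdat maps to a (SelectionKind, LinkFromSrc) pair that linkGlobalProto, linkFunctionProto, and linkAliasProto unpack with std::tie. A self-contained sketch of that idiom, with illustrative names standing in for the LLVM types:

#include <map>
#include <tuple>
#include <utility>

enum class SelectionKind { Any, ExactMatch, Largest, NoDuplicates, SameSize };

int main() {
  std::map<int, std::pair<SelectionKind, bool>> ComdatsChosen;
  ComdatsChosen[0] = std::make_pair(SelectionKind::Largest, true);

  SelectionKind SK;
  bool LinkFromSrc;
  // Unpack both cached decisions in one statement, as the patch does.
  std::tie(SK, LinkFromSrc) = ComdatsChosen[0];
  return (LinkFromSrc && SK == SelectionKind::Largest) ? 0 : 1;
}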
diff --git a/lib/MC/Android.mk b/lib/MC/Android.mk
index 23ad1d3..fd587c4 100644
--- a/lib/MC/Android.mk
+++ b/lib/MC/Android.mk
@@ -1,6 +1,7 @@
LOCAL_PATH:= $(call my-dir)
mc_SRC_FILES := \
+ ConstantPools.cpp \
ELFObjectWriter.cpp \
MCAsmBackend.cpp \
MCAsmInfo.cpp \
@@ -9,7 +10,6 @@ mc_SRC_FILES := \
MCAsmInfoELF.cpp \
MCAsmStreamer.cpp \
MCAssembler.cpp \
- MCAtom.cpp \
MCCodeEmitter.cpp \
MCCodeGenInfo.cpp \
MCContext.cpp \
@@ -18,7 +18,6 @@ mc_SRC_FILES := \
MCELF.cpp \
MCELFObjectTargetWriter.cpp \
MCELFStreamer.cpp \
- MCFunction.cpp \
MCExpr.cpp \
MCExternalSymbolizer.cpp \
MCInst.cpp \
@@ -28,13 +27,9 @@ mc_SRC_FILES := \
MCLinkerOptimizationHint.cpp \
MCMachOStreamer.cpp \
MCMachObjectTargetWriter.cpp \
- MCModule.cpp \
- MCModuleYAML.cpp \
MCNullStreamer.cpp \
MCObjectFileInfo.cpp \
- MCObjectDisassembler.cpp \
MCObjectStreamer.cpp \
- MCObjectSymbolizer.cpp \
MCObjectWriter.cpp \
MCRegisterInfo.cpp \
MCRelocationInfo.cpp \
@@ -50,9 +45,11 @@ mc_SRC_FILES := \
MCValue.cpp \
MCWin64EH.cpp \
MachObjectWriter.cpp \
+ StringTableBuilder.cpp \
SubtargetFeature.cpp \
WinCOFFObjectWriter.cpp \
WinCOFFStreamer.cpp \
+ YAML.cpp
# For the host
# =====================================================
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 6a384c1..330519e 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMMC
+ ConstantPools.cpp
ELFObjectWriter.cpp
MCAsmBackend.cpp
MCAsmInfo.cpp
@@ -7,7 +8,6 @@ add_llvm_library(LLVMMC
MCAsmInfoELF.cpp
MCAsmStreamer.cpp
MCAssembler.cpp
- MCAtom.cpp
MCCodeEmitter.cpp
MCCodeGenInfo.cpp
MCContext.cpp
@@ -16,7 +16,6 @@ add_llvm_library(LLVMMC
MCELF.cpp
MCELFObjectTargetWriter.cpp
MCELFStreamer.cpp
- MCFunction.cpp
MCExpr.cpp
MCExternalSymbolizer.cpp
MCInst.cpp
@@ -26,13 +25,9 @@ add_llvm_library(LLVMMC
MCLinkerOptimizationHint.cpp
MCMachOStreamer.cpp
MCMachObjectTargetWriter.cpp
- MCModule.cpp
- MCModuleYAML.cpp
MCNullStreamer.cpp
MCObjectFileInfo.cpp
- MCObjectDisassembler.cpp
MCObjectStreamer.cpp
- MCObjectSymbolizer.cpp
MCObjectWriter.cpp
MCRegisterInfo.cpp
MCRelocationInfo.cpp
@@ -48,10 +43,13 @@ add_llvm_library(LLVMMC
MCValue.cpp
MCWin64EH.cpp
MachObjectWriter.cpp
+ StringTableBuilder.cpp
SubtargetFeature.cpp
WinCOFFObjectWriter.cpp
WinCOFFStreamer.cpp
+ YAML.cpp
)
+add_subdirectory(MCAnalysis)
add_subdirectory(MCParser)
add_subdirectory(MCDisassembler)
diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
new file mode 100644
index 0000000..f979dad
--- /dev/null
+++ b/lib/MC/ConstantPools.cpp
@@ -0,0 +1,95 @@
+//===- ConstantPools.cpp - ConstantPool class --*- C++ -*---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ConstantPool and AssemblerConstantPools classes.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/ConstantPools.h"
+
+using namespace llvm;
+//
+// ConstantPool implementation
+//
+// Emit the contents of the constant pool using the provided streamer.
+void ConstantPool::emitEntries(MCStreamer &Streamer) {
+ if (Entries.empty())
+ return;
+ Streamer.EmitCodeAlignment(4); // align to 4-byte address
+ Streamer.EmitDataRegion(MCDR_DataRegion);
+ for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
+ I != E; ++I) {
+ Streamer.EmitLabel(I->first);
+ Streamer.EmitValue(I->second, 4);
+ }
+ Streamer.EmitDataRegion(MCDR_DataRegionEnd);
+ Entries.clear();
+}
+
+const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) {
+ MCSymbol *CPEntryLabel = Context.CreateTempSymbol();
+
+ Entries.push_back(std::make_pair(CPEntryLabel, Value));
+ return MCSymbolRefExpr::Create(CPEntryLabel, Context);
+}
+
+bool ConstantPool::empty() { return Entries.empty(); }
+
+//
+// AssemblerConstantPools implementation
+//
+ConstantPool *
+AssemblerConstantPools::getConstantPool(const MCSection *Section) {
+ ConstantPoolMapTy::iterator CP = ConstantPools.find(Section);
+ if (CP == ConstantPools.end())
+ return nullptr;
+
+ return &CP->second;
+}
+
+ConstantPool &
+AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) {
+ return ConstantPools[Section];
+}
+
+static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section,
+ ConstantPool &CP) {
+ if (!CP.empty()) {
+ Streamer.SwitchSection(Section);
+ CP.emitEntries(Streamer);
+ }
+}
+
+void AssemblerConstantPools::emitAll(MCStreamer &Streamer) {
+ // Dump contents of assembler constant pools.
+ for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(),
+ CPE = ConstantPools.end();
+ CPI != CPE; ++CPI) {
+ const MCSection *Section = CPI->first;
+ ConstantPool &CP = CPI->second;
+
+ emitConstantPool(Streamer, Section, CP);
+ }
+}
+
+void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) {
+ const MCSection *Section = Streamer.getCurrentSection().first;
+ if (ConstantPool *CP = getConstantPool(Section)) {
+ emitConstantPool(Streamer, Section, *CP);
+ }
+}
+
+const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer,
+ const MCExpr *Expr) {
+ const MCSection *Section = Streamer.getCurrentSection().first;
+ return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext());
+}
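How a target assembly parser or streamer might drive the new class — a hypothetical sketch, not code from this commit; lowerLiteral, finishStream, and the literal-pool policy are assumptions:

#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCStreamer.h"

// Route a literal's value into the current section's pool and get back a
// reference to the temp label that will carry it.
const llvm::MCExpr *lowerLiteral(llvm::AssemblerConstantPools &Pools,
                                 llvm::MCStreamer &Streamer,
                                 const llvm::MCExpr *Value) {
  return Pools.addEntry(Streamer, Value);
}

// Flush every per-section pool once the stream is finalized.
void finishStream(llvm::AssemblerConstantPools &Pools,
                  llvm::MCStreamer &Streamer) {
  Pools.emitAll(Streamer);
}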
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 0a54627..7fb9fae 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -28,7 +28,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/StringTableBuilder.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
@@ -1179,7 +1179,7 @@ prependCompressionHeader(uint64_t Size,
if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size())
return false;
if (sys::IsLittleEndianHost)
- Size = sys::SwapByteOrder(Size);
+ sys::swapByteOrder(Size);
CompressedContents.insert(CompressedContents.begin(),
Magic.size() + sizeof(Size), 0);
std::copy(Magic.begin(), Magic.end(), CompressedContents.begin());
@@ -1565,6 +1565,7 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
case ELF::SHT_X86_64_UNWIND:
case ELF::SHT_MIPS_REGINFO:
case ELF::SHT_MIPS_OPTIONS:
+ case ELF::SHT_MIPS_ABIFLAGS:
// Nothing to do.
break;
@@ -1574,8 +1575,7 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
break;
default:
- assert(0 && "FIXME: sh_type value not supported!");
- break;
+ llvm_unreachable("FIXME: sh_type value not supported!");
}
if (TargetObjectWriter->getEMachine() == ELF::EM_ARM &&
diff --git a/lib/MC/LLVMBuild.txt b/lib/MC/LLVMBuild.txt
index f35dbe4..3fcb50b 100644
--- a/lib/MC/LLVMBuild.txt
+++ b/lib/MC/LLVMBuild.txt
@@ -16,10 +16,10 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = MCDisassembler MCParser
+subdirectories = MCAnalysis MCDisassembler MCParser
[component_0]
type = Library
name = MC
parent = Libraries
-required_libraries = Object Support
+required_libraries = Support
diff --git a/lib/MC/MCAnalysis/Android.mk b/lib/MC/MCAnalysis/Android.mk
new file mode 100644
index 0000000..27f848a
--- /dev/null
+++ b/lib/MC/MCAnalysis/Android.mk
@@ -0,0 +1,37 @@
+LOCAL_PATH:= $(call my-dir)
+
+mc_analysis_SRC_FILES := \
+ MCAtom.cpp \
+ MCFunction.cpp \
+ MCModule.cpp \
+ MCModuleYAML.cpp \
+ MCObjectDisassembler.cpp \
+ MCObjectSymbolizer.cpp
+
+# For the host
+# =====================================================
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(mc_analysis_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMMCAnalysis
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_HOST_BUILD_MK)
+include $(BUILD_HOST_STATIC_LIBRARY)
+
+# For the device
+# =====================================================
+include $(CLEAR_VARS)
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
+
+LOCAL_SRC_FILES := $(mc_analysis_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMMCAnalysis
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_DEVICE_BUILD_MK)
+include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/MC/MCAnalysis/CMakeLists.txt b/lib/MC/MCAnalysis/CMakeLists.txt
new file mode 100644
index 0000000..81eae2d
--- /dev/null
+++ b/lib/MC/MCAnalysis/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_library(LLVMMCAnalysis
+ MCAtom.cpp
+ MCFunction.cpp
+ MCModule.cpp
+ MCModuleYAML.cpp
+ MCObjectDisassembler.cpp
+ MCObjectSymbolizer.cpp
+)
diff --git a/lib/MC/MCAnalysis/LLVMBuild.txt b/lib/MC/MCAnalysis/LLVMBuild.txt
new file mode 100644
index 0000000..1b58fec
--- /dev/null
+++ b/lib/MC/MCAnalysis/LLVMBuild.txt
@@ -0,0 +1,5 @@
+[component_0]
+type = Library
+name = MCAnalysis
+parent = Libraries
+required_libraries = MC Object Support
diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAnalysis/MCAtom.cpp
index bc353cd..82056ee 100644
--- a/lib/MC/MCAtom.cpp
+++ b/lib/MC/MCAnalysis/MCAtom.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAtom.h"
-#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include "llvm/Support/ErrorHandling.h"
#include <iterator>
diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCAnalysis/MCFunction.cpp
index 1ddc250..4e09d1a 100644
--- a/lib/MC/MCFunction.cpp
+++ b/lib/MC/MCAnalysis/MCFunction.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCFunction.h"
-#include "llvm/MC/MCAtom.h"
-#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCAnalysis/MCFunction.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCAnalysis/MCModule.cpp
index 3ed7356..7512299 100644
--- a/lib/MC/MCModule.cpp
+++ b/lib/MC/MCAnalysis/MCModule.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCModule.h"
-#include "llvm/MC/MCAtom.h"
-#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCFunction.h"
#include <algorithm>
using namespace llvm;
diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCAnalysis/MCModuleYAML.cpp
index f81cb14..876b06d 100644
--- a/lib/MC/MCModuleYAML.cpp
+++ b/lib/MC/MCAnalysis/MCModuleYAML.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCModuleYAML.h"
+#include "llvm/MC/MCAnalysis/MCModuleYAML.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/MC/MCAtom.h"
-#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCFunction.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Object/YAML.h"
+#include "llvm/MC/YAML.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
@@ -102,7 +102,7 @@ struct Atom {
uint64_t Size;
std::vector<Inst> Insts;
- object::yaml::BinaryRef Data;
+ yaml::BinaryRef Data;
};
struct BasicBlock {
@@ -453,7 +453,7 @@ StringRef yaml2mcmodule(std::unique_ptr<MCModule> &MCM, StringRef YamlContent,
InstrRegInfoHolder IRI(MII, MRI);
yaml::Input YIn(YamlContent, (void *)&IRI);
YIn >> YAMLModule;
- if (error_code ec = YIn.error())
+ if (std::error_code ec = YIn.error())
return ec.message();
StringRef err = Parser.parse(YAMLModule);
if (!err.empty())
diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCAnalysis/MCObjectDisassembler.cpp
index 8a258cb..0f789ff 100644
--- a/lib/MC/MCObjectDisassembler.cpp
+++ b/lib/MC/MCAnalysis/MCObjectDisassembler.cpp
@@ -13,11 +13,11 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCFunction.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInstrAnalysis.h"
-#include "llvm/MC/MCModule.h"
#include "llvm/MC/MCObjectSymbolizer.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp
index b149596..b149596 100644
--- a/lib/MC/MCObjectSymbolizer.cpp
+++ b/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp
diff --git a/unittests/Object/Makefile b/lib/MC/MCAnalysis/Makefile
index 9062149..add2dbd 100644
--- a/unittests/Object/Makefile
+++ b/lib/MC/MCAnalysis/Makefile
@@ -1,4 +1,4 @@
-##===- unittests/Object/Makefile ---------------------------*- Makefile -*-===##
+##===- lib/MC/MCAnalysis/Makefile --------------------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
@@ -7,9 +7,8 @@
#
##===----------------------------------------------------------------------===##
-LEVEL = ../..
-TESTNAME = Object
-LINK_COMPONENTS := object
+LEVEL = ../../..
+LIBRARYNAME = LLVMMCAnalysis
+BUILD_ARCHIVE := 1
-include $(LEVEL)/Makefile.config
-include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
+include $(LEVEL)/Makefile.common
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index c0777a6..f8081ef 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -39,7 +39,7 @@ MCAsmInfo::MCAsmInfo() {
SeparatorString = ";";
CommentString = "#";
LabelSuffix = ":";
- DebugLabelSuffix = ":";
+ UseAssignmentForEHBegin = false;
PrivateGlobalPrefix = "L";
LinkerPrivateGlobalPrefix = "";
InlineAsmStart = "APP";
@@ -82,6 +82,7 @@ MCAsmInfo::MCAsmInfo() {
HasLEB128 = false;
SupportsDebugInformation = false;
ExceptionsType = ExceptionHandling::None;
+ WinEHEncodingType = WinEH::EncodingType::ET_Invalid;
DwarfUsesRelocationsAcrossSections = true;
DwarfFDESymbolsUseAbsDiff = false;
DwarfRegNumForCFI = false;
@@ -99,7 +100,7 @@ MCAsmInfo::MCAsmInfo() {
// - MCAsmInfoDarwin is handling this case
// - Generic_GCC toolchains enable the integrated assembler on a per
// architecture basis.
- // - The target subclasses for AArch64, ARM, and X86 handle these cases
+ // - The target subclasses for AArch64, ARM, and X86 handle these cases
UseIntegratedAssembler = false;
CompressDebugSections = false;
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 7f8ae54..6973bbb 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -120,7 +120,6 @@ public:
void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
void EmitLabel(MCSymbol *Symbol) override;
- void EmitDebugLabel(MCSymbol *Symbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -213,20 +212,20 @@ public:
void EmitCFIRegister(int64_t Register1, int64_t Register2) override;
void EmitCFIWindowSave() override;
- void EmitWin64EHStartProc(const MCSymbol *Symbol) override;
- void EmitWin64EHEndProc() override;
- void EmitWin64EHStartChained() override;
- void EmitWin64EHEndChained() override;
- void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
- bool Except) override;
- void EmitWin64EHHandlerData() override;
- void EmitWin64EHPushReg(unsigned Register) override;
- void EmitWin64EHSetFrame(unsigned Register, unsigned Offset) override;
- void EmitWin64EHAllocStack(unsigned Size) override;
- void EmitWin64EHSaveReg(unsigned Register, unsigned Offset) override;
- void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) override;
- void EmitWin64EHPushFrame(bool Code) override;
- void EmitWin64EHEndProlog() override;
+ void EmitWinCFIStartProc(const MCSymbol *Symbol) override;
+ void EmitWinCFIEndProc() override;
+ void EmitWinCFIStartChained() override;
+ void EmitWinCFIEndChained() override;
+ void EmitWinCFIPushReg(unsigned Register) override;
+ void EmitWinCFISetFrame(unsigned Register, unsigned Offset) override;
+ void EmitWinCFIAllocStack(unsigned Size) override;
+ void EmitWinCFISaveReg(unsigned Register, unsigned Offset) override;
+ void EmitWinCFISaveXMM(unsigned Register, unsigned Offset) override;
+ void EmitWinCFIPushFrame(bool Code) override;
+ void EmitWinCFIEndProlog() override;
+
+ void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override;
+ void EmitWinEHHandlerData() override;
void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
@@ -334,14 +333,6 @@ void MCAsmStreamer::EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {
EmitEOL();
}
-void MCAsmStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- MCStreamer::EmitDebugLabel(Symbol);
-
- OS << *Symbol << MAI->getDebugLabelSuffix();
- EmitEOL();
-}
-
void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
switch (Flag) {
case MCAF_SyntaxUnified: OS << "\t.syntax unified"; break;
@@ -944,10 +935,7 @@ void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
}
void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
- // Put a dummy non-null value in Frame.End to mark that this frame has been
- // closed.
- Frame.End = (MCSymbol *) 1;
-
+ MCStreamer::EmitCFIEndProcImpl(Frame);
OS << "\t.cfi_endproc";
EmitEOL();
}
@@ -1061,37 +1049,37 @@ void MCAsmStreamer::EmitCFIWindowSave() {
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
- MCStreamer::EmitWin64EHStartProc(Symbol);
+void MCAsmStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) {
+ MCStreamer::EmitWinCFIStartProc(Symbol);
OS << ".seh_proc " << *Symbol;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHEndProc() {
- MCStreamer::EmitWin64EHEndProc();
+void MCAsmStreamer::EmitWinCFIEndProc() {
+ MCStreamer::EmitWinCFIEndProc();
OS << "\t.seh_endproc";
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHStartChained() {
- MCStreamer::EmitWin64EHStartChained();
+void MCAsmStreamer::EmitWinCFIStartChained() {
+ MCStreamer::EmitWinCFIStartChained();
OS << "\t.seh_startchained";
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHEndChained() {
- MCStreamer::EmitWin64EHEndChained();
+void MCAsmStreamer::EmitWinCFIEndChained() {
+ MCStreamer::EmitWinCFIEndChained();
OS << "\t.seh_endchained";
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
- bool Except) {
- MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except);
+void MCAsmStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind,
+ bool Except) {
+ MCStreamer::EmitWinEHHandler(Sym, Unwind, Except);
OS << "\t.seh_handler " << *Sym;
if (Unwind)
@@ -1114,8 +1102,8 @@ static const MCSection *getWin64EHTableSection(StringRef suffix,
SectionKind::getDataRel());
}
-void MCAsmStreamer::EmitWin64EHHandlerData() {
- MCStreamer::EmitWin64EHHandlerData();
+void MCAsmStreamer::EmitWinEHHandlerData() {
+ MCStreamer::EmitWinEHHandlerData();
// Switch sections. Don't call SwitchSection directly, because that will
// cause the section switch to be visible in the emitted assembly.
@@ -1131,50 +1119,43 @@ void MCAsmStreamer::EmitWin64EHHandlerData() {
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) {
- MCStreamer::EmitWin64EHPushReg(Register);
+void MCAsmStreamer::EmitWinCFIPushReg(unsigned Register) {
+ MCStreamer::EmitWinCFIPushReg(Register);
- OS << "\t.seh_pushreg ";
- EmitRegisterName(Register);
+ OS << "\t.seh_pushreg " << Register;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
- MCStreamer::EmitWin64EHSetFrame(Register, Offset);
+void MCAsmStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset) {
+ MCStreamer::EmitWinCFISetFrame(Register, Offset);
- OS << "\t.seh_setframe ";
- EmitRegisterName(Register);
- OS << ", " << Offset;
+ OS << "\t.seh_setframe " << Register << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) {
- MCStreamer::EmitWin64EHAllocStack(Size);
+void MCAsmStreamer::EmitWinCFIAllocStack(unsigned Size) {
+ MCStreamer::EmitWinCFIAllocStack(Size);
OS << "\t.seh_stackalloc " << Size;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
- MCStreamer::EmitWin64EHSaveReg(Register, Offset);
+void MCAsmStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset) {
+ MCStreamer::EmitWinCFISaveReg(Register, Offset);
- OS << "\t.seh_savereg ";
- EmitRegisterName(Register);
- OS << ", " << Offset;
+ OS << "\t.seh_savereg " << Register << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
- MCStreamer::EmitWin64EHSaveXMM(Register, Offset);
+void MCAsmStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
+ MCStreamer::EmitWinCFISaveXMM(Register, Offset);
- OS << "\t.seh_savexmm ";
- EmitRegisterName(Register);
- OS << ", " << Offset;
+ OS << "\t.seh_savexmm " << Register << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) {
- MCStreamer::EmitWin64EHPushFrame(Code);
+void MCAsmStreamer::EmitWinCFIPushFrame(bool Code) {
+ MCStreamer::EmitWinCFIPushFrame(Code);
OS << "\t.seh_pushframe";
if (Code)
@@ -1182,8 +1163,8 @@ void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) {
EmitEOL();
}
-void MCAsmStreamer::EmitWin64EHEndProlog(void) {
- MCStreamer::EmitWin64EHEndProlog();
+void MCAsmStreamer::EmitWinCFIEndProlog(void) {
+ MCStreamer::EmitWinCFIEndProlog();
OS << "\t.seh_endprologue";
EmitEOL();
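A short sketch of the renamed interface from the caller's side, assuming an MCStreamer configured for COFF output; emitSEHPrologue and the operand values are illustrative only:

#include "llvm/MC/MCStreamer.h"

// With an MCAsmStreamer this prints:
//   .seh_proc <fn>, .seh_pushreg 5, .seh_stackalloc 40, .seh_endprologue
void emitSEHPrologue(llvm::MCStreamer &Streamer, const llvm::MCSymbol *Fn) {
  Streamer.EmitWinCFIStartProc(Fn);
  Streamer.EmitWinCFIPushReg(5);   // register numbers now print as-is
  Streamer.EmitWinCFIAllocStack(40);
  Streamer.EmitWinCFIEndProlog();
}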
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 886a5f5..a8aad71 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/LEB128.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCSectionELF.h"
#include <tuple>
using namespace llvm;
@@ -433,12 +434,27 @@ const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const {
return SD->getFragment()->getAtom();
}
+// Try to fully compute Expr to an absolute value; if that fails, produce a
+// relocatable expr.
+// FIXME: Should this be the behavior of EvaluateAsRelocatable itself?
+static bool evaluate(const MCExpr &Expr, const MCAsmLayout &Layout,
+ MCValue &Target) {
+ if (Expr.EvaluateAsValue(Target, &Layout))
+ if (Target.isAbsolute())
+ return true;
+ return Expr.EvaluateAsRelocatable(Target, &Layout);
+}
+
bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
MCValue &Target, uint64_t &Value) const {
++stats::evaluateFixup;
- if (!Fixup.getValue()->EvaluateAsRelocatable(Target, &Layout))
+ // FIXME: This code has some duplication with RecordRelocation. We should
+ // probably merge the two into a single callback that tries to evaluate a
+ // fixup and records a relocation if one is needed.
+ const MCExpr *Expr = Fixup.getValue();
+ if (!evaluate(*Expr, Layout, Target))
getContext().FatalError(Fixup.getLoc(), "expected relocatable expression");
bool IsPCRel = Backend.getFixupKindInfo(
@@ -782,8 +798,13 @@ void MCAssembler::writeSectionData(const MCSectionData *SD,
assert(DF.fixup_begin() == DF.fixup_end() &&
"Cannot have fixups in virtual section!");
for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i)
- assert(DF.getContents()[i] == 0 &&
- "Invalid data value for virtual section!");
+ if (DF.getContents()[i]) {
+ if (auto *ELFSec = dyn_cast<const MCSectionELF>(&SD->getSection()))
+ report_fatal_error("non-zero initializer found in section '" +
+ ELFSec->getSectionName() + "'");
+ else
+ report_fatal_error("non-zero initializer found in virtual section");
+ }
break;
}
case MCFragment::FT_Align:
@@ -1222,7 +1243,7 @@ void MCSectionData::dump() {
OS << "]>";
}
-void MCSymbolData::dump() {
+void MCSymbolData::dump() const {
raw_ostream &OS = llvm::errs();
OS << "<MCSymbolData Symbol:" << getSymbol()
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index c163268..960a071 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -39,7 +39,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
AllowTemporaryLabels(true), DwarfCompileUnitID(0),
AutoReset(DoAutoReset) {
- error_code EC = llvm::sys::fs::current_path(CompilationDir);
+ std::error_code EC = llvm::sys::fs::current_path(CompilationDir);
if (EC)
CompilationDir.clear();
@@ -47,8 +47,9 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
SecureLog = nullptr;
SecureLogUsed = false;
- if (SrcMgr && SrcMgr->getNumBuffers() > 0)
- MainFileName = SrcMgr->getMemoryBuffer(0)->getBufferIdentifier();
+ if (SrcMgr && SrcMgr->getNumBuffers())
+ MainFileName =
+ SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())->getBufferIdentifier();
}
MCContext::~MCContext() {
@@ -277,14 +278,15 @@ const MCSectionELF *MCContext::CreateELFGroupSection() {
return Result;
}
-const MCSectionCOFF *
-MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
- SectionKind Kind, StringRef COMDATSymName,
- int Selection, const MCSectionCOFF *Assoc) {
+const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
+ unsigned Characteristics,
+ SectionKind Kind,
+ StringRef COMDATSymName,
+ int Selection) {
// Do the lookup, if we have a hit, return it.
- SectionGroupPair P(Section, COMDATSymName);
- auto IterBool = COFFUniquingMap.insert(std::make_pair(P, nullptr));
+ SectionGroupTriple T(Section, COMDATSymName, Selection);
+ auto IterBool = COFFUniquingMap.insert(std::make_pair(T, nullptr));
auto Iter = IterBool.first;
if (!IterBool.second)
return Iter->second;
@@ -293,9 +295,9 @@ MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
if (!COMDATSymName.empty())
COMDATSymbol = GetOrCreateSymbol(COMDATSymName);
- StringRef CachedName = Iter->first.first;
- MCSectionCOFF *Result = new (*this) MCSectionCOFF(
- CachedName, Characteristics, COMDATSymbol, Selection, Assoc, Kind);
+ StringRef CachedName = std::get<0>(Iter->first);
+ MCSectionCOFF *Result = new (*this)
+ MCSectionCOFF(CachedName, Characteristics, COMDATSymbol, Selection, Kind);
Iter->second = Result;
return Result;
@@ -308,8 +310,8 @@ MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
}
const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
- SectionGroupPair P(Section, "");
- auto Iter = COFFUniquingMap.find(P);
+ SectionGroupTriple T(Section, "", 0);
+ auto Iter = COFFUniquingMap.find(T);
if (Iter == COFFUniquingMap.end())
return nullptr;
return Iter->second;
@@ -339,6 +341,29 @@ bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) {
return !MCDwarfFiles[FileNumber].Name.empty();
}
+/// finalizeDwarfSections - Emit end symbols for each non-empty code section.
+/// Also remove empty sections from SectionStartEndSyms, to avoid generating
+/// useless debug info for them.
+void MCContext::finalizeDwarfSections(MCStreamer &MCOS) {
+ MCContext &context = MCOS.getContext();
+
+ auto sec = SectionStartEndSyms.begin();
+ while (sec != SectionStartEndSyms.end()) {
+ assert(sec->second.first && "Start symbol must be set by now");
+ MCOS.SwitchSection(sec->first);
+ if (MCOS.mayHaveInstructions()) {
+ MCSymbol *SectionEndSym = context.CreateTempSymbol();
+ MCOS.EmitLabel(SectionEndSym);
+ sec->second.second = SectionEndSym;
+ ++sec;
+ } else {
+ MapVector<const MCSection *, std::pair<MCSymbol *, MCSymbol *> >::iterator
+ to_erase = sec;
+ sec = SectionStartEndSyms.erase(to_erase);
+ }
+ }
+}
+
void MCContext::FatalError(SMLoc Loc, const Twine &Msg) const {
// If we have a source manager and a location, use it. Otherwise just
// use the generic report_fatal_error().
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index be6731a..0a3fab8 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -518,8 +519,12 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4);
- EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
- EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr);
+ if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1) {
+ EmitAbbrev(MCOS, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4);
+ } else {
+ EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
+ EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr);
+ }
EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string);
if (!context.getCompilationDir().empty())
EmitAbbrev(MCOS, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string);
@@ -552,20 +557,14 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
}
// When generating dwarf for assembly source files this emits the data for
-// .debug_aranges section. Which contains a header and a table of pairs of
-// PointerSize'ed values for the address and size of section(s) with line table
-// entries (just the default .text in our case) and a terminating pair of zeros.
+// .debug_aranges section. This section contains a header and a table of pairs
+// of PointerSize'ed values for the address and size of section(s) with line
+// table entries.
static void EmitGenDwarfAranges(MCStreamer *MCOS,
const MCSymbol *InfoSectionSymbol) {
MCContext &context = MCOS->getContext();
- // Create a symbol at the end of the section that we are creating the dwarf
- // debugging info to use later in here as part of the expression to calculate
- // the size of the section for the table.
- MCOS->SwitchSection(context.getGenDwarfSection());
- MCSymbol *SectionEndSym = context.CreateTempSymbol();
- MCOS->EmitLabel(SectionEndSym);
- context.setGenDwarfSectionEndSym(SectionEndSym);
+ auto &Sections = context.getGenDwarfSectionSyms();
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
@@ -583,8 +582,8 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
Length += Pad;
// Add the size of the pair of PointerSize'ed values for the address and size
- // of the one default .text section we have in the table.
- Length += 2 * AddrSize;
+ // of each section we have in the table.
+ Length += 2 * AddrSize * Sections.size();
// And the pair of terminating zeros.
Length += 2 * AddrSize;
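// Aside: a worked instance of the Length arithmetic, assuming a 64-bit target
// (AddrSize = 8), three non-empty code sections, and the usual 12-byte
// aranges header this emitter builds before padding.
#include <cassert>

int main() {
  const int AddrSize = 8;
  const int NumSections = 3;
  int Length = 4 + 2 + 4 + 1 + 1; // Unit length, version, info offset, sizes.
  int Pad = 2 * AddrSize - (Length & (2 * AddrSize - 1));
  if (Pad == 2 * AddrSize)
    Pad = 0;                            // Already aligned to a tuple boundary.
  Length += Pad;                        // 12 + 4 = 16.
  Length += 2 * AddrSize * NumSections; // One (address, size) pair per section.
  Length += 2 * AddrSize;               // Terminating pair of zeros.
  assert(Length == 16 + 48 + 16);       // 80 bytes for this example.
}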
@@ -608,14 +607,21 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
for(int i = 0; i < Pad; i++)
MCOS->EmitIntValue(0, 1);
- // Now emit the table of pairs of PointerSize'ed values for the section(s)
- // address and size, in our case just the one default .text section.
- const MCExpr *Addr = MCSymbolRefExpr::Create(
- context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context);
- const MCExpr *Size = MakeStartMinusEndExpr(*MCOS,
- *context.getGenDwarfSectionStartSym(), *SectionEndSym, 0);
- MCOS->EmitValue(Addr, AddrSize);
- MCOS->EmitAbsValue(Size, AddrSize);
+ // Now emit the table of pairs of PointerSize'ed values for the section
+ // addresses and sizes.
+ for (const auto &sec : Sections) {
+ MCSymbol *StartSymbol = sec.second.first;
+ MCSymbol *EndSymbol = sec.second.second;
+ assert(StartSymbol && "StartSymbol must not be NULL");
+ assert(EndSymbol && "EndSymbol must not be NULL");
+
+ const MCExpr *Addr = MCSymbolRefExpr::Create(
+ StartSymbol, MCSymbolRefExpr::VK_None, context);
+ const MCExpr *Size = MakeStartMinusEndExpr(*MCOS,
+ *StartSymbol, *EndSymbol, 0);
+ MCOS->EmitValue(Addr, AddrSize);
+ MCOS->EmitAbsValue(Size, AddrSize);
+ }
// And finally the pair of terminating zeros.
MCOS->EmitIntValue(0, AddrSize);
@@ -627,7 +633,8 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
// DIE and a list of label DIEs.
static void EmitGenDwarfInfo(MCStreamer *MCOS,
const MCSymbol *AbbrevSectionSymbol,
- const MCSymbol *LineSectionSymbol) {
+ const MCSymbol *LineSectionSymbol,
+ const MCSymbol *RangesSectionSymbol) {
MCContext &context = MCOS->getContext();
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
@@ -674,15 +681,37 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
MCOS->EmitIntValue(0, 4);
}
- // AT_low_pc, the first address of the default .text section.
- const MCExpr *Start = MCSymbolRefExpr::Create(
- context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context);
- MCOS->EmitValue(Start, AddrSize);
+ if (RangesSectionSymbol) {
+    // There are multiple sections containing code, so we must use the
+    // .debug_ranges section.
- // AT_high_pc, the last address of the default .text section.
- const MCExpr *End = MCSymbolRefExpr::Create(
- context.getGenDwarfSectionEndSym(), MCSymbolRefExpr::VK_None, context);
- MCOS->EmitValue(End, AddrSize);
+  // AT_ranges, the 4-byte offset from the start of the .debug_ranges section
+ // to the address range list for this compilation unit.
+ MCOS->EmitSymbolValue(RangesSectionSymbol, 4);
+ } else {
+ // If we only have one non-empty code section, we can use the simpler
+ // AT_low_pc and AT_high_pc attributes.
+
+    // Find the first (and only) non-empty text section.
+ auto &Sections = context.getGenDwarfSectionSyms();
+ const auto TextSection = Sections.begin();
+ assert(TextSection != Sections.end() && "No text section found");
+
+ MCSymbol *StartSymbol = TextSection->second.first;
+ MCSymbol *EndSymbol = TextSection->second.second;
+ assert(StartSymbol && "StartSymbol must not be NULL");
+ assert(EndSymbol && "EndSymbol must not be NULL");
+
+ // AT_low_pc, the first address of the default .text section.
+ const MCExpr *Start = MCSymbolRefExpr::Create(
+ StartSymbol, MCSymbolRefExpr::VK_None, context);
+ MCOS->EmitValue(Start, AddrSize);
+
+ // AT_high_pc, the last address of the default .text section.
+ const MCExpr *End = MCSymbolRefExpr::Create(
+ EndSymbol, MCSymbolRefExpr::VK_None, context);
+ MCOS->EmitValue(End, AddrSize);
+ }
// AT_name, the name of the source file. Reconstruct from the first directory
// and file table entries.
@@ -766,13 +795,51 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
MCOS->EmitLabel(InfoEnd);
}
+// When generating dwarf for assembly source files this emits the data for
+// the .debug_ranges section. We only emit one range list, which spans all of
+// the executable sections of this file.
+static void EmitGenDwarfRanges(MCStreamer *MCOS) {
+ MCContext &context = MCOS->getContext();
+ auto &Sections = context.getGenDwarfSectionSyms();
+
+ const MCAsmInfo *AsmInfo = context.getAsmInfo();
+ int AddrSize = AsmInfo->getPointerSize();
+
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
+
+  for (const auto &sec : Sections) {
+
+ MCSymbol *StartSymbol = sec.second.first;
+ MCSymbol *EndSymbol = sec.second.second;
+ assert(StartSymbol && "StartSymbol must not be NULL");
+ assert(EndSymbol && "EndSymbol must not be NULL");
+
+    // Emit a base address selection entry for the start of this section.
+ const MCExpr *SectionStartAddr = MCSymbolRefExpr::Create(
+ StartSymbol, MCSymbolRefExpr::VK_None, context);
+ MCOS->EmitFill(AddrSize, 0xFF);
+ MCOS->EmitValue(SectionStartAddr, AddrSize);
+
+    // Emit a range list entry spanning this section.
+ const MCExpr *SectionSize = MakeStartMinusEndExpr(*MCOS,
+ *StartSymbol, *EndSymbol, 0);
+ MCOS->EmitIntValue(0, AddrSize);
+ MCOS->EmitAbsValue(SectionSize, AddrSize);
+ }
+
+  // Emit the end-of-list entry.
+ MCOS->EmitIntValue(0, AddrSize);
+ MCOS->EmitIntValue(0, AddrSize);
+}
+
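// Aside: the byte-level shape of the range list emitted above, sketched with
// a plain vector instead of the MCStreamer API. Each section contributes a
// base address selection entry (all-ones word, then the base) followed by a
// (start, end) pair relative to that base; a (0, 0) pair ends the list.
#include <cstdint>
#include <vector>

std::vector<uint64_t> makeRangeList(uint64_t SectionAddr,
                                    uint64_t SectionSize) {
  std::vector<uint64_t> Words;
  Words.push_back(~0ULL);       // Base address selection entry, first word.
  Words.push_back(SectionAddr); // Second word: the new base address.
  Words.push_back(0);           // Range start, relative to the base.
  Words.push_back(SectionSize); // Range end, relative to the base.
  Words.push_back(0);           // End-of-list entry:
  Words.push_back(0);           // both words zero.
  return Words;
}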
//
// When generating dwarf for assembly source files this emits the Dwarf
// sections.
//
void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
- // Create the dwarf sections in this order (.debug_line already created).
MCContext &context = MCOS->getContext();
+
+ // Create the dwarf sections in this order (.debug_line already created).
const MCAsmInfo *AsmInfo = context.getAsmInfo();
bool CreateDwarfSectionSymbols =
AsmInfo->doesDwarfUseRelocationsAcrossSections();
@@ -781,6 +848,22 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0);
MCSymbol *AbbrevSectionSymbol = nullptr;
MCSymbol *InfoSectionSymbol = nullptr;
+  MCSymbol *RangesSectionSymbol = nullptr;
+
+  // Create end symbols for each section and remove empty sections.
+ MCOS->getContext().finalizeDwarfSections(*MCOS);
+
+  // If there are no sections to generate debug info for, we don't need to
+  // do anything.
+ if (MCOS->getContext().getGenDwarfSectionSyms().empty())
+ return;
+
+ // We only need to use the .debug_ranges section if we have multiple
+ // code sections.
+ const bool UseRangesSection =
+ MCOS->getContext().getGenDwarfSectionSyms().size() > 1;
+ CreateDwarfSectionSymbols |= UseRangesSection;
+
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
if (CreateDwarfSectionSymbols) {
InfoSectionSymbol = context.CreateTempSymbol();
@@ -791,20 +874,30 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
AbbrevSectionSymbol = context.CreateTempSymbol();
MCOS->EmitLabel(AbbrevSectionSymbol);
}
- MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
+ if (UseRangesSection) {
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
+ if (CreateDwarfSectionSymbols) {
+ RangesSectionSymbol = context.CreateTempSymbol();
+ MCOS->EmitLabel(RangesSectionSymbol);
+ }
+ }
- // If there are no line table entries then do not emit any section contents.
- if (!context.hasMCLineSections())
- return;
+  assert((RangesSectionSymbol != nullptr) || !UseRangesSection);
+
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
// Output the data for .debug_aranges section.
EmitGenDwarfAranges(MCOS, InfoSectionSymbol);
+ if (UseRangesSection)
+ EmitGenDwarfRanges(MCOS);
+
// Output the data for .debug_abbrev section.
EmitGenDwarfAbbrev(MCOS);
// Output the data for .debug_info section.
- EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol);
+ EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol,
+ RangesSectionSymbol);
}
//
@@ -815,12 +908,13 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
//
void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
SourceMgr &SrcMgr, SMLoc &Loc) {
- // We won't create dwarf labels for temporary symbols or symbols not in
- // the default text.
+ // We won't create dwarf labels for temporary symbols.
if (Symbol->isTemporary())
return;
MCContext &context = MCOS->getContext();
- if (context.getGenDwarfSection() != MCOS->getCurrentSection().first)
+ // We won't create dwarf labels for symbols in sections that we are not
+ // generating debug info for.
+ if (!context.getGenDwarfSectionSyms().count(MCOS->getCurrentSection().first))
return;
// The dwarf label's name does not have the symbol name's leading
@@ -834,7 +928,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
// Finding the line number is the expensive part which is why we just don't
// pass it in as for some symbols we won't create a dwarf label.
- int CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);
+ unsigned CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);
unsigned LineNumber = SrcMgr.FindLineNumber(Loc, CurBuffer);
// We create a temporary symbol for use for the AT_high_pc and AT_low_pc
@@ -1203,7 +1297,7 @@ void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
unsigned FDEEncoding = MOFI->getFDEEncoding();
unsigned Size = getSizeForEncoding(Streamer, FDEEncoding);
if (VerboseAsm) Streamer.AddComment("Range Start");
- Streamer.EmitSymbolValue(Frame.Function, Size);
+ Streamer.EmitSymbolValue(Frame.Begin, Size);
// Range Length
const MCExpr *Range = MakeStartMinusEndExpr(Streamer, *Frame.Begin,
@@ -1246,12 +1340,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer,
const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
bool verboseAsm = streamer.isVerboseAsm();
- MCSymbol *sectionStart;
- if (MOFI->isFunctionEHFrameSymbolPrivate() || !IsEH)
- sectionStart = context.CreateTempSymbol();
- else
- sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum));
-
+ MCSymbol *sectionStart = context.CreateTempSymbol();
streamer.EmitLabel(sectionStart);
CIENum++;
@@ -1270,7 +1359,10 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer,
// Version
if (verboseAsm) streamer.AddComment("DW_CIE_VERSION");
- streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1);
+  // For DWARF2, we use CIE version 1.
+  // For DWARF3+, we use CIE version 3.
+ uint8_t CIEVersion = context.getDwarfVersion() <= 2 ? 1 : 3;
+ streamer.EmitIntValue(CIEVersion, 1);
// Augmentation String
SmallString<8> Augmentation;
@@ -1298,7 +1390,14 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer,
// Return Address Register
if (verboseAsm) streamer.AddComment("CIE Return Address Column");
- streamer.EmitULEB128IntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true));
+ if (CIEVersion == 1) {
+ assert(MRI->getRARegister() <= 255 &&
+ "DWARF 2 encodes return_address_register in one byte");
+ streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true), 1);
+ } else {
+ streamer.EmitULEB128IntValue(
+ MRI->getDwarfRegNum(MRI->getRARegister(), true));
+ }
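// Aside: the version split above in isolation — CIE version 1 (DWARF 2)
// stores the return address column as one byte, version 3 as a ULEB128.
// A byte vector stands in for the streamer here.
#include <cassert>
#include <cstdint>
#include <vector>

void emitReturnAddressColumn(std::vector<uint8_t> &Out, unsigned Reg,
                             uint8_t CIEVersion) {
  if (CIEVersion == 1) {
    assert(Reg <= 255 && "DWARF 2 encodes return_address_register in one byte");
    Out.push_back(static_cast<uint8_t>(Reg));
  } else {
    do { // ULEB128: 7 data bits per byte, high bit marks continuation.
      uint8_t Byte = Reg & 0x7f;
      Reg >>= 7;
      if (Reg)
        Byte |= 0x80;
      Out.push_back(Byte);
    } while (Reg);
  }
}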
// Augmentation Data Length (optional)
@@ -1360,13 +1459,6 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer,
const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
bool verboseAsm = streamer.isVerboseAsm();
- if (IsEH && frame.Function && !MOFI->isFunctionEHFrameSymbolPrivate()) {
- MCSymbol *EHSym =
- context.GetOrCreateSymbol(frame.Function->getName() + Twine(".eh"));
- streamer.EmitEHSymAttributes(frame.Function, EHSym);
- streamer.EmitLabel(EHSym);
- }
-
// Length
const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0);
if (verboseAsm) streamer.AddComment("FDE Length");
@@ -1435,13 +1527,12 @@ namespace {
return CIEKey(nullptr, -1, 0, false, false);
}
- CIEKey(const MCSymbol* Personality_, unsigned PersonalityEncoding_,
- unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) :
- Personality(Personality_), PersonalityEncoding(PersonalityEncoding_),
- LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_),
- IsSimple(IsSimple_) {
- }
- const MCSymbol* Personality;
+ CIEKey(const MCSymbol *Personality_, unsigned PersonalityEncoding_,
+ unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_)
+ : Personality(Personality_), PersonalityEncoding(PersonalityEncoding_),
+ LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_),
+ IsSimple(IsSimple_) {}
+ const MCSymbol *Personality;
unsigned PersonalityEncoding;
unsigned LsdaEncoding;
bool IsSignalFrame;
@@ -1516,7 +1607,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
Emitter.setSectionStart(SectionStart);
MCSymbol *FDEEnd = nullptr;
- DenseMap<CIEKey, const MCSymbol*> CIEStarts;
+ DenseMap<CIEKey, const MCSymbol *> CIEStarts;
const MCSymbol *DummyDebugKey = nullptr;
NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame();
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 767348c..7c70540 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -65,10 +65,6 @@ void MCELFStreamer::EmitLabel(MCSymbol *Symbol) {
MCELF::SetType(SD, ELF::STT_TLS);
}
-void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- EmitLabel(Symbol);
-}
-
void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 37d05e9..9e8bc94 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -60,7 +60,6 @@ public:
void ChangeSection(const MCSection *Sect, const MCExpr *Subsect) override;
void EmitLabel(MCSymbol *Symbol) override;
- void EmitDebugLabel(MCSymbol *Symbol) override;
void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -162,9 +161,6 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask);
}
-void MCMachOStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- EmitLabel(Symbol);
-}
void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
if (!getAssembler().getBackend().hasDataInCodeSupport())
return;
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 4f2740e..d543402 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -24,83 +24,17 @@ namespace {
/// @name MCStreamer Interface
/// @{
- void ChangeSection(const MCSection *Section,
- const MCExpr *Subsection) override {
- }
-
- void EmitLabel(MCSymbol *Symbol) override {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- assert(getCurrentSection().first &&"Cannot emit before setting section!");
- AssignSection(Symbol, getCurrentSection().first);
- }
- void EmitDebugLabel(MCSymbol *Symbol) override {
- EmitLabel(Symbol);
- }
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override {}
- void EmitThumbFunc(MCSymbol *Func) override {}
-
- void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {}
- void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override {}
bool EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) override {
return true;
}
- void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override {}
-
- void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
- void EmitCOFFSymbolStorageClass(int StorageClass) override {}
- void EmitCOFFSymbolType(int Type) override {}
- void EndCOFFSymbolDef() override {}
void EmitCOFFSecRel32(MCSymbol const *Symbol) override {}
-
- void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override {}
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override {}
- void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) override {}
void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
- void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
- uint64_t Size, unsigned ByteAlignment) override {}
- void EmitBytes(StringRef Data) override {}
-
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc = SMLoc()) override {}
- void EmitULEB128Value(const MCExpr *Value) override {}
- void EmitSLEB128Value(const MCExpr *Value) override {}
void EmitGPRel32Value(const MCExpr *Value) override {}
- void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
- unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0) override {}
-
- void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit = 0) override {}
-
- bool EmitValueToOffset(const MCExpr *Offset,
- unsigned char Value = 0) override { return false; }
-
- void EmitFileDirective(StringRef Filename) override {}
- unsigned EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
- StringRef Filename,
- unsigned CUID = 0) override {
- return 0;
- }
- void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
- unsigned Column, unsigned Flags,
- unsigned Isa, unsigned Discriminator,
- StringRef FileName) override {}
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo&) override {}
-
- void EmitBundleAlignMode(unsigned AlignPow2) override {}
- void EmitBundleLock(bool AlignToEnd) override {}
- void EmitBundleUnlock() override {}
-
- void FinishImpl() override {}
-
- void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override {
- RecordProcEnd(Frame);
- }
};
}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 9d413af..d490ef3 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -18,9 +18,29 @@
#include "llvm/MC/MCSectionMachO.h"
using namespace llvm;
+static bool useCompactUnwind(const Triple &T) {
+  // Only on Darwin.
+ if (!T.isOSDarwin())
+ return false;
+
+  // AArch64 always has it.
+ if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)
+ return true;
+
+  // Use it on newer versions of OS X.
+ if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
+ return true;
+
+ // And the iOS simulator.
+ if (T.isiOS() &&
+ (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86))
+ return true;
+
+ return false;
+}
+
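// Aside: representative triples against the predicate above, assuming it
// were exposed for testing (useCompactUnwind itself is file-static).
#include "llvm/ADT/Triple.h"
#include <cassert>

bool useCompactUnwindForTest(const llvm::Triple &T); // Hypothetical hook.

void checkCompactUnwindTriples() {
  // Modern OS X has compact unwind.
  assert(useCompactUnwindForTest(llvm::Triple("x86_64-apple-macosx10.9")));
  // arm64 Darwin always has it.
  assert(useCompactUnwindForTest(llvm::Triple("arm64-apple-ios7.0")));
  // Non-Darwin targets never do.
  assert(!useCompactUnwindForTest(llvm::Triple("x86_64-unknown-linux-gnu")));
}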
void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
// MachO
- IsFunctionEHFrameSymbolPrivate = false;
SupportsWeakOmittedEHFrame = false;
if (T.isOSDarwin() &&
@@ -151,13 +171,10 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
COFFDebugSymbolsSection = nullptr;
- if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) ||
- (T.isOSDarwin() &&
- (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) {
+ if (useCompactUnwind(T)) {
CompactUnwindSection =
- Ctx->getMachOSection("__LD", "__compact_unwind",
- MachO::S_ATTR_DEBUG,
- SectionKind::getReadOnly());
+ Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG,
+ SectionKind::getReadOnly());
if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86)
CompactUnwindDwarfEHFrameOnly = 0x04000000;
@@ -321,6 +338,13 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
TTypeEncoding = dwarf::DW_EH_PE_absptr;
}
break;
+ case Triple::mips:
+ case Triple::mipsel:
+    // MIPS uses an indirect pointer to refer to personality functions, so
+    // that the eh_frame section can be read-only. A DW.ref.personality
+    // symbol will be generated for the relocation.
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect;
+ break;
case Triple::ppc64:
case Triple::ppc64le:
PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
@@ -562,6 +586,8 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
+ bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb;
+
// The object file format cannot represent common symbols with explicit
// alignments.
CommDirectiveSupportsAlignment = false;
@@ -575,6 +601,8 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
SectionKind::getBSS());
TextSection =
Ctx->getCOFFSection(".text",
+ (IsWoA ? COFF::IMAGE_SCN_MEM_16BIT
+ : (COFF::SectionCharacteristics)0) |
COFF::IMAGE_SCN_CNT_CODE |
COFF::IMAGE_SCN_MEM_EXECUTE |
COFF::IMAGE_SCN_MEM_READ,
@@ -590,12 +618,18 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getReadOnly());
- if (T.isKnownWindowsMSVCEnvironment()) {
+
+ if (T.isKnownWindowsMSVCEnvironment() || T.isWindowsItaniumEnvironment()) {
StaticCtorSection =
Ctx->getCOFFSection(".CRT$XCU",
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getReadOnly());
+ StaticDtorSection =
+ Ctx->getCOFFSection(".CRT$XTX",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
} else {
StaticCtorSection =
Ctx->getCOFFSection(".ctors",
@@ -603,16 +637,6 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
COFF::IMAGE_SCN_MEM_READ |
COFF::IMAGE_SCN_MEM_WRITE,
SectionKind::getDataRel());
- }
-
-
- if (T.isKnownWindowsMSVCEnvironment()) {
- StaticDtorSection =
- Ctx->getCOFFSection(".CRT$XTX",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getReadOnly());
- } else {
StaticDtorSection =
Ctx->getCOFFSection(".dtors",
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -625,11 +649,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
// though it contains relocatable pointers. In PIC mode, this is probably a
// big runtime hit for C++ apps. Either the contents of the LSDA need to be
// adjusted or this should be a data section.
- LSDASection =
- Ctx->getCOFFSection(".gcc_except_table",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getReadOnly());
+ assert(T.isOSWindows() && "Windows is the only supported COFF target");
+ if (T.getArch() == Triple::x86_64) {
+    // On Windows 64 with SEH, the LSDA is emitted into the .xdata section.
+    LSDASection = nullptr;
+ } else {
+ LSDASection = Ctx->getCOFFSection(".gcc_except_table",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ }
// Debug info.
COFFDebugSymbolsSection =
@@ -705,36 +734,46 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfInfoDWOSection =
- Ctx->getCOFFSection(".debug_info.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_info.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfAbbrevDWOSection =
- Ctx->getCOFFSection(".debug_abbrev.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_abbrev.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfStrDWOSection =
- Ctx->getCOFFSection(".debug_str.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_str.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfLineDWOSection =
- Ctx->getCOFFSection(".debug_line.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_line.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfLocDWOSection =
- Ctx->getCOFFSection(".debug_loc.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_loc.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
DwarfStrOffDWOSection =
- Ctx->getCOFFSection(".debug_str_offsets.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
+ Ctx->getCOFFSection(".debug_str_offsets.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
- DwarfAddrSection = Ctx->getCOFFSection(
- ".debug_addr", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
+
+ DwarfAddrSection =
+ Ctx->getCOFFSection(".debug_addr",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
DrectveSection =
Ctx->getCOFFSection(".drectve",
- COFF::IMAGE_SCN_LNK_INFO | COFF::IMAGE_SCN_LNK_REMOVE,
+ COFF::IMAGE_SCN_LNK_INFO |
+ COFF::IMAGE_SCN_LNK_REMOVE,
SectionKind::getMetadata());
PDataSection =
@@ -748,6 +787,7 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getDataRel());
+
TLSDataSection =
Ctx->getCOFFSection(".tls$",
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -756,7 +796,7 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
SectionKind::getDataRel());
}
-void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
+void MCObjectFileInfo::InitMCObjectFileInfo(StringRef T, Reloc::Model relocm,
CodeModel::Model cm,
MCContext &ctx) {
RelocM = relocm;
@@ -766,7 +806,6 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
// Common.
CommDirectiveSupportsAlignment = true;
SupportsWeakOmittedEHFrame = true;
- IsFunctionEHFrameSymbolPrivate = true;
SupportsCompactUnwindWithoutEHFrame = false;
PersonalityEncoding = LSDAEncoding = FDECFIEncoding = TTypeEncoding =
@@ -781,8 +820,9 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
DwarfAccelNamespaceSection = nullptr; // Used only by selected targets.
DwarfAccelTypesSection = nullptr; // Used only by selected targets.
- Triple T(TT);
- Triple::ArchType Arch = T.getArch();
+ TT = Triple(T);
+
+ Triple::ArchType Arch = TT.getArch();
// FIXME: Checking for Arch here to filter out bogus triples such as
// cellspu-apple-darwin. Perhaps we should fix in Triple?
if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
@@ -790,17 +830,17 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
Arch == Triple::arm64 || Arch == Triple::aarch64 ||
Arch == Triple::ppc || Arch == Triple::ppc64 ||
Arch == Triple::UnknownArch) &&
- (T.isOSDarwin() || T.isOSBinFormatMachO())) {
+ (TT.isOSDarwin() || TT.isOSBinFormatMachO())) {
Env = IsMachO;
- InitMachOMCObjectFileInfo(T);
+ InitMachOMCObjectFileInfo(TT);
} else if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
Arch == Triple::arm || Arch == Triple::thumb) &&
- (T.isOSWindows() && T.getObjectFormat() == Triple::COFF)) {
+ (TT.isOSWindows() && TT.getObjectFormat() == Triple::COFF)) {
Env = IsCOFF;
- InitCOFFMCObjectFileInfo(T);
+ InitCOFFMCObjectFileInfo(TT);
} else {
Env = IsELF;
- InitELFMCObjectFileInfo(T);
+ InitELFMCObjectFileInfo(TT);
}
}
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index a1aa602..a721b59 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -83,32 +83,8 @@ MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const {
return F;
}
-const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- cast<MCTargetExpr>(Value)->AddValueSymbols(Assembler);
- break;
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbols(BE->getLHS());
- AddValueSymbols(BE->getRHS());
- break;
- }
-
- case MCExpr::SymbolRef:
- Assembler->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
- break;
- }
-
- return Value;
+void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
+ Assembler->getOrCreateSymbolData(Sym);
}
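// Aside: the shape of the visitor that replaces the deleted walk. The
// recursion now lives in the MCStreamer base class (not shown in this hunk),
// which calls visitUsedSymbol once per symbol reference; a toy expression
// tree stands in for MCExpr here.
#include <memory>
#include <vector>

struct Expr {
  enum Kind { Constant, SymbolRef, Binary, Unary } K;
  const char *Symbol = nullptr;                // Set when K == SymbolRef.
  std::vector<std::unique_ptr<Expr>> Children; // Set for Binary/Unary.
};

template <typename Fn> void visitUsedExpr(const Expr &E, Fn VisitSymbol) {
  if (E.K == Expr::SymbolRef)
    VisitSymbol(E.Symbol); // The only per-subclass hook that remains.
  for (const auto &Child : E.Children)
    visitUsedExpr(*Child, VisitSymbol);
}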
void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) {
@@ -119,13 +95,14 @@ void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) {
void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
const SMLoc &Loc) {
+ MCStreamer::EmitValueImpl(Value, Size, Loc);
MCDataFragment *DF = getOrCreateDataFragment();
MCLineEntry::Make(this, getCurrentSection().first);
// Avoid fixups when possible.
int64_t AbsValue;
- if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (Value->EvaluateAsAbsolute(AbsValue, getAssembler())) {
EmitIntValue(AbsValue, Size);
return;
}
@@ -136,11 +113,14 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
}
void MCObjectStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
- RecordProcStart(Frame);
+ // We need to create a local symbol to avoid relocations.
+ Frame.Begin = getContext().CreateTempSymbol();
+ EmitLabel(Frame.Begin);
}
void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
- RecordProcEnd(Frame);
+ Frame.End = getContext().CreateTempSymbol();
+ EmitLabel(Frame.End);
}
void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
@@ -158,10 +138,6 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
SD.setOffset(F->getContents().size());
}
-void MCObjectStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- EmitLabel(Symbol);
-}
-
void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) {
@@ -205,15 +181,12 @@ void MCObjectStreamer::ChangeSection(const MCSection *Section,
void MCObjectStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
getAssembler().getOrCreateSymbolData(*Symbol);
- AddValueSymbols(Value);
MCStreamer::EmitAssignment(Symbol, Value);
}
-void MCObjectStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
- // Scan for values.
- for (unsigned i = Inst.getNumOperands(); i--; )
- if (Inst.getOperand(i).isExpr())
- AddValueSymbols(Inst.getOperand(i).getExpr());
+void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ MCStreamer::EmitInstruction(Inst, STI);
MCSectionData *SD = getCurrentSectionData();
SD->setHasInstructions(true);
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index bca516e..145ad4a 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -22,7 +22,6 @@
using namespace llvm;
AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
- CurBuf = nullptr;
CurPtr = nullptr;
isAtStartOfLine = true;
AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
@@ -31,13 +30,13 @@ AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
AsmLexer::~AsmLexer() {
}
-void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
- CurBuf = buf;
+void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
+ CurBuf = Buf;
if (ptr)
CurPtr = ptr;
else
- CurPtr = CurBuf->getBufferStart();
+ CurPtr = CurBuf.begin();
TokStart = nullptr;
}
@@ -58,7 +57,7 @@ int AsmLexer::getNextChar() {
case 0:
// A nul character in the stream is either the end of the current buffer or
// a random nul in the file. Disambiguate that here.
- if (CurPtr-1 != CurBuf->getBufferEnd())
+ if (CurPtr - 1 != CurBuf.end())
return 0; // Just whitespace.
// Otherwise, return end of file.
@@ -201,8 +200,8 @@ AsmToken AsmLexer::LexLineComment() {
CurChar = getNextChar();
if (CurChar == EOF)
- return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
- return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
+ return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
+ return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
}
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
@@ -420,9 +419,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
while (!isAtStartOfComment(*CurPtr) && // Start of line comment.
!isAtStatementSeparator(CurPtr) && // End of statement marker.
- *CurPtr != '\n' &&
- *CurPtr != '\r' &&
- (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
+ *CurPtr != '\n' && *CurPtr != '\r' &&
+ (*CurPtr != 0 || CurPtr != CurBuf.end())) {
++CurPtr;
}
return StringRef(TokStart, CurPtr-TokStart);
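// Aside: why the StringRef switch preserves the nul trick — the backing
// buffer keeps a terminating nul at end() (MemoryBuffer guarantees one, and
// std::string::data() does the same in this reduced example), so a nul is
// end-of-input only when it sits exactly at end().
#include <cassert>
#include <string>

int main() {
  std::string Buf("mov\0r0", 6); // Embedded nul mid-buffer, terminator at [6].
  const char *End = Buf.data() + Buf.size(); // StringRef::end() equivalent.
  const char *Ptr = Buf.data();
  int EmbeddedNuls = 0;
  for (;;) {
    char C = *Ptr++;
    if (C != '\0')
      continue;
    if (Ptr - 1 == End)
      break;        // The real end of the buffer.
    ++EmbeddedNuls; // A stray nul in the input: treated as whitespace.
  }
  assert(EmbeddedNuls == 1);
}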
@@ -431,9 +429,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
StringRef AsmLexer::LexUntilEndOfLine() {
TokStart = CurPtr;
- while (*CurPtr != '\n' &&
- *CurPtr != '\r' &&
- (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
+ while (*CurPtr != '\n' && *CurPtr != '\r' &&
+ (*CurPtr != 0 || CurPtr != CurBuf.end())) {
++CurPtr;
}
return StringRef(TokStart, CurPtr-TokStart);
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 168597f..62ab4a5 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -102,7 +102,7 @@ public:
struct ParseStatementInfo {
/// \brief The parsed operands from the last parsed statement.
- SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
/// \brief The opcode from the last parsed instruction.
unsigned Opcode;
@@ -115,13 +115,6 @@ struct ParseStatementInfo {
ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {}
ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
: Opcode(~0), ParseError(false), AsmRewrites(rewrites) {}
-
- ~ParseStatementInfo() {
- // Free any parsed operands.
- for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i)
- delete ParsedOperands[i];
- ParsedOperands.clear();
- }
};
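// Aside: the ownership change in miniature — with unique_ptr elements the
// hand-written destructor above is no longer needed. Operand is a
// hypothetical stand-in for MCParsedAsmOperand.
#include <memory>
#include <vector>

struct Operand {};

struct ParseStatementInfoSketch {
  std::vector<std::unique_ptr<Operand>> ParsedOperands; // Owning container.
};

int main() {
  ParseStatementInfoSketch Info;
  Info.ParsedOperands.push_back(std::make_unique<Operand>());
} // Operands are destroyed here automatically; no delete loop required.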
/// \brief The concrete assembly parser instance.
@@ -140,7 +133,7 @@ private:
/// This is the current buffer index we're lexing from as managed by the
/// SourceMgr object.
- int CurBuffer;
+ unsigned CurBuffer;
AsmCond TheCondState;
std::vector<AsmCond> TheCondStack;
@@ -169,13 +162,13 @@ private:
StringRef CppHashFilename;
int64_t CppHashLineNumber;
SMLoc CppHashLoc;
- int CppHashBuf;
+ unsigned CppHashBuf;
/// When generating dwarf for assembly source files we need to calculate the
/// logical line number based on the last parsed cpp hash file line comment
/// and current line. Since this is slow and messes up the SourceMgr's
/// cache we save the last info we queried with SrcMgr.FindLineNumber().
SMLoc LastQueryIDLoc;
- int LastQueryBuffer;
+ unsigned LastQueryBuffer;
unsigned LastQueryLine;
/// AssemblerDialect. ~0U means unset; use the value provided by MAI.
@@ -317,9 +310,9 @@ private:
/// current token is not set; clients should ensure Lex() is called
/// subsequently.
///
- /// \param InBuffer If not -1, should be the known buffer id that contains the
+ /// \param InBuffer If not 0, should be the known buffer id that contains the
/// location.
- void jumpToLoc(SMLoc Loc, int InBuffer=-1);
+ void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);
/// \brief Parse up to the end of statement and a return the contents from the
/// current token until the end of the statement; the current token on exit
@@ -352,8 +345,9 @@ private:
DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
DK_INCLUDE, DK_INCBIN, DK_CODE16, DK_CODE16GCC, DK_REPT, DK_IRP, DK_IRPC,
- DK_IF, DK_IFNE, DK_IFB, DK_IFNB, DK_IFC, DK_IFEQS, DK_IFNC, DK_IFDEF,
- DK_IFNDEF, DK_IFNOTDEF, DK_ELSEIF, DK_ELSE, DK_ENDIF,
+ DK_IF, DK_IFEQ, DK_IFGE, DK_IFGT, DK_IFLE, DK_IFLT, DK_IFNE, DK_IFB,
+ DK_IFNB, DK_IFC, DK_IFEQS, DK_IFNC, DK_IFDEF, DK_IFNDEF, DK_IFNOTDEF,
+ DK_ELSEIF, DK_ELSE, DK_ENDIF,
DK_SPACE, DK_SKIP, DK_FILE, DK_LINE, DK_LOC, DK_STABS,
DK_CFI_SECTIONS, DK_CFI_STARTPROC, DK_CFI_ENDPROC, DK_CFI_DEF_CFA,
DK_CFI_DEF_CFA_OFFSET, DK_CFI_ADJUST_CFA_OFFSET, DK_CFI_DEF_CFA_REGISTER,
@@ -440,8 +434,8 @@ private:
bool parseDirectiveInclude(); // ".include"
bool parseDirectiveIncbin(); // ".incbin"
- // ".if" or ".ifne"
- bool parseDirectiveIf(SMLoc DirectiveLoc);
+ // ".if", ".ifeq", ".ifge", ".ifgt" , ".ifle", ".iflt" or ".ifne"
+ bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
// ".ifb" or ".ifnb", depending on ExpectBlank.
bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
// ".ifc" or ".ifnc", depending on ExpectEqual.
@@ -497,15 +491,15 @@ enum { DEFAULT_ADDRSPACE = 0 };
AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out,
const MCAsmInfo &_MAI)
: Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
- PlatformParser(nullptr), CurBuffer(0), MacrosEnabledFlag(true),
- HadError(false), CppHashLineNumber(0), AssemblerDialect(~0U),
- IsDarwin(false), ParsingInlineAsm(false) {
+ PlatformParser(nullptr), CurBuffer(_SM.getMainFileID()),
+ MacrosEnabledFlag(true), HadError(false), CppHashLineNumber(0),
+ AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
SavedDiagContext = SrcMgr.getDiagContext();
// Set our own handler which calls the saved handler.
SrcMgr.setDiagHandler(DiagHandler, this);
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
// Initialize the platform / file format parser.
switch (_Ctx.getObjectFileInfo()->getObjectFileType()) {
@@ -572,14 +566,13 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
bool AsmParser::enterIncludeFile(const std::string &Filename) {
std::string IncludedFile;
- int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
- if (NewBuf == -1)
+ unsigned NewBuf =
+ SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
+ if (!NewBuf)
return true;
CurBuffer = NewBuf;
-
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
-
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
return false;
}
@@ -588,8 +581,9 @@ bool AsmParser::enterIncludeFile(const std::string &Filename) {
/// returns true on failure.
bool AsmParser::processIncbinFile(const std::string &Filename) {
std::string IncludedFile;
- int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
- if (NewBuf == -1)
+ unsigned NewBuf =
+ SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
+ if (!NewBuf)
return true;
// Pick up the bytes from the file and emit them.
@@ -597,13 +591,10 @@ bool AsmParser::processIncbinFile(const std::string &Filename) {
return false;
}
-void AsmParser::jumpToLoc(SMLoc Loc, int InBuffer) {
- if (InBuffer != -1) {
- CurBuffer = InBuffer;
- } else {
- CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);
- }
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer), Loc.getPointer());
+void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
+ CurBuffer = InBuffer ? InBuffer : SrcMgr.FindBufferContainingLoc(Loc);
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(),
+ Loc.getPointer());
}
const AsmToken &AsmParser::Lex() {
@@ -639,10 +630,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// If we are generating dwarf for assembly source files save the initial text
// section and generate a .file directive.
if (getContext().getGenDwarfForAssembly()) {
- getContext().setGenDwarfSection(getStreamer().getCurrentSection().first);
MCSymbol *SectionStartSym = getContext().CreateTempSymbol();
getStreamer().EmitLabel(SectionStartSym);
- getContext().setGenDwarfSectionStartSym(SectionStartSym);
+ auto InsertResult = getContext().addGenDwarfSection(
+ getStreamer().getCurrentSection().first);
+    assert(InsertResult.second &&
+           ".text section should not have debug info yet");
+ InsertResult.first->second.first = SectionStartSym;
getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
0, StringRef(), getContext().getMainFileName()));
}
@@ -818,7 +811,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
// Parse symbol variant
std::pair<StringRef, StringRef> Split;
if (!MAI.useParensForSymbolVariant()) {
- Split = Identifier.split('@');
+ if (FirstTokenKind == AsmToken::String) {
+ if (Lexer.is(AsmToken::At)) {
+ Lexer.Lex(); // eat @
+ SMLoc AtLoc = getLexer().getLoc();
+ StringRef VName;
+ if (parseIdentifier(VName))
+ return Error(AtLoc, "expected symbol variant after '@'");
+
+ Split = std::make_pair(Identifier, VName);
+ }
+ } else {
+ Split = Identifier.split('@');
+ }
} else if (Lexer.is(AsmToken::LParen)) {
Lexer.Lex(); // eat (
StringRef VName;
@@ -1236,8 +1241,13 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info) {
default:
break;
case DK_IF:
+ case DK_IFEQ:
+ case DK_IFGE:
+ case DK_IFGT:
+ case DK_IFLE:
+ case DK_IFLT:
case DK_IFNE:
- return parseDirectiveIf(IDLoc);
+ return parseDirectiveIf(IDLoc, DirKind);
case DK_IFB:
return parseDirectiveIfb(IDLoc, true);
case DK_IFNB:
@@ -1581,12 +1591,11 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info) {
printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
}
- // If we are generating dwarf for assembly source files and the current
- // section is the initial text section then generate a .loc directive for
- // the instruction.
+ // If we are generating dwarf for the current section then generate a .loc
+ // directive for the instruction.
if (!HadError && getContext().getGenDwarfForAssembly() &&
- getContext().getGenDwarfSection() ==
- getStreamer().getCurrentSection().first) {
+ getContext().getGenDwarfSectionSyms().count(
+ getStreamer().getCurrentSection().first)) {
unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
@@ -1685,13 +1694,15 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
const SMLoc &DiagLoc = Diag.getLoc();
- int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
- int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
+ unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ unsigned CppHashBuf =
+ Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
// Like SourceMgr::printMessage() we need to print the include stack if any
// before printing the message.
- int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
- if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
+ unsigned DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ if (!Parser->SavedDiagHandler && DiagCurBuffer &&
+ DiagCurBuffer != DiagSrcMgr.getMainFileID()) {
SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
}
@@ -2018,7 +2029,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
break;
if (FAI >= NParameters) {
- assert(M && "expected macro to be defined");
+ assert(M && "expected macro to be defined");
Error(IDLoc,
"parameter named '" + FA.Name + "' does not exist for macro '" +
M->Name + "'");
@@ -2117,7 +2128,7 @@ bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
// Jump to the macro instantiation and prime the lexer.
CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
Lex();
return false;
@@ -3799,9 +3810,8 @@ bool AsmParser::parseDirectiveIncbin() {
}
/// parseDirectiveIf
-/// ::= .if expression
-/// ::= .ifne expression
-bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) {
+/// ::= .if{,eq,ge,gt,le,lt,ne} expression
+bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
if (TheCondState.Ignore) {
@@ -3816,6 +3826,29 @@ bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) {
Lex();
+ switch (DirKind) {
+ default:
+ llvm_unreachable("unsupported directive");
+ case DK_IF:
+ case DK_IFNE:
+ break;
+ case DK_IFEQ:
+ ExprValue = ExprValue == 0;
+ break;
+ case DK_IFGE:
+ ExprValue = ExprValue >= 0;
+ break;
+ case DK_IFGT:
+ ExprValue = ExprValue > 0;
+ break;
+ case DK_IFLE:
+ ExprValue = ExprValue <= 0;
+ break;
+ case DK_IFLT:
+ ExprValue = ExprValue < 0;
+ break;
+ }
+
TheCondState.CondMet = ExprValue;
TheCondState.Ignore = !TheCondState.CondMet;
}
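// Aside: the new directive family restated as a pure function — each variant
// is just a comparison of the parsed expression against zero. DirectiveKind
// is a hypothetical local copy of the parser's enum values used here.
#include <cassert>
#include <cstdint>

enum DirectiveKind { DK_IF, DK_IFEQ, DK_IFGE, DK_IFGT, DK_IFLE, DK_IFLT,
                     DK_IFNE };

bool condMet(DirectiveKind Kind, int64_t ExprValue) {
  switch (Kind) {
  case DK_IF:
  case DK_IFNE: return ExprValue != 0; // .if and .ifne assemble on nonzero.
  case DK_IFEQ: return ExprValue == 0;
  case DK_IFGE: return ExprValue >= 0;
  case DK_IFGT: return ExprValue > 0;
  case DK_IFLE: return ExprValue <= 0;
  case DK_IFLT: return ExprValue < 0;
  }
  return false;
}

int main() {
  assert(condMet(DK_IFGE, 0));  // .ifge 0 assembles its body.
  assert(!condMet(DK_IFGT, 0)); // .ifgt 0 does not.
  assert(condMet(DK_IFNE, -5)); // .ifne accepts any nonzero value.
}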
@@ -4118,6 +4151,11 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK;
DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK;
DirectiveKindMap[".if"] = DK_IF;
+ DirectiveKindMap[".ifeq"] = DK_IFEQ;
+ DirectiveKindMap[".ifge"] = DK_IFGE;
+ DirectiveKindMap[".ifgt"] = DK_IFGT;
+ DirectiveKindMap[".ifle"] = DK_IFLE;
+ DirectiveKindMap[".iflt"] = DK_IFLT;
DirectiveKindMap[".ifne"] = DK_IFNE;
DirectiveKindMap[".ifb"] = DK_IFB;
DirectiveKindMap[".ifnb"] = DK_IFNB;
@@ -4227,7 +4265,7 @@ void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
// Jump to the macro instantiation and prime the lexer.
CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
Lex();
}
@@ -4465,27 +4503,27 @@ bool AsmParser::parseMSInlineAsm(
// Build the list of clobbers, outputs and inputs.
for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
- MCParsedAsmOperand *Operand = Info.ParsedOperands[i];
+ MCParsedAsmOperand &Operand = *Info.ParsedOperands[i];
// Immediate.
- if (Operand->isImm())
+ if (Operand.isImm())
continue;
// Register operand.
- if (Operand->isReg() && !Operand->needAddressOf()) {
+ if (Operand.isReg() && !Operand.needAddressOf()) {
unsigned NumDefs = Desc.getNumDefs();
// Clobber.
- if (NumDefs && Operand->getMCOperandNum() < NumDefs)
- ClobberRegs.push_back(Operand->getReg());
+ if (NumDefs && Operand.getMCOperandNum() < NumDefs)
+ ClobberRegs.push_back(Operand.getReg());
continue;
}
// Expr/Input or Output.
- StringRef SymName = Operand->getSymName();
+ StringRef SymName = Operand.getSymName();
if (SymName.empty())
continue;
- void *OpDecl = Operand->getOpDecl();
+ void *OpDecl = Operand.getOpDecl();
if (!OpDecl)
continue;
@@ -4494,21 +4532,21 @@ bool AsmParser::parseMSInlineAsm(
if (isOutput) {
++InputIdx;
OutputDecls.push_back(OpDecl);
- OutputDeclsAddressOf.push_back(Operand->needAddressOf());
- OutputConstraints.push_back('=' + Operand->getConstraint().str());
+ OutputDeclsAddressOf.push_back(Operand.needAddressOf());
+ OutputConstraints.push_back('=' + Operand.getConstraint().str());
AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Start, SymName.size()));
} else {
InputDecls.push_back(OpDecl);
- InputDeclsAddressOf.push_back(Operand->needAddressOf());
- InputConstraints.push_back(Operand->getConstraint().str());
+ InputDeclsAddressOf.push_back(Operand.needAddressOf());
+ InputConstraints.push_back(Operand.getConstraint().str());
AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Start, SymName.size()));
}
}
// Consider implicit defs to be clobbers. Think of cpuid and push.
- const uint16_t *ImpDefs = Desc.getImplicitDefs();
- for (unsigned I = 0, E = Desc.getNumImplicitDefs(); I != E; ++I)
- ClobberRegs.push_back(ImpDefs[I]);
+ ArrayRef<uint16_t> ImpDefs(Desc.getImplicitDefs(),
+ Desc.getNumImplicitDefs());
+ ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end());
}
// Set the number of Outputs and Inputs.
@@ -4543,27 +4581,26 @@ bool AsmParser::parseMSInlineAsm(
// Build the IR assembly string.
std::string AsmStringIR;
raw_string_ostream OS(AsmStringIR);
- const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart();
- const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
+ StringRef ASMString =
+ SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer();
+ const char *AsmStart = ASMString.begin();
+ const char *AsmEnd = ASMString.end();
array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
- for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(),
- E = AsmStrRewrites.end();
- I != E; ++I) {
- AsmRewriteKind Kind = (*I).Kind;
+ for (const AsmRewrite &AR : AsmStrRewrites) {
+ AsmRewriteKind Kind = AR.Kind;
if (Kind == AOK_Delete)
continue;
- const char *Loc = (*I).Loc.getPointer();
+ const char *Loc = AR.Loc.getPointer();
assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");
// Emit everything up to the immediate/expression.
- unsigned Len = Loc - AsmStart;
- if (Len)
+ if (unsigned Len = Loc - AsmStart)
OS << StringRef(AsmStart, Len);
// Skip the original expression.
if (Kind == AOK_Skip) {
- AsmStart = Loc + (*I).Len;
+ AsmStart = Loc + AR.Len;
continue;
}
@@ -4573,7 +4610,7 @@ bool AsmParser::parseMSInlineAsm(
default:
break;
case AOK_Imm:
- OS << "$$" << (*I).Val;
+ OS << "$$" << AR.Val;
break;
case AOK_ImmPrefix:
OS << "$$";
@@ -4585,7 +4622,7 @@ bool AsmParser::parseMSInlineAsm(
OS << '$' << OutputIdx++;
break;
case AOK_SizeDirective:
- switch ((*I).Val) {
+ switch (AR.Val) {
default: break;
case 8: OS << "byte ptr "; break;
case 16: OS << "word ptr "; break;
@@ -4600,7 +4637,7 @@ bool AsmParser::parseMSInlineAsm(
OS << ".byte";
break;
case AOK_Align: {
- unsigned Val = (*I).Val;
+ unsigned Val = AR.Val;
OS << ".align " << Val;
// Skip the original immediate.
@@ -4613,12 +4650,12 @@ bool AsmParser::parseMSInlineAsm(
OS.flush();
if (AsmStringIR.back() != '.')
OS << '.';
- OS << (*I).Val;
+ OS << AR.Val;
break;
}
// Skip the original expression.
- AsmStart = Loc + (*I).Len + AdditionalSkip;
+ AsmStart = Loc + AR.Len + AdditionalSkip;
}
// Emit the remainder of the asm string.
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index decf01c..5ecf9e5 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -13,6 +13,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
@@ -37,7 +38,7 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
SectionKind Kind, StringRef COMDATSymName,
- COFF::COMDATType Type, const MCSectionCOFF *Assoc);
+ COFF::COMDATType Type);
bool ParseSectionName(StringRef &SectionName);
bool ParseSectionFlags(StringRef FlagsString, unsigned* Flags);
@@ -117,8 +118,7 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseDirectiveEndef(StringRef, SMLoc);
bool ParseDirectiveSecRel32(StringRef, SMLoc);
bool ParseDirectiveSecIdx(StringRef, SMLoc);
- bool parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
- const MCSectionCOFF *&Assoc);
+ bool parseCOMDATType(COFF::COMDATType &Type);
bool ParseDirectiveLinkOnce(StringRef, SMLoc);
// Win64 EH directives.
@@ -170,8 +170,8 @@ bool COFFAsmParser::ParseSectionFlags(StringRef FlagsString, unsigned* Flags) {
bool ReadOnlyRemoved = false;
unsigned SecFlags = None;
- for (unsigned i = 0; i < FlagsString.size(); ++i) {
- switch (FlagsString[i]) {
+ for (char FlagChar : FlagsString) {
+ switch (FlagChar) {
case 'a':
// Ignored.
break;
@@ -292,22 +292,20 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
unsigned Characteristics,
SectionKind Kind) {
- return ParseSectionSwitch(Section, Characteristics, Kind, "",
- COFF::IMAGE_COMDAT_SELECT_ANY, nullptr);
+  return ParseSectionSwitch(Section, Characteristics, Kind, "",
+                            (COFF::COMDATType)0);
}
bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
unsigned Characteristics,
SectionKind Kind,
StringRef COMDATSymName,
- COFF::COMDATType Type,
- const MCSectionCOFF *Assoc) {
+ COFF::COMDATType Type) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in section switching directive");
Lex();
getStreamer().SwitchSection(getContext().getCOFFSection(
- Section, Characteristics, Kind, COMDATSymName, Type, Assoc));
+ Section, Characteristics, Kind, COMDATSymName, Type));
return false;
}
@@ -358,15 +356,15 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
return true;
}
- COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
- const MCSectionCOFF *Assoc = nullptr;
+ COFF::COMDATType Type = (COFF::COMDATType)0;
StringRef COMDATSymName;
if (getLexer().is(AsmToken::Comma)) {
+    Type = COFF::IMAGE_COMDAT_SELECT_ANY;
Lex();
Flags |= COFF::IMAGE_SCN_LNK_COMDAT;
- if (parseCOMDATTypeAndAssoc(Type, Assoc))
+ if (parseCOMDATType(Type))
return true;
if (getLexer().isNot(AsmToken::Comma))
@@ -381,7 +379,12 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
return TokError("unexpected token in directive");
SectionKind Kind = computeSectionKind(Flags);
- ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type, Assoc);
+ if (Kind.isText()) {
+ const Triple &T = getContext().getObjectFileInfo()->getTargetTriple();
+ if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb)
+ Flags |= COFF::IMAGE_SCN_MEM_16BIT;
+ }
+ ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type);
return false;
}
@@ -461,9 +464,8 @@ bool COFFAsmParser::ParseDirectiveSecIdx(StringRef, SMLoc) {
return false;
}
-/// ::= [ identifier [ identifier ] ]
-bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
- const MCSectionCOFF *&Assoc) {
+/// ::= [ identifier ]
+bool COFFAsmParser::parseCOMDATType(COFF::COMDATType &Type) {
StringRef TypeId = getTok().getIdentifier();
Type = StringSwitch<COFF::COMDATType>(TypeId)
@@ -481,48 +483,28 @@ bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
Lex();
- if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
- SMLoc Loc = getTok().getLoc();
- StringRef AssocName;
- if (ParseSectionName(AssocName))
- return TokError("expected associated section name");
-
- Assoc = static_cast<const MCSectionCOFF*>(
- getContext().getCOFFSection(AssocName));
- if (!Assoc)
- return Error(Loc, "cannot associate unknown section '" + AssocName + "'");
- if (!(Assoc->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT))
- return Error(Loc, "associated section must be a COMDAT section");
- if (Assoc->getSelection() == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
- return Error(Loc, "associated section cannot be itself associative");
- }
-
return false;
}
/// ParseDirectiveLinkOnce
-/// ::= .linkonce [ identifier [ identifier ] ]
+/// ::= .linkonce [ identifier ]
bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
- const MCSectionCOFF *Assoc = nullptr;
if (getLexer().is(AsmToken::Identifier))
- if (parseCOMDATTypeAndAssoc(Type, Assoc))
+ if (parseCOMDATType(Type))
return true;
const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
getStreamer().getCurrentSection().first);
-
- if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
- if (Assoc == Current)
- return Error(Loc, "cannot associate a section with itself");
- }
+ if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ return Error(Loc, "cannot make section associative with .linkonce");
if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
return Error(Loc, Twine("section '") + Current->getSectionName() +
"' is already linkonce");
- Current->setSelection(Type, Assoc);
+ Current->setSelection(Type);
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
@@ -541,25 +523,25 @@ bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) {
MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID);
Lex();
- getStreamer().EmitWin64EHStartProc(Symbol);
+ getStreamer().EmitWinCFIStartProc(Symbol);
return false;
}
bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) {
Lex();
- getStreamer().EmitWin64EHEndProc();
+ getStreamer().EmitWinCFIEndProc();
return false;
}
bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) {
Lex();
- getStreamer().EmitWin64EHStartChained();
+ getStreamer().EmitWinCFIStartChained();
return false;
}
bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) {
Lex();
- getStreamer().EmitWin64EHEndChained();
+ getStreamer().EmitWinCFIEndChained();
return false;
}
@@ -585,13 +567,13 @@ bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) {
MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID);
Lex();
- getStreamer().EmitWin64EHHandler(handler, unwind, except);
+ getStreamer().EmitWinEHHandler(handler, unwind, except);
return false;
}
bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) {
Lex();
- getStreamer().EmitWin64EHHandlerData();
+ getStreamer().EmitWinEHHandlerData();
return false;
}
@@ -604,7 +586,7 @@ bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
return TokError("unexpected token in directive");
Lex();
- getStreamer().EmitWin64EHPushReg(Reg);
+ getStreamer().EmitWinCFIPushReg(Reg);
return false;
}
@@ -628,7 +610,7 @@ bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) {
return TokError("unexpected token in directive");
Lex();
- getStreamer().EmitWin64EHSetFrame(Reg, Off);
+ getStreamer().EmitWinCFISetFrame(Reg, Off);
return false;
}
@@ -645,7 +627,7 @@ bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) {
return TokError("unexpected token in directive");
Lex();
- getStreamer().EmitWin64EHAllocStack(Size);
+ getStreamer().EmitWinCFIAllocStack(Size);
return false;
}
@@ -670,7 +652,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
Lex();
// FIXME: Err on %xmm* registers
- getStreamer().EmitWin64EHSaveReg(Reg, Off);
+ getStreamer().EmitWinCFISaveReg(Reg, Off);
return false;
}
@@ -697,7 +679,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) {
Lex();
// FIXME: Err on non-%xmm* registers
- getStreamer().EmitWin64EHSaveXMM(Reg, Off);
+ getStreamer().EmitWinCFISaveXMM(Reg, Off);
return false;
}
@@ -718,13 +700,13 @@ bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) {
return TokError("unexpected token in directive");
Lex();
- getStreamer().EmitWin64EHPushFrame(Code);
+ getStreamer().EmitWinCFIPushFrame(Code);
return false;
}
bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) {
Lex();
- getStreamer().EmitWin64EHEndProlog();
+ getStreamer().EmitWinCFIEndProlog();
return false;
}
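
All of the renames above are mechanical: the Win64-specific EmitWin64EH* hooks become the target-neutral EmitWinCFI*/EmitWinEH* family. As a sketch, a toy prologue's unwind info would now be described like this (the register number and sizes are illustrative; offsets must respect the alignment checks enforced in MCStreamer later in this patch):

    // Sketch: driving the renamed WinCFI hooks for a toy prologue.
    void emitToyUnwindInfo(MCStreamer &S, MCSymbol *Fn) {
      S.EmitWinCFIStartProc(Fn);
      S.EmitWinCFIPushReg(5);       // illustrative register number
      S.EmitWinCFISetFrame(5, 0);   // offset must be 16-byte aligned, <= 240
      S.EmitWinCFIAllocStack(40);   // must be non-zero and 8-byte aligned
      S.EmitWinCFIEndProlog();
      // ... function body ...
      S.EmitWinCFIEndProc();
    }
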
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index f74b30a..b2a6785 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -650,7 +650,7 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) {
}
// Write the message.
- int CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc);
+ unsigned CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc);
*OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
<< ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":"
<< LogMessage + "\n";
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 95c4971..98b2b3b 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -150,7 +150,7 @@ public:
private:
bool ParseSectionName(StringRef &SectionName);
- bool ParseSectionArguments(bool IsPush);
+ bool ParseSectionArguments(bool IsPush, SMLoc loc);
unsigned parseSunStyleSectionFlags();
};
@@ -382,7 +382,7 @@ unsigned ELFAsmParser::parseSunStyleSectionFlags() {
bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) {
getStreamer().PushSection();
- if (ParseSectionArguments(/*IsPush=*/true)) {
+ if (ParseSectionArguments(/*IsPush=*/true, loc)) {
getStreamer().PopSection();
return true;
}
@@ -397,11 +397,11 @@ bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
}
// FIXME: This is a work in progress.
-bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
- return ParseSectionArguments(/*IsPush=*/false);
+bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) {
+ return ParseSectionArguments(/*IsPush=*/false, loc);
}
-bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
+bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef SectionName;
if (ParseSectionName(SectionName))
@@ -545,10 +545,24 @@ EndStmt:
}
SectionKind Kind = computeSectionKind(Flags, Size);
- getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type,
- Flags, Kind, Size,
- GroupName),
- Subsection);
+ const MCSection *ELFSection = getContext().getELFSection(
+ SectionName, Type, Flags, Kind, Size, GroupName);
+ getStreamer().SwitchSection(ELFSection, Subsection);
+
+ if (getContext().getGenDwarfForAssembly()) {
+ auto &Sections = getContext().getGenDwarfSectionSyms();
+ auto InsertResult = Sections.insert(
+ std::make_pair(ELFSection, std::make_pair(nullptr, nullptr)));
+ if (InsertResult.second) {
+ if (getContext().getDwarfVersion() <= 2)
+ Error(loc, "DWARF2 only supports one section per compilation unit");
+
+ MCSymbol *SectionStartSymbol = getContext().CreateTempSymbol();
+ getStreamer().EmitLabel(SectionStartSymbol);
+ InsertResult.first->second.first = SectionStartSymbol;
+ }
+ }
+
return false;
}
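
The DWARF bookkeeping added above relies on the standard insert-once idiom: insert returns an (iterator, inserted) pair, and the start label is only created and recorded the first time a section is seen. A standalone illustration of that idiom:

    // Standalone sketch of the insert-once pattern used above.
    #include <map>
    #include <utility>

    std::map<int, const char *> Seen;

    bool rememberOnce(int Key, const char *Val) {
      auto InsertResult = Seen.insert(std::make_pair(Key, Val));
      return InsertResult.second; // true only on first insertion of Key
    }
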
@@ -561,6 +575,19 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
return false;
}
+static MCSymbolAttr MCAttrForString(StringRef Type) {
+ return StringSwitch<MCSymbolAttr>(Type)
+ .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction)
+ .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject)
+ .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS)
+ .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon)
+ .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType)
+ .Cases("STT_GNU_IFUNC", "gnu_indirect_function",
+ MCSA_ELF_TypeIndFunction)
+ .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+ .Default(MCSA_Invalid);
+}
+
/// ParseDirectiveELFType
/// ::= .type identifier , STT_<TYPE_IN_UPPER_CASE>
/// ::= .type identifier , #attribute
@@ -575,53 +602,36 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
// Handle the identifier as the key symbol.
MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.type' directive");
- Lex();
-
- StringRef Type;
- SMLoc TypeLoc;
- MCSymbolAttr Attr;
- if (getLexer().is(AsmToken::Identifier)) {
- TypeLoc = getLexer().getLoc();
- if (getParser().parseIdentifier(Type))
- return TokError("expected symbol type in directive");
- Attr = StringSwitch<MCSymbolAttr>(Type)
- .Case("STT_FUNC", MCSA_ELF_TypeFunction)
- .Case("STT_OBJECT", MCSA_ELF_TypeObject)
- .Case("STT_TLS", MCSA_ELF_TypeTLS)
- .Case("STT_COMMON", MCSA_ELF_TypeCommon)
- .Case("STT_NOTYPE", MCSA_ELF_TypeNoType)
- .Case("STT_GNU_IFUNC", MCSA_ELF_TypeIndFunction)
- .Default(MCSA_Invalid);
- } else if (getLexer().is(AsmToken::Hash) || getLexer().is(AsmToken::At) ||
- getLexer().is(AsmToken::Percent) ||
- getLexer().is(AsmToken::String)) {
- if (!getLexer().is(AsmToken::String))
- Lex();
+ // NOTE: the comma is optional in all cases. It is documented as optional
+ // only for the first form; however, GAS silently treats it as optional in
+ // all of them. Furthermore, although the documentation states that the
+ // first form accepts only STT_<TYPE_IN_UPPER_CASE>, in reality GAS accepts
+ // both the upper-case names and the lower-case aliases.
+ if (getLexer().is(AsmToken::Comma))
+ Lex();
- TypeLoc = getLexer().getLoc();
- if (getParser().parseIdentifier(Type))
- return TokError("expected symbol type in directive");
- Attr = StringSwitch<MCSymbolAttr>(Type)
- .Case("function", MCSA_ELF_TypeFunction)
- .Case("object", MCSA_ELF_TypeObject)
- .Case("tls_object", MCSA_ELF_TypeTLS)
- .Case("common", MCSA_ELF_TypeCommon)
- .Case("notype", MCSA_ELF_TypeNoType)
- .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
- .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
- .Default(MCSA_Invalid);
- } else
+ if (getLexer().isNot(AsmToken::Identifier) &&
+ getLexer().isNot(AsmToken::Hash) && getLexer().isNot(AsmToken::At) &&
+ getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::String))
return TokError("expected STT_<TYPE_IN_UPPER_CASE>, '#<type>', '@<type>', "
"'%<type>' or \"<type>\"");
+ if (getLexer().isNot(AsmToken::String) &&
+ getLexer().isNot(AsmToken::Identifier))
+ Lex();
+
+ SMLoc TypeLoc = getLexer().getLoc();
+
+ StringRef Type;
+ if (getParser().parseIdentifier(Type))
+ return TokError("expected symbol type in directive");
+
+ MCSymbolAttr Attr = MCAttrForString(Type);
if (Attr == MCSA_Invalid)
return Error(TypeLoc, "unsupported attribute in '.type' directive");
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.type' directive");
-
Lex();
getStreamer().EmitSymbolAttribute(Sym, Attr);
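
Since both spellings now funnel through MCAttrForString, the upper-case ELF names and the GAS-style aliases are guaranteed to agree. A hypothetical sanity check, not part of the patch:

    // Hypothetical check of the unified mapping.
    assert(MCAttrForString("STT_FUNC") == MCSA_ELF_TypeFunction);
    assert(MCAttrForString("function") == MCSA_ELF_TypeFunction);
    assert(MCAttrForString("no_such_type") == MCSA_Invalid);
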
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index 335b8cd..fc2bd36 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -30,14 +30,9 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
return false;
}
-void MCSectionCOFF::setSelection(int Selection,
- const MCSectionCOFF *Assoc) const {
+void MCSectionCOFF::setSelection(int Selection) const {
assert(Selection != 0 && "invalid COMDAT selection type");
- assert((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) ==
- (Assoc != nullptr) &&
- "associative COMDAT section must have an associated section");
this->Selection = Selection;
- this->Assoc = Assoc;
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
}
@@ -82,7 +77,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "same_contents,";
break;
case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
- OS << "associative " << Assoc->getSectionName() << ",";
+ OS << "associative,";
break;
case COFF::IMAGE_COMDAT_SELECT_LARGEST:
OS << "largest,";
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 7dccf0d..bdcdb97 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -37,7 +37,7 @@ void MCTargetStreamer::finish() {}
void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
MCStreamer::MCStreamer(MCContext &Ctx)
- : Context(Ctx), CurrentW64UnwindInfo(nullptr), LastSymbol(nullptr) {
+ : Context(Ctx), CurrentW64UnwindInfo(nullptr) {
SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
}
@@ -51,7 +51,6 @@ void MCStreamer::reset() {
delete W64UnwindInfos[i];
W64UnwindInfos.clear();
CurrentW64UnwindInfo = nullptr;
- LastSymbol = nullptr;
SectionStack.clear();
SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
}
@@ -234,20 +233,12 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) {
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSection().first && "Cannot emit before setting section!");
AssignSection(Symbol, getCurrentSection().first);
- LastSymbol = Symbol;
MCTargetStreamer *TS = getTargetStreamer();
if (TS)
TS->emitLabel(Symbol);
}
-void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
- assert(getCurrentSection().first && "Cannot emit before setting section!");
- AssignSection(Symbol, getCurrentSection().first);
- LastSymbol = Symbol;
-}
-
void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
EnsureValidFrame();
MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
@@ -273,17 +264,6 @@ void MCStreamer::EmitCFIStartProc(bool IsSimple) {
void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
}
-void MCStreamer::RecordProcStart(MCDwarfFrameInfo &Frame) {
- // Report an error if we haven't seen a symbol yet where we'd bind
- // .cfi_startproc.
- if (!LastSymbol)
- report_fatal_error("No symbol to start a frame");
- Frame.Function = LastSymbol;
- // We need to create a local symbol to avoid relocations.
- Frame.Begin = getContext().CreateTempSymbol();
- EmitLabel(Frame.Begin);
-}
-
void MCStreamer::EmitCFIEndProc() {
EnsureValidFrame();
MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
@@ -291,11 +271,9 @@ void MCStreamer::EmitCFIEndProc() {
}
void MCStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
-}
-
-void MCStreamer::RecordProcEnd(MCDwarfFrameInfo &Frame) {
- Frame.End = getContext().CreateTempSymbol();
- EmitLabel(Frame.End);
+ // Put a dummy non-null value in Frame.End to mark that this frame has been
+ // closed.
+ Frame.End = (MCSymbol *) 1;
}
MCSymbol *MCStreamer::EmitCFICommon() {
@@ -447,7 +425,7 @@ void MCStreamer::EnsureValidW64UnwindInfo() {
report_fatal_error("No open Win64 EH frame function!");
}
-void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) {
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame && !CurFrame->End)
report_fatal_error("Starting a function before ending the previous one!");
@@ -458,7 +436,7 @@ void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
setCurrentW64UnwindInfo(Frame);
}
-void MCStreamer::EmitWin64EHEndProc() {
+void MCStreamer::EmitWinCFIEndProc() {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame->ChainedParent)
@@ -467,7 +445,7 @@ void MCStreamer::EmitWin64EHEndProc() {
EmitLabel(CurFrame->End);
}
-void MCStreamer::EmitWin64EHStartChained() {
+void MCStreamer::EmitWinCFIStartChained() {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
@@ -478,7 +456,7 @@ void MCStreamer::EmitWin64EHStartChained() {
setCurrentW64UnwindInfo(Frame);
}
-void MCStreamer::EmitWin64EHEndChained() {
+void MCStreamer::EmitWinCFIEndChained() {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (!CurFrame->ChainedParent)
@@ -488,8 +466,8 @@ void MCStreamer::EmitWin64EHEndChained() {
CurrentW64UnwindInfo = CurFrame->ChainedParent;
}
-void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
- bool Except) {
+void MCStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind,
+ bool Except) {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame->ChainedParent)
@@ -503,14 +481,14 @@ void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
CurFrame->HandlesExceptions = true;
}
-void MCStreamer::EmitWin64EHHandlerData() {
+void MCStreamer::EmitWinEHHandlerData() {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame->ChainedParent)
report_fatal_error("Chained unwind areas can't have handlers!");
}
-void MCStreamer::EmitWin64EHPushReg(unsigned Register) {
+void MCStreamer::EmitWinCFIPushReg(unsigned Register) {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
MCSymbol *Label = getContext().CreateTempSymbol();
@@ -519,13 +497,15 @@ void MCStreamer::EmitWin64EHPushReg(unsigned Register) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+void MCStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset) {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame->LastFrameInst >= 0)
report_fatal_error("Frame register and offset already specified!");
if (Offset & 0x0F)
report_fatal_error("Misaligned frame pointer offset!");
+ if (Offset > 240)
+ report_fatal_error("Frame offset must be less than or equal to 240!");
MCSymbol *Label = getContext().CreateTempSymbol();
MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset);
EmitLabel(Label);
@@ -533,8 +513,10 @@ void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHAllocStack(unsigned Size) {
+void MCStreamer::EmitWinCFIAllocStack(unsigned Size) {
EnsureValidW64UnwindInfo();
+ if (Size == 0)
+ report_fatal_error("Allocation size must be non-zero!");
if (Size & 7)
report_fatal_error("Misaligned stack allocation!");
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
@@ -544,7 +526,7 @@ void MCStreamer::EmitWin64EHAllocStack(unsigned Size) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+void MCStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset) {
EnsureValidW64UnwindInfo();
if (Offset & 7)
report_fatal_error("Misaligned saved register offset!");
@@ -557,7 +539,7 @@ void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
EnsureValidW64UnwindInfo();
if (Offset & 0x0F)
report_fatal_error("Misaligned saved vector register offset!");
@@ -570,7 +552,7 @@ void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHPushFrame(bool Code) {
+void MCStreamer::EmitWinCFIPushFrame(bool Code) {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
if (CurFrame->Instructions.size() > 0)
@@ -581,7 +563,7 @@ void MCStreamer::EmitWin64EHPushFrame(bool Code) {
CurFrame->Instructions.push_back(Inst);
}
-void MCStreamer::EmitWin64EHEndProlog() {
+void MCStreamer::EmitWinCFIEndProlog() {
EnsureValidW64UnwindInfo();
MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
CurFrame->PrologEnd = getContext().CreateTempSymbol();
@@ -589,11 +571,9 @@ void MCStreamer::EmitWin64EHEndProlog() {
}
void MCStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
- llvm_unreachable("This file format doesn't support this directive");
}
void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
- llvm_unreachable("This file format doesn't support this directive");
}
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
@@ -629,9 +609,82 @@ void MCStreamer::Finish() {
}
void MCStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+ visitUsedExpr(*Value);
Symbol->setVariableValue(Value);
MCTargetStreamer *TS = getTargetStreamer();
if (TS)
TS->emitAssignment(Symbol, Value);
}
+
+void MCStreamer::visitUsedSymbol(const MCSymbol &Sym) {
+}
+
+void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
+ switch (Expr.getKind()) {
+ case MCExpr::Target:
+ cast<MCTargetExpr>(Expr).visitUsedExpr(*this);
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr &BE = cast<MCBinaryExpr>(Expr);
+ visitUsedExpr(*BE.getLHS());
+ visitUsedExpr(*BE.getRHS());
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ visitUsedSymbol(cast<MCSymbolRefExpr>(Expr).getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ visitUsedExpr(*cast<MCUnaryExpr>(Expr).getSubExpr());
+ break;
+ }
+}
+
+void MCStreamer::EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ // Scan for values.
+ for (unsigned i = Inst.getNumOperands(); i--;)
+ if (Inst.getOperand(i).isExpr())
+ visitUsedExpr(*Inst.getOperand(i).getExpr());
+}
+
+void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
+void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
+void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+void MCStreamer::EndCOFFSymbolDef() {}
+void MCStreamer::EmitFileDirective(StringRef Filename) {}
+void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {}
+void MCStreamer::EmitCOFFSymbolType(int Type) {}
+void MCStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {}
+void MCStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {}
+void MCStreamer::ChangeSection(const MCSection *, const MCExpr *) {}
+void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
+void MCStreamer::EmitBytes(StringRef Data) {}
+void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+ const SMLoc &Loc) {
+ visitUsedExpr(*Value);
+}
+void MCStreamer::EmitULEB128Value(const MCExpr *Value) {}
+void MCStreamer::EmitSLEB128Value(const MCExpr *Value) {}
+void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+ unsigned ValueSize,
+ unsigned MaxBytesToEmit) {}
+void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+ unsigned MaxBytesToEmit) {}
+bool MCStreamer::EmitValueToOffset(const MCExpr *Offset, unsigned char Value) {
+ return false;
+}
+void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {}
+void MCStreamer::EmitBundleLock(bool AlignToEnd) {}
+void MCStreamer::FinishImpl() {}
+void MCStreamer::EmitBundleUnlock() {}
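
Together with visitUsedExpr/visitUsedSymbol, these empty default implementations make MCStreamer subclassable by "recording" streamers that only want to observe which symbols assembly references; the RecordStreamer added to lib/Object elsewhere in this patch is one such client. A toy version, assuming the post-patch MCStreamer where EmitSymbolAttribute, EmitCommonSymbol, and EmitZerofill are the main hooks still left pure virtual (signatures assumed from this era's headers):

    // Toy recording streamer (sketch; remaining pure-virtual hooks stubbed).
    class SymbolCollector : public MCStreamer {
      StringSet<> Used; // llvm::StringSet of referenced symbol names
      void visitUsedSymbol(const MCSymbol &Sym) override {
        Used.insert(Sym.getName());
      }
      bool EmitSymbolAttribute(MCSymbol *, MCSymbolAttr) override {
        return true;
      }
      void EmitCommonSymbol(MCSymbol *, uint64_t, unsigned) override {}
      void EmitZerofill(const MCSection *, MCSymbol *, uint64_t,
                        unsigned) override {}
    public:
      SymbolCollector(MCContext &Ctx) : MCStreamer(Ctx) {}
    };
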
diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index 8e946d5..efd724a 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp
@@ -14,6 +14,7 @@ namespace llvm {
MCTargetOptions::MCTargetOptions()
: SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
MCSaveTempLabels(false), MCUseDwarfDirectory(false),
- ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false) {}
+ ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
+ DwarfVersion(0) {}
} // end namespace llvm
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index b8b07d3..bb651647 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -20,34 +20,30 @@ namespace llvm {
// NOTE: All relocations generated here are 4-byte image-relative.
-static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){
- uint8_t count = 0;
- for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(),
- E = instArray.end(); I != E; ++I) {
- switch (I->getOperation()) {
+static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &Insns) {
+ uint8_t Count = 0;
+ for (const auto &I : Insns) {
+ switch (I.getOperation()) {
case Win64EH::UOP_PushNonVol:
case Win64EH::UOP_AllocSmall:
case Win64EH::UOP_SetFPReg:
case Win64EH::UOP_PushMachFrame:
- count += 1;
+ Count += 1;
break;
case Win64EH::UOP_SaveNonVol:
case Win64EH::UOP_SaveXMM128:
- count += 2;
+ Count += 2;
break;
case Win64EH::UOP_SaveNonVolBig:
case Win64EH::UOP_SaveXMM128Big:
- count += 3;
+ Count += 3;
break;
case Win64EH::UOP_AllocLarge:
- if (I->getSize() > 512*1024-8)
- count += 3;
- else
- count += 2;
+ Count += (I.getSize() > 512 * 1024 - 8) ? 3 : 2;
break;
}
}
- return count;
+ return Count;
}
static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
@@ -274,23 +270,23 @@ void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
llvm::EmitUnwindInfo(streamer, info);
}
-void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
- MCContext &context = streamer.getContext();
+void MCWin64EHUnwindEmitter::Emit(MCStreamer &Streamer) {
+ MCContext &Context = Streamer.getContext();
+
// Emit the unwind info structs first.
- for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
- MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
- const MCSection *xdataSect =
- getWin64EHTableSection(GetSectionSuffix(info.Function), context);
- streamer.SwitchSection(xdataSect);
- llvm::EmitUnwindInfo(streamer, &info);
+ for (const auto &CFI : Streamer.getW64UnwindInfos()) {
+ const MCSection *XData =
+ getWin64EHTableSection(GetSectionSuffix(CFI->Function), Context);
+ Streamer.SwitchSection(XData);
+ EmitUnwindInfo(Streamer, CFI);
}
+
// Now emit RUNTIME_FUNCTION entries.
- for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
- MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
- const MCSection *pdataSect =
- getWin64EHFuncTableSection(GetSectionSuffix(info.Function), context);
- streamer.SwitchSection(pdataSect);
- EmitRuntimeFunction(streamer, &info);
+ for (const auto &CFI : Streamer.getW64UnwindInfos()) {
+ const MCSection *PData =
+ getWin64EHFuncTableSection(GetSectionSuffix(CFI->Function), Context);
+ Streamer.SwitchSection(PData);
+ EmitRuntimeFunction(Streamer, CFI);
}
}
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index cbaf0b8..5214398 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -303,20 +303,50 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
assert(OS.tell() - Start == sizeof(MachO::dysymtab_command));
}
+MachObjectWriter::MachSymbolData *
+MachObjectWriter::findSymbolData(const MCSymbol &Sym) {
+ for (auto &Entry : LocalSymbolData)
+ if (&Entry.SymbolData->getSymbol() == &Sym)
+ return &Entry;
+
+ for (auto &Entry : ExternalSymbolData)
+ if (&Entry.SymbolData->getSymbol() == &Sym)
+ return &Entry;
+
+ for (auto &Entry : UndefinedSymbolData)
+ if (&Entry.SymbolData->getSymbol() == &Sym)
+ return &Entry;
+
+ return nullptr;
+}
+
void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
const MCAsmLayout &Layout) {
MCSymbolData &Data = *MSD.SymbolData;
- const MCSymbol &Symbol = Data.getSymbol();
+ const MCSymbol *Symbol = &Data.getSymbol();
+ const MCSymbol *AliasedSymbol = &Symbol->AliasedSymbol();
+ uint8_t SectionIndex = MSD.SectionIndex;
uint8_t Type = 0;
uint16_t Flags = Data.getFlags();
uint64_t Address = 0;
+ bool IsAlias = Symbol != AliasedSymbol;
+
+ MachSymbolData *AliaseeInfo;
+ if (IsAlias) {
+ AliaseeInfo = findSymbolData(*AliasedSymbol);
+ if (AliaseeInfo)
+ SectionIndex = AliaseeInfo->SectionIndex;
+ Symbol = AliasedSymbol;
+ }
// Set the N_TYPE bits. See <mach-o/nlist.h>.
//
// FIXME: Are the prebound or indirect fields possible here?
- if (Symbol.isUndefined())
+ if (IsAlias && Symbol->isUndefined())
+ Type = MachO::N_INDR;
+ else if (Symbol->isUndefined())
Type = MachO::N_UNDF;
- else if (Symbol.isAbsolute())
+ else if (Symbol->isAbsolute())
Type = MachO::N_ABS;
else
Type = MachO::N_SECT;
@@ -327,13 +357,15 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
Type |= MachO::N_PEXT;
// Set external bit.
- if (Data.isExternal() || Symbol.isUndefined())
+ if (Data.isExternal() || (!IsAlias && Symbol->isUndefined()))
Type |= MachO::N_EXT;
// Compute the symbol address.
- if (Symbol.isDefined()) {
+ if (IsAlias && Symbol->isUndefined())
+ Address = AliaseeInfo->StringIndex;
+ else if (Symbol->isDefined())
Address = getSymbolAddress(&Data, Layout);
- } else if (Data.isCommon()) {
+ else if (Data.isCommon()) {
// Common symbols are encoded with the size in the address
// field, and their alignment in the flags.
Address = Data.getCommonSize();
@@ -344,21 +376,21 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
assert((1U << Log2Size) == Align && "Invalid 'common' alignment!");
if (Log2Size > 15)
report_fatal_error("invalid 'common' alignment '" +
- Twine(Align) + "' for '" + Symbol.getName() + "'",
+ Twine(Align) + "' for '" + Symbol->getName() + "'",
false);
// FIXME: Keep this mask with the SymbolFlags enumeration.
Flags = (Flags & 0xF0FF) | (Log2Size << 8);
}
}
- if (Layout.getAssembler().isThumbFunc(&Symbol))
+ if (Layout.getAssembler().isThumbFunc(Symbol))
Flags |= SF_ThumbFunc;
// struct nlist (12 bytes)
Write32(MSD.StringIndex);
Write8(Type);
- Write8(MSD.SectionIndex);
+ Write8(SectionIndex);
// The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
// value.
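
For reference, the record WriteNlist is filling in is the classic Mach-O nlist entry; for an alias to an undefined symbol, the patch emits N_INDR and stores the aliasee's string-table index in n_value:

    // The nlist record being written (layout as in <mach-o/nlist.h>):
    struct nlist {
      uint32_t n_strx;  // index into the string table
      uint8_t  n_type;  // N_INDR / N_UNDF / N_ABS / N_SECT, plus N_EXT etc.
      uint8_t  n_sect;  // section index (the aliasee's, for aliases)
      int16_t  n_desc;
      uint32_t n_value; // for N_INDR: string-table index of the aliasee
    };
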
diff --git a/lib/MC/Makefile b/lib/MC/Makefile
index bf8b7c0..a10f17e 100644
--- a/lib/MC/Makefile
+++ b/lib/MC/Makefile
@@ -10,7 +10,7 @@
LEVEL = ../..
LIBRARYNAME = LLVMMC
BUILD_ARCHIVE := 1
-PARALLEL_DIRS := MCParser MCDisassembler
+PARALLEL_DIRS := MCAnalysis MCParser MCDisassembler
include $(LEVEL)/Makefile.common
diff --git a/lib/Object/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp
index 9152834..db58ece 100644
--- a/lib/Object/StringTableBuilder.cpp
+++ b/lib/MC/StringTableBuilder.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Object/StringTableBuilder.h"
using namespace llvm;
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 961cbc6..a462c0d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -347,6 +347,14 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
COFFSection *coff_section = createSection(Sec.getSectionName());
COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName());
+ if (Sec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (const MCSymbol *S = Sec.getCOMDATSymbol()) {
+ COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
+ if (COMDATSymbol->Section)
+ report_fatal_error("two sections have the same comdat");
+ COMDATSymbol->Section = coff_section;
+ }
+ }
coff_section->Symbol = coff_symbol;
coff_symbol->Section = coff_section;
@@ -458,9 +466,15 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
} else {
const MCSymbolData &BaseData = Assembler.getSymbolData(*Base);
- if (BaseData.Fragment)
- coff_symbol->Section =
+ if (BaseData.Fragment) {
+ COFFSection *Sec =
SectionMap[&BaseData.Fragment->getParent()->getSection()];
+
+ if (coff_symbol->Section && coff_symbol->Section != Sec)
+ report_fatal_error("conflicting sections for symbol");
+
+ coff_symbol->Section = Sec;
+ }
}
coff_symbol->MCData = &ResSymData;
@@ -537,7 +551,7 @@ bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
// This doesn't seem to be right. Strings referred to from the .data section
// need symbols so they can be linked to code in the .text section right?
- // return Asm.isSymbolLinkerVisible (&SymbolData);
+ // return Asm.isSymbolLinkerVisible(SymbolData.getSymbol());
// For now, all non-variable symbols are exported,
// the linker will sort the rest out for us.
@@ -819,13 +833,9 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
DenseMap<COFFSection *, uint16_t> SectionIndices;
for (auto & Section : Sections) {
- if (Layout.getSectionAddressSize(Section->MCData) > 0) {
- size_t Number = ++Header.NumberOfSections;
- SectionIndices[Section.get()] = Number;
- MakeSectionReal(*Section, Number);
- } else {
- Section->Number = -1;
- }
+ size_t Number = ++Header.NumberOfSections;
+ SectionIndices[Section.get()] = Number;
+ MakeSectionReal(*Section, Number);
}
Header.NumberOfSymbols = 0;
@@ -865,11 +875,15 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
const MCSectionCOFF &MCSec =
static_cast<const MCSectionCOFF &>(Section->MCData->getSection());
- COFFSection *Assoc = SectionMap.lookup(MCSec.getAssocSection());
+ const MCSymbol *COMDAT = MCSec.getCOMDATSymbol();
+ assert(COMDAT);
+ COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(COMDAT);
+ assert(COMDATSymbol);
+ COFFSection *Assoc = COMDATSymbol->Section;
if (!Assoc)
- report_fatal_error(Twine("Missing associated COMDAT section ") +
- MCSec.getAssocSection()->getSectionName() +
- " for section " + MCSec.getSectionName());
+ report_fatal_error(
+ Twine("Missing associated COMDAT section for section ") +
+ MCSec.getSectionName());
// Skip this section if the associated section is unused.
if (Assoc->Number == -1)
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index e6df465..d391a3f 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -82,10 +82,6 @@ void MCWinCOFFStreamer::EmitLabel(MCSymbol *Symbol) {
MCObjectStreamer::EmitLabel(Symbol);
}
-void MCWinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
- EmitLabel(Symbol);
-}
-
void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
llvm_unreachable("not implemented");
}
@@ -242,7 +238,7 @@ void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) {
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::EmitWin64EHHandlerData() {
+void MCWinCOFFStreamer::EmitWinEHHandlerData() {
llvm_unreachable("not implemented");
}
diff --git a/lib/Object/YAML.cpp b/lib/MC/YAML.cpp
index 61e9da3..067e91a 100644
--- a/lib/Object/YAML.cpp
+++ b/lib/MC/YAML.cpp
@@ -12,21 +12,20 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/YAML.h"
+#include "llvm/MC/YAML.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
using namespace llvm;
-using namespace object::yaml;
-void yaml::ScalarTraits<object::yaml::BinaryRef>::output(
- const object::yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) {
+void yaml::ScalarTraits<yaml::BinaryRef>::output(
+ const yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) {
Val.writeAsHex(Out);
}
-StringRef yaml::ScalarTraits<object::yaml::BinaryRef>::input(
- StringRef Scalar, void *, object::yaml::BinaryRef &Val) {
+StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *,
+ yaml::BinaryRef &Val) {
if (Scalar.size() % 2 != 0)
return "BinaryRef hex string must contain an even number of nybbles.";
// TODO: Can we improve YAMLIO to permit a more accurate diagnostic here?
@@ -34,11 +33,11 @@ StringRef yaml::ScalarTraits<object::yaml::BinaryRef>::input(
for (unsigned I = 0, N = Scalar.size(); I != N; ++I)
if (!isxdigit(Scalar[I]))
return "BinaryRef hex string must contain only hex digits.";
- Val = object::yaml::BinaryRef(Scalar);
+ Val = yaml::BinaryRef(Scalar);
return StringRef();
}
-void BinaryRef::writeAsBinary(raw_ostream &OS) const {
+void yaml::BinaryRef::writeAsBinary(raw_ostream &OS) const {
if (!DataIsHexString) {
OS.write((const char *)Data.data(), Data.size());
return;
@@ -50,7 +49,7 @@ void BinaryRef::writeAsBinary(raw_ostream &OS) const {
}
}
-void BinaryRef::writeAsHex(raw_ostream &OS) const {
+void yaml::BinaryRef::writeAsHex(raw_ostream &OS) const {
if (binary_size() == 0)
return;
if (DataIsHexString) {
diff --git a/lib/Object/Android.mk b/lib/Object/Android.mk
index 4385f5a..acda4f2 100644
--- a/lib/Object/Android.mk
+++ b/lib/Object/Android.mk
@@ -7,17 +7,15 @@ object_SRC_FILES := \
COFFYAML.cpp \
ELF.cpp \
ELFObjectFile.cpp \
+ ELFYAML.cpp \
Error.cpp \
IRObjectFile.cpp \
MachOObjectFile.cpp \
MachOUniversal.cpp \
Object.cpp \
ObjectFile.cpp \
- StringTableBuilder.cpp \
- SymbolicFile.cpp \
- YAML.cpp \
- ELFYAML.cpp \
-
+ RecordStreamer.cpp \
+ SymbolicFile.cpp
# For the host
# =====================================================
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 304ca47..6d09bdb 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -115,18 +115,14 @@ Archive::Child Archive::Child::getNext() const {
return Child(Parent, NextLoc);
}
-error_code Archive::Child::getName(StringRef &Result) const {
+ErrorOr<StringRef> Archive::Child::getName() const {
StringRef name = getRawName();
// Check if it's a special name.
if (name[0] == '/') {
- if (name.size() == 1) { // Linker member.
- Result = name;
- return object_error::success;
- }
- if (name.size() == 2 && name[1] == '/') { // String table.
- Result = name;
- return object_error::success;
- }
+ if (name.size() == 1) // Linker member.
+ return name;
+ if (name.size() == 2 && name[1] == '/') // String table.
+ return name;
// It's a long name.
// Get the offset.
std::size_t offset;
@@ -147,68 +143,62 @@ error_code Archive::Child::getName(StringRef &Result) const {
// GNU long file names end with a /.
if (Parent->kind() == K_GNU) {
StringRef::size_type End = StringRef(addr).find('/');
- Result = StringRef(addr, End);
- } else {
- Result = addr;
+ return StringRef(addr, End);
}
- return object_error::success;
+ return StringRef(addr);
} else if (name.startswith("#1/")) {
uint64_t name_size;
if (name.substr(3).rtrim(" ").getAsInteger(10, name_size))
llvm_unreachable("Long name length is not an ingeter");
- Result = Data.substr(sizeof(ArchiveMemberHeader), name_size)
+ return Data.substr(sizeof(ArchiveMemberHeader), name_size)
.rtrim(StringRef("\0", 1));
- return object_error::success;
}
// It's a simple name.
if (name[name.size() - 1] == '/')
- Result = name.substr(0, name.size() - 1);
- else
- Result = name;
- return object_error::success;
+ return name.substr(0, name.size() - 1);
+ return name;
}
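
Callers of the converted accessors follow the getError()/get() pattern that the patch itself uses in getMemoryBuffer below; a hypothetical helper to show the shape:

    // Hypothetical caller of the new ErrorOr-based accessor.
    static std::error_code printMemberName(const Archive::Child &C,
                                           raw_ostream &OS) {
      ErrorOr<StringRef> NameOrErr = C.getName();
      if (std::error_code EC = NameOrErr.getError())
        return EC;
      OS << *NameOrErr << "\n";
      return std::error_code();
    }
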
-error_code Archive::Child::getMemoryBuffer(std::unique_ptr<MemoryBuffer> &Result,
- bool FullPath) const {
- StringRef Name;
- if (error_code ec = getName(Name))
- return ec;
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+Archive::Child::getMemoryBuffer(bool FullPath) const {
+ ErrorOr<StringRef> NameOrErr = getName();
+ if (std::error_code EC = NameOrErr.getError())
+ return EC;
+ StringRef Name = NameOrErr.get();
SmallString<128> Path;
- Result.reset(MemoryBuffer::getMemBuffer(
- getBuffer(), FullPath ? (Twine(Parent->getFileName()) + "(" + Name + ")")
- .toStringRef(Path)
- : Name,
+ std::unique_ptr<MemoryBuffer> Ret(MemoryBuffer::getMemBuffer(
+ getBuffer(),
+ FullPath
+ ? (Twine(Parent->getFileName()) + "(" + Name + ")").toStringRef(Path)
+ : Name,
false));
- return error_code::success();
+ return std::move(Ret);
}
-error_code Archive::Child::getAsBinary(std::unique_ptr<Binary> &Result,
- LLVMContext *Context) const {
+ErrorOr<std::unique_ptr<Binary>>
+Archive::Child::getAsBinary(LLVMContext *Context) const {
std::unique_ptr<Binary> ret;
- std::unique_ptr<MemoryBuffer> Buff;
- if (error_code ec = getMemoryBuffer(Buff))
- return ec;
- ErrorOr<Binary *> BinaryOrErr = createBinary(Buff.release(), Context);
- if (error_code EC = BinaryOrErr.getError())
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr = getMemoryBuffer();
+ if (std::error_code EC = BuffOrErr.getError())
return EC;
- Result.reset(BinaryOrErr.get());
- return object_error::success;
+
+ std::unique_ptr<MemoryBuffer> Buff(BuffOrErr.get().release());
+ return createBinary(Buff, Context);
}
-ErrorOr<Archive*> Archive::create(MemoryBuffer *Source) {
- error_code EC;
- std::unique_ptr<Archive> Ret(new Archive(Source, EC));
+ErrorOr<Archive *> Archive::create(std::unique_ptr<MemoryBuffer> Source) {
+ std::error_code EC;
+ std::unique_ptr<Archive> Ret(new Archive(std::move(Source), EC));
if (EC)
return EC;
return Ret.release();
}
-Archive::Archive(MemoryBuffer *source, error_code &ec)
- : Binary(Binary::ID_Archive, source), SymbolTable(child_end()) {
+Archive::Archive(std::unique_ptr<MemoryBuffer> Source, std::error_code &ec)
+ : Binary(Binary::ID_Archive, std::move(Source)), SymbolTable(child_end()) {
// Check for sufficient magic.
- assert(source);
- if (source->getBufferSize() < 8 ||
- StringRef(source->getBufferStart(), 8) != Magic) {
+ if (Data->getBufferSize() < 8 ||
+ StringRef(Data->getBufferStart(), 8) != Magic) {
ec = object_error::invalid_file_type;
return;
}
@@ -255,9 +245,11 @@ Archive::Archive(MemoryBuffer *source, error_code &ec)
if (Name.startswith("#1/")) {
Format = K_BSD;
// We know this is BSD, so getName will work since there is no string table.
- ec = i->getName(Name);
+ ErrorOr<StringRef> NameOrErr = i->getName();
+ ec = NameOrErr.getError();
if (ec)
return;
+ Name = NameOrErr.get();
if (Name == "__.SYMDEF SORTED") {
SymbolTable = i;
++i;
@@ -335,12 +327,11 @@ Archive::child_iterator Archive::child_end() const {
return Child(this, nullptr);
}
-error_code Archive::Symbol::getName(StringRef &Result) const {
- Result = StringRef(Parent->SymbolTable->getBuffer().begin() + StringIndex);
- return object_error::success;
+StringRef Archive::Symbol::getName() const {
+ return Parent->SymbolTable->getBuffer().begin() + StringIndex;
}
-error_code Archive::Symbol::getMember(child_iterator &Result) const {
+ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const {
const char *Buf = Parent->SymbolTable->getBuffer().begin();
const char *Offsets = Buf + 4;
uint32_t Offset = 0;
@@ -348,7 +339,14 @@ error_code Archive::Symbol::getMember(child_iterator &Result) const {
Offset = *(reinterpret_cast<const support::ubig32_t*>(Offsets)
+ SymbolIndex);
} else if (Parent->kind() == K_BSD) {
- llvm_unreachable("BSD format is not supported");
+ // The SymbolIndex is an index into the ranlib structs that start at
+ // Offsets (the first uint32_t is the number of bytes of the ranlib
+ // structs). Each ranlib struct is a pair of uint32_t's: the first is a
+ // string table offset and the second is the offset into the archive of
+ // the member that defines the symbol, which is what is needed here.
+ Offset = *(reinterpret_cast<const support::ulittle32_t *>(Offsets) +
+ (SymbolIndex * 2) + 1);
} else {
uint32_t MemberCount = *reinterpret_cast<const support::ulittle32_t*>(Buf);
@@ -380,16 +378,49 @@ error_code Archive::Symbol::getMember(child_iterator &Result) const {
}
const char *Loc = Parent->getData().begin() + Offset;
- Result = Child(Parent, Loc);
-
- return object_error::success;
+ child_iterator Iter(Child(Parent, Loc));
+ return Iter;
}
Archive::Symbol Archive::Symbol::getNext() const {
Symbol t(*this);
- // Go to one past next null.
- t.StringIndex =
- Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+ if (Parent->kind() == K_BSD) {
+ // t.StringIndex is an offset from the start of the __.SYMDEF or
+ // "__.SYMDEF SORTED" member into the string table for the ranlib
+ // struct indexed by t.SymbolIndex. To change t.StringIndex to the
+ // offset in the string table for t.SymbolIndex+1, we subtract its
+ // offset from the start of the string table for t.SymbolIndex and
+ // add the offset of the string table for t.SymbolIndex+1.
+
+ // The __.SYMDEF or "__.SYMDEF SORTED" member starts with a uint32_t
+ // which is the number of bytes of ranlib structs that follow. The ranlib
+ // structs are pairs of uint32_t's: the first is a string table offset
+ // and the second is the offset into the archive of the member that
+ // defines the symbol. After that, the next uint32_t is the byte count
+ // of the string table, followed by the string table itself.
+ const char *Buf = Parent->SymbolTable->getBuffer().begin();
+ uint32_t RanlibCount = 0;
+ RanlibCount = (*reinterpret_cast<const support::ulittle32_t *>(Buf)) /
+ (sizeof(uint32_t) * 2);
+ // If t.SymbolIndex + 1 is past the count of symbols (RanlibCount),
+ // don't change t.StringIndex, as we don't want to reference a ranlib
+ // struct past RanlibCount.
+ if (t.SymbolIndex + 1 < RanlibCount) {
+ const char *Ranlibs = Buf + 4;
+ uint32_t CurRanStrx = 0;
+ uint32_t NextRanStrx = 0;
+ CurRanStrx = *(reinterpret_cast<const support::ulittle32_t *>(Ranlibs) +
+ (t.SymbolIndex * 2));
+ NextRanStrx = *(reinterpret_cast<const support::ulittle32_t *>(Ranlibs) +
+ ((t.SymbolIndex + 1) * 2));
+ t.StringIndex -= CurRanStrx;
+ t.StringIndex += NextRanStrx;
+ }
+ } else {
+ // Go to one past next null.
+ t.StringIndex =
+ Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+ }
++t.SymbolIndex;
return t;
}
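
Written out, the BSD __.SYMDEF layout that getNext walks looks like this (field names follow <mach-o/ranlib.h>):

    // Sketch of the __.SYMDEF / "__.SYMDEF SORTED" member layout:
    //   uint32_t nranlib_bytes;          // byte size of the ranlib array
    //   struct ranlib {
    //     uint32_t ran_strx;             // offset into the string table
    //     uint32_t ran_off;              // archive offset of the member
    //   } ranlibs[nranlib_bytes / 8];
    //   uint32_t strtab_bytes;           // byte size of the string table
    //   char     strtab[strtab_bytes];   // the string table itself
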
@@ -404,7 +435,22 @@ Archive::symbol_iterator Archive::symbol_begin() const {
symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
} else if (kind() == K_BSD) {
- llvm_unreachable("BSD archive format is not supported");
+ // The __.SYMDEF or "__.SYMDEF SORTED" member starts with a uint32_t
+ // which is the number of bytes of ranlib structs that follow. The ranlib
+ // structs are pairs of uint32_t's: the first is a string table offset
+ // and the second is the offset into the archive of the member that
+ // defines the symbol. After that, the next uint32_t is the byte count
+ // of the string table, followed by the string table itself.
+ uint32_t ranlib_count = 0;
+ ranlib_count = (*reinterpret_cast<const support::ulittle32_t *>(buf)) /
+ (sizeof(uint32_t) * 2);
+ const char *ranlibs = buf + 4;
+ uint32_t ran_strx = 0;
+ ran_strx = *(reinterpret_cast<const support::ulittle32_t *>(ranlibs));
+ buf += sizeof(uint32_t) + (ranlib_count * (2 * (sizeof(uint32_t))));
+ // Skip the byte count of the string table.
+ buf += sizeof(uint32_t);
+ buf += ran_strx;
} else {
uint32_t member_count = 0;
uint32_t symbol_count = 0;
@@ -426,7 +472,8 @@ Archive::symbol_iterator Archive::symbol_end() const {
if (kind() == K_GNU) {
symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
} else if (kind() == K_BSD) {
- llvm_unreachable("BSD archive format is not supported");
+ symbol_count = (*reinterpret_cast<const support::ulittle32_t *>(buf)) /
+ (sizeof(uint32_t) * 2);
} else {
uint32_t member_count = 0;
member_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
@@ -440,16 +487,15 @@ Archive::symbol_iterator Archive::symbol_end() const {
Archive::child_iterator Archive::findSym(StringRef name) const {
Archive::symbol_iterator bs = symbol_begin();
Archive::symbol_iterator es = symbol_end();
- Archive::child_iterator result;
-
- StringRef symname;
+
for (; bs != es; ++bs) {
- if (bs->getName(symname))
- return child_end();
- if (symname == name) {
- if (bs->getMember(result))
+ StringRef SymName = bs->getName();
+ if (SymName == name) {
+ ErrorOr<Archive::child_iterator> ResultOrErr = bs->getMember();
+ // FIXME: Should we really eat the error?
+ if (ResultOrErr.getError())
return child_end();
- return result;
+ return ResultOrErr.get();
}
}
return child_end();
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index 63fd3ed..9f6a685 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -25,13 +25,10 @@
using namespace llvm;
using namespace object;
-Binary::~Binary() {
- if (BufferOwned)
- delete Data;
-}
+Binary::~Binary() {}
-Binary::Binary(unsigned int Type, MemoryBuffer *Source, bool BufferOwned)
- : TypeID(Type), BufferOwned(BufferOwned), Data(Source) {}
+Binary::Binary(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
+ : TypeID(Type), Data(std::move(Source)) {}
StringRef Binary::getData() const {
return Data->getBuffer();
@@ -41,14 +38,13 @@ StringRef Binary::getFileName() const {
return Data->getBufferIdentifier();
}
-ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
+ErrorOr<Binary *> object::createBinary(std::unique_ptr<MemoryBuffer> &Buffer,
LLVMContext *Context) {
- std::unique_ptr<MemoryBuffer> scopedSource(Source);
- sys::fs::file_magic Type = sys::fs::identify_magic(Source->getBuffer());
+ sys::fs::file_magic Type = sys::fs::identify_magic(Buffer->getBuffer());
switch (Type) {
case sys::fs::file_magic::archive:
- return Archive::create(scopedSource.release());
+ return Archive::create(std::move(Buffer));
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::elf_executable:
case sys::fs::file_magic::elf_shared_object:
@@ -67,10 +63,9 @@ ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
case sys::fs::file_magic::bitcode:
- return ObjectFile::createSymbolicFile(scopedSource.release(), true, Type,
- Context);
+ return ObjectFile::createSymbolicFile(Buffer, Type, Context);
case sys::fs::file_magic::macho_universal_binary:
- return MachOUniversalBinary::create(scopedSource.release());
+ return MachOUniversalBinary::create(std::move(Buffer));
case sys::fs::file_magic::unknown:
case sys::fs::file_magic::windows_resource:
// Unrecognized object file format.
@@ -80,8 +75,9 @@ ErrorOr<Binary *> object::createBinary(MemoryBuffer *Source,
}
ErrorOr<Binary *> object::createBinary(StringRef Path) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code EC = MemoryBuffer::getFileOrSTDIN(Path, File))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Path);
+ if (std::error_code EC = FileOrErr.getError())
return EC;
- return createBinary(File.release());
+ return createBinary(FileOrErr.get());
}
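
End to end, the updated factories are used like this (the path is illustrative; as in this patch, createBinary still hands back a raw Binary* that the caller owns):

    // Sketch: opening an archive/object with the updated factory.
    ErrorOr<Binary *> BinOrErr = object::createBinary("libfoo.a");
    if (std::error_code EC = BinOrErr.getError())
      report_fatal_error(EC.message());
    std::unique_ptr<Binary> Bin(BinOrErr.get());
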
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index cd8c9ef..5b08e42 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -12,7 +12,6 @@ add_llvm_library(LLVMObject
MachOUniversal.cpp
Object.cpp
ObjectFile.cpp
- StringTableBuilder.cpp
+ RecordStreamer.cpp
SymbolicFile.cpp
- YAML.cpp
)
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 262c040..46ef87d 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -31,8 +31,9 @@ using support::ulittle32_t;
using support::little16_t;
// Returns false and sets EC if Size is greater than the buffer size.
-static bool checkSize(const MemoryBuffer *M, error_code &EC, uint64_t Size) {
- if (M->getBufferSize() < Size) {
+static bool checkSize(const MemoryBuffer &M, std::error_code &EC,
+ uint64_t Size) {
+ if (M.getBufferSize() < Size) {
EC = object_error::unexpected_eof;
return false;
}
@@ -41,13 +42,13 @@ static bool checkSize(const MemoryBuffer *M, error_code &EC, uint64_t Size) {
// Sets Obj unless any bytes in [Addr, Addr + Size) fall outside of M.
// Returns unexpected_eof if error.
-template<typename T>
-static error_code getObject(const T *&Obj, const MemoryBuffer *M,
- const uint8_t *Ptr, const size_t Size = sizeof(T)) {
+template <typename T>
+static std::error_code getObject(const T *&Obj, const MemoryBuffer &M,
+ const uint8_t *Ptr,
+ const size_t Size = sizeof(T)) {
uintptr_t Addr = uintptr_t(Ptr);
- if (Addr + Size < Addr ||
- Addr + Size < Size ||
- Addr + Size > uintptr_t(M->getBufferEnd())) {
+ if (Addr + Size < Addr || Addr + Size < Size ||
+ Addr + Size > uintptr_t(M.getBufferEnd())) {
return object_error::unexpected_eof;
}
Obj = reinterpret_cast<const T *>(Addr);
@@ -129,17 +130,17 @@ void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const {
Ref.p = reinterpret_cast<uintptr_t>(Symb);
}
-error_code COFFObjectFile::getSymbolName(DataRefImpl Ref,
- StringRef &Result) const {
+std::error_code COFFObjectFile::getSymbolName(DataRefImpl Ref,
+ StringRef &Result) const {
const coff_symbol *Symb = toSymb(Ref);
return getSymbolName(Symb, Result);
}
-error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
- uint64_t &Result) const {
+std::error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
+ uint64_t &Result) const {
const coff_symbol *Symb = toSymb(Ref);
const coff_section *Section = nullptr;
- if (error_code EC = getSection(Symb->SectionNumber, Section))
+ if (std::error_code EC = getSection(Symb->SectionNumber, Section))
return EC;
if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
@@ -151,8 +152,8 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
return object_error::success;
}
-error_code COFFObjectFile::getSymbolType(DataRefImpl Ref,
- SymbolRef::Type &Result) const {
+std::error_code COFFObjectFile::getSymbolType(DataRefImpl Ref,
+ SymbolRef::Type &Result) const {
const coff_symbol *Symb = toSymb(Ref);
Result = SymbolRef::ST_Other;
if (Symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
@@ -164,7 +165,7 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Ref,
uint32_t Characteristics = 0;
if (!COFF::isReservedSectionNumber(Symb->SectionNumber)) {
const coff_section *Section = nullptr;
- if (error_code EC = getSection(Symb->SectionNumber, Section))
+ if (std::error_code EC = getSection(Symb->SectionNumber, Section))
return EC;
Characteristics = Section->Characteristics;
}
@@ -202,14 +203,14 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
return Result;
}
-error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref,
- uint64_t &Result) const {
+std::error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref,
+ uint64_t &Result) const {
// FIXME: Return the correct size. This requires looking at all the symbols
// in the same section as this symbol, and looking for either the next
// symbol, or the end of the section.
const coff_symbol *Symb = toSymb(Ref);
const coff_section *Section = nullptr;
- if (error_code EC = getSection(Symb->SectionNumber, Section))
+ if (std::error_code EC = getSection(Symb->SectionNumber, Section))
return EC;
if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
@@ -221,14 +222,16 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref,
return object_error::success;
}
-error_code COFFObjectFile::getSymbolSection(DataRefImpl Ref,
- section_iterator &Result) const {
+std::error_code
+COFFObjectFile::getSymbolSection(DataRefImpl Ref,
+ section_iterator &Result) const {
const coff_symbol *Symb = toSymb(Ref);
if (COFF::isReservedSectionNumber(Symb->SectionNumber)) {
Result = section_end();
} else {
const coff_section *Sec = nullptr;
- if (error_code EC = getSection(Symb->SectionNumber, Sec)) return EC;
+ if (std::error_code EC = getSection(Symb->SectionNumber, Sec))
+ return EC;
DataRefImpl Ref;
Ref.p = reinterpret_cast<uintptr_t>(Sec);
Result = section_iterator(SectionRef(Ref, this));
@@ -242,37 +245,37 @@ void COFFObjectFile::moveSectionNext(DataRefImpl &Ref) const {
Ref.p = reinterpret_cast<uintptr_t>(Sec);
}
-error_code COFFObjectFile::getSectionName(DataRefImpl Ref,
- StringRef &Result) const {
+std::error_code COFFObjectFile::getSectionName(DataRefImpl Ref,
+ StringRef &Result) const {
const coff_section *Sec = toSec(Ref);
return getSectionName(Sec, Result);
}
-error_code COFFObjectFile::getSectionAddress(DataRefImpl Ref,
- uint64_t &Result) const {
+std::error_code COFFObjectFile::getSectionAddress(DataRefImpl Ref,
+ uint64_t &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->VirtualAddress;
return object_error::success;
}
-error_code COFFObjectFile::getSectionSize(DataRefImpl Ref,
- uint64_t &Result) const {
+std::error_code COFFObjectFile::getSectionSize(DataRefImpl Ref,
+ uint64_t &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->SizeOfRawData;
return object_error::success;
}
-error_code COFFObjectFile::getSectionContents(DataRefImpl Ref,
- StringRef &Result) const {
+std::error_code COFFObjectFile::getSectionContents(DataRefImpl Ref,
+ StringRef &Result) const {
const coff_section *Sec = toSec(Ref);
ArrayRef<uint8_t> Res;
- error_code EC = getSectionContents(Sec, Res);
+ std::error_code EC = getSectionContents(Sec, Res);
Result = StringRef(reinterpret_cast<const char*>(Res.data()), Res.size());
return EC;
}
-error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref,
- uint64_t &Res) const {
+std::error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref,
+ uint64_t &Res) const {
const coff_section *Sec = toSec(Ref);
if (!Sec)
return object_error::parse_failed;
@@ -280,62 +283,64 @@ error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref,
return object_error::success;
}
-error_code COFFObjectFile::isSectionText(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionText(DataRefImpl Ref,
+ bool &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE;
return object_error::success;
}
-error_code COFFObjectFile::isSectionData(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionData(DataRefImpl Ref,
+ bool &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
return object_error::success;
}
-error_code COFFObjectFile::isSectionBSS(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionBSS(DataRefImpl Ref,
+ bool &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
return object_error::success;
}
-error_code COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref,
- bool &Result) const {
+std::error_code
+COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref,
+ bool &Result) const {
// FIXME: Unimplemented
Result = true;
return object_error::success;
}
-error_code COFFObjectFile::isSectionVirtual(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionVirtual(DataRefImpl Ref,
+ bool &Result) const {
const coff_section *Sec = toSec(Ref);
Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
return object_error::success;
}
-error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Ref,
+ bool &Result) const {
// FIXME: Unimplemented.
Result = false;
return object_error::success;
}
-error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref,
- bool &Result) const {
+std::error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref,
+ bool &Result) const {
// FIXME: Unimplemented.
Result = false;
return object_error::success;
}
-error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef,
- DataRefImpl SymbRef,
- bool &Result) const {
+std::error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef,
+ DataRefImpl SymbRef,
+ bool &Result) const {
const coff_section *Sec = toSec(SecRef);
const coff_symbol *Symb = toSymb(SymbRef);
const coff_section *SymbSec = nullptr;
- if (error_code EC = getSection(Symb->SectionNumber, SymbSec)) return EC;
+ if (std::error_code EC = getSection(Symb->SectionNumber, SymbSec))
+ return EC;
if (SymbSec == Sec)
Result = true;
else
@@ -390,9 +395,9 @@ relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const {
}
// Initialize the pointer to the symbol table.
-error_code COFFObjectFile::initSymbolTablePtr() {
- if (error_code EC = getObject(
- SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable,
+std::error_code COFFObjectFile::initSymbolTablePtr() {
+ if (std::error_code EC = getObject(
+ SymbolTable, *Data, base() + COFFHeader->PointerToSymbolTable,
COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
return EC;
@@ -403,11 +408,12 @@ error_code COFFObjectFile::initSymbolTablePtr() {
base() + COFFHeader->PointerToSymbolTable +
COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
const ulittle32_t *StringTableSizePtr;
- if (error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr))
+ if (std::error_code EC =
+ getObject(StringTableSizePtr, *Data, StringTableAddr))
return EC;
StringTableSize = *StringTableSizePtr;
- if (error_code EC =
- getObject(StringTable, Data, StringTableAddr, StringTableSize))
+ if (std::error_code EC =
+ getObject(StringTable, *Data, StringTableAddr, StringTableSize))
return EC;
// Treat table sizes < 4 as empty because contrary to the PECOFF spec, some
@@ -422,7 +428,7 @@ error_code COFFObjectFile::initSymbolTablePtr() {
}
// Returns the file offset for the given VA.
-error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
+std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
uint64_t ImageBase = PE32Header ? (uint64_t)PE32Header->ImageBase
: (uint64_t)PE32PlusHeader->ImageBase;
uint64_t Rva = Addr - ImageBase;
@@ -431,7 +437,7 @@ error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
}
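
getVaPtr() only rebases the virtual address against the image base from the PE32 or PE32+ optional header; the real work happens in getRvaPtr(), converted next, which walks the section table looking for the section whose virtual range contains the RVA. A self-contained sketch of that mapping for a single section, with field names following coff_section:

    #include <cstdint>

    // Map an RVA to a file offset through one section's header fields,
    // as getRvaPtr() does across the whole section table.
    static bool rvaToFileOffset(uint32_t Rva, uint32_t VirtualAddress,
                                uint32_t VirtualSize,
                                uint32_t PointerToRawData,
                                uint32_t &FileOff) {
      if (Rva < VirtualAddress || Rva >= VirtualAddress + VirtualSize)
        return false; // RVA is not inside this section
      FileOff = PointerToRawData + (Rva - VirtualAddress);
      return true;
    }
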
// Returns the file offset for the given RVA.
-error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
+std::error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
for (const SectionRef &S : sections()) {
const coff_section *Section = getCOFFSection(S);
uint32_t SectionStart = Section->VirtualAddress;
@@ -447,10 +453,10 @@ error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
// table entry.
-error_code COFFObjectFile::
-getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const {
+std::error_code COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint,
+ StringRef &Name) const {
uintptr_t IntPtr = 0;
- if (error_code EC = getRvaPtr(Rva, IntPtr))
+ if (std::error_code EC = getRvaPtr(Rva, IntPtr))
return EC;
const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(IntPtr);
Hint = *reinterpret_cast<const ulittle16_t *>(Ptr);
@@ -459,7 +465,7 @@ getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const {
}
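
The two reads above decode the PE/COFF Hint/Name table layout: a two-byte, little-endian hint immediately followed by a NUL-terminated ASCII name. A standalone sketch with a hypothetical helper; it assumes a little-endian host, where the code above uses support::ulittle16_t to stay portable:

    #include <cstdint>
    #include <cstring>

    // Decode one Hint/Name table entry: uint16 hint, then a C string.
    static void decodeHintName(const uint8_t *Entry, uint16_t &Hint,
                               const char *&Name) {
      std::memcpy(&Hint, Entry, sizeof(Hint)); // little-endian host assumed
      Name = reinterpret_cast<const char *>(Entry + 2);
    }
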
// Find the import table.
-error_code COFFObjectFile::initImportTablePtr() {
+std::error_code COFFObjectFile::initImportTablePtr() {
// First, we get the RVA of the import table. If the file lacks a pointer to
// the import table, do nothing.
const data_directory *DataEntry;
@@ -477,7 +483,7 @@ error_code COFFObjectFile::initImportTablePtr() {
// Find the section that contains the RVA. This is needed because the RVA is
// the import table's memory address which is different from its file offset.
uintptr_t IntPtr = 0;
- if (error_code EC = getRvaPtr(ImportTableRva, IntPtr))
+ if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr))
return EC;
ImportDirectory = reinterpret_cast<
const import_directory_table_entry *>(IntPtr);
@@ -485,7 +491,7 @@ error_code COFFObjectFile::initImportTablePtr() {
}
// Find the export table.
-error_code COFFObjectFile::initExportTablePtr() {
+std::error_code COFFObjectFile::initExportTablePtr() {
// First, we get the RVA of the export table. If the file lacks a pointer to
// the export table, do nothing.
const data_directory *DataEntry;
@@ -498,22 +504,23 @@ error_code COFFObjectFile::initExportTablePtr() {
uint32_t ExportTableRva = DataEntry->RelativeVirtualAddress;
uintptr_t IntPtr = 0;
- if (error_code EC = getRvaPtr(ExportTableRva, IntPtr))
+ if (std::error_code EC = getRvaPtr(ExportTableRva, IntPtr))
return EC;
ExportDirectory =
reinterpret_cast<const export_directory_table_entry *>(IntPtr);
return object_error::success;
}
-COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC,
- bool BufferOwned)
- : ObjectFile(Binary::ID_COFF, Object, BufferOwned), COFFHeader(nullptr),
+COFFObjectFile::COFFObjectFile(std::unique_ptr<MemoryBuffer> Object,
+ std::error_code &EC)
+ : ObjectFile(Binary::ID_COFF, std::move(Object)), COFFHeader(nullptr),
PE32Header(nullptr), PE32PlusHeader(nullptr), DataDirectory(nullptr),
SectionTable(nullptr), SymbolTable(nullptr), StringTable(nullptr),
StringTableSize(0), ImportDirectory(nullptr), NumberOfImportDirectory(0),
ExportDirectory(nullptr) {
// Check that we at least have enough room for a header.
- if (!checkSize(Data, EC, sizeof(coff_file_header))) return;
+ if (!checkSize(*Data, EC, sizeof(coff_file_header)))
+ return;
// The current location in the file where we are looking at.
uint64_t CurPtr = 0;
@@ -526,7 +533,8 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC,
if (base()[0] == 0x4d && base()[1] == 0x5a) {
// PE/COFF, seek through MS-DOS compatibility stub and 4-byte
// PE signature to find 'normal' COFF header.
- if (!checkSize(Data, EC, 0x3c + 8)) return;
+ if (!checkSize(*Data, EC, 0x3c + 8))
+ return;
CurPtr = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
// Check the PE magic bytes. ("PE\0\0")
if (std::memcmp(base() + CurPtr, "PE\0\0", 4) != 0) {
@@ -537,13 +545,13 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC,
HasPEHeader = true;
}
- if ((EC = getObject(COFFHeader, Data, base() + CurPtr)))
+ if ((EC = getObject(COFFHeader, *Data, base() + CurPtr)))
return;
CurPtr += sizeof(coff_file_header);
if (HasPEHeader) {
const pe32_header *Header;
- if ((EC = getObject(Header, Data, base() + CurPtr)))
+ if ((EC = getObject(Header, *Data, base() + CurPtr)))
return;
const uint8_t *DataDirAddr;
@@ -561,7 +569,7 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC,
EC = object_error::parse_failed;
return;
}
- if ((EC = getObject(DataDirectory, Data, DataDirAddr, DataDirSize)))
+ if ((EC = getObject(DataDirectory, *Data, DataDirAddr, DataDirSize)))
return;
CurPtr += COFFHeader->SizeOfOptionalHeader;
}
@@ -569,7 +577,7 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC,
if (COFFHeader->isImportLibrary())
return;
- if ((EC = getObject(SectionTable, Data, base() + CurPtr,
+ if ((EC = getObject(SectionTable, *Data, base() + CurPtr,
COFFHeader->NumberOfSections * sizeof(coff_section))))
return;
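
Taken together, these constructor hunks probe the file in a fixed order: the 'MZ' MS-DOS magic, the PE header offset stored at 0x3c, the 4-byte "PE\0\0" signature, then the COFF file header, optional header, data directories, and section table. A condensed sketch of the signature probe (findCOFFHeader is a hypothetical helper; the real constructor bounds-checks every read with checkSize()):

    #include "llvm/Support/Endian.h"
    #include <cstdint>
    #include <cstring>

    // Locate the coff_file_header, skipping an MS-DOS stub if present.
    static bool findCOFFHeader(const uint8_t *Base, uint64_t &CurPtr) {
      CurPtr = 0;
      if (Base[0] == 0x4d && Base[1] == 0x5a) { // 'M','Z'
        CurPtr = *reinterpret_cast<const llvm::support::ulittle16_t *>(
            Base + 0x3c);
        if (std::memcmp(Base + CurPtr, "PE\0\0", 4) != 0)
          return false; // bad PE signature
        CurPtr += 4;    // skip "PE\0\0"
      }
      return true; // CurPtr now points at the coff_file_header
    }
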
@@ -686,28 +694,30 @@ unsigned COFFObjectFile::getArch() const {
// This method is kept here because lld uses this. As soon as we make
// lld use getCOFFHeader, this method will be removed.
-error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const {
+std::error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const {
return getCOFFHeader(Res);
}
-error_code COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const {
+std::error_code
+COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const {
Res = COFFHeader;
return object_error::success;
}
-error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const {
+std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const {
Res = PE32Header;
return object_error::success;
}
-error_code
+std::error_code
COFFObjectFile::getPE32PlusHeader(const pe32plus_header *&Res) const {
Res = PE32PlusHeader;
return object_error::success;
}
-error_code COFFObjectFile::getDataDirectory(uint32_t Index,
- const data_directory *&Res) const {
+std::error_code
+COFFObjectFile::getDataDirectory(uint32_t Index,
+ const data_directory *&Res) const {
  // Error if there's no data directory or the index is out of range.
if (!DataDirectory)
return object_error::parse_failed;
@@ -720,8 +730,8 @@ error_code COFFObjectFile::getDataDirectory(uint32_t Index,
return object_error::success;
}
-error_code COFFObjectFile::getSection(int32_t Index,
- const coff_section *&Result) const {
+std::error_code COFFObjectFile::getSection(int32_t Index,
+ const coff_section *&Result) const {
// Check for special index values.
if (COFF::isReservedSectionNumber(Index))
Result = nullptr;
@@ -733,8 +743,8 @@ error_code COFFObjectFile::getSection(int32_t Index,
return object_error::success;
}
-error_code COFFObjectFile::getString(uint32_t Offset,
- StringRef &Result) const {
+std::error_code COFFObjectFile::getString(uint32_t Offset,
+ StringRef &Result) const {
if (StringTableSize <= 4)
// Tried to get a string from an empty string table.
return object_error::parse_failed;
@@ -744,8 +754,8 @@ error_code COFFObjectFile::getString(uint32_t Offset,
return object_error::success;
}
-error_code COFFObjectFile::getSymbol(uint32_t Index,
- const coff_symbol *&Result) const {
+std::error_code COFFObjectFile::getSymbol(uint32_t Index,
+ const coff_symbol *&Result) const {
if (Index < COFFHeader->NumberOfSymbols)
Result = SymbolTable + Index;
else
@@ -753,12 +763,12 @@ error_code COFFObjectFile::getSymbol(uint32_t Index,
return object_error::success;
}
-error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol,
- StringRef &Res) const {
+std::error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol,
+ StringRef &Res) const {
// Check for string table entry. First 4 bytes are 0.
if (Symbol->Name.Offset.Zeroes == 0) {
uint32_t Offset = Symbol->Name.Offset.Offset;
- if (error_code EC = getString(Offset, Res))
+ if (std::error_code EC = getString(Offset, Res))
return EC;
return object_error::success;
}
@@ -795,8 +805,8 @@ ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
Symbol->NumberOfAuxSymbols * sizeof(coff_symbol));
}
-error_code COFFObjectFile::getSectionName(const coff_section *Sec,
- StringRef &Res) const {
+std::error_code COFFObjectFile::getSectionName(const coff_section *Sec,
+ StringRef &Res) const {
StringRef Name;
if (Sec->Name[7] == 0)
// Null terminated, let ::strlen figure out the length.
@@ -815,7 +825,7 @@ error_code COFFObjectFile::getSectionName(const coff_section *Sec,
if (Name.substr(1).getAsInteger(10, Offset))
return object_error::parse_failed;
}
- if (error_code EC = getString(Offset, Name))
+ if (std::error_code EC = getString(Offset, Name))
return EC;
}
@@ -823,8 +833,9 @@ error_code COFFObjectFile::getSectionName(const coff_section *Sec,
return object_error::success;
}
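
A COFF section name longer than eight bytes is not stored inline; the 8-byte field instead holds '/' followed by the decimal offset of the real name in the string table (for example "/42"), which is what the substr(1).getAsInteger(10, ...) call above parses. The same decoding in isolation, as a hypothetical helper:

    #include "llvm/ADT/StringRef.h"
    #include <cstdint>

    // Decode a "/<decimal offset>" long-section-name reference.
    static bool parseLongNameOffset(llvm::StringRef Field, uint32_t &Offset) {
      if (!Field.startswith("/"))
        return false;
      // getAsInteger() returns true on failure, hence the negation.
      return !Field.substr(1).getAsInteger(10, Offset);
    }
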
-error_code COFFObjectFile::getSectionContents(const coff_section *Sec,
- ArrayRef<uint8_t> &Res) const {
+std::error_code
+COFFObjectFile::getSectionContents(const coff_section *Sec,
+ ArrayRef<uint8_t> &Res) const {
  // The only thing that we need to verify is that the contents are contained
  // within the file bounds. We don't need to check that they don't overlap
  // other data, since nothing in the format forbids that.
@@ -846,13 +857,13 @@ void COFFObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
reinterpret_cast<const coff_relocation*>(Rel.p) + 1);
}
-error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel,
- uint64_t &Res) const {
+std::error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const {
report_fatal_error("getRelocationAddress not implemented in COFFObjectFile");
}
-error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel,
- uint64_t &Res) const {
+std::error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const {
Res = toRel(Rel)->VirtualAddress;
return object_error::success;
}
@@ -864,8 +875,8 @@ symbol_iterator COFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
return symbol_iterator(SymbolRef(Ref, this));
}
-error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
- uint64_t &Res) const {
+std::error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const {
const coff_relocation* R = toRel(Rel);
Res = R->Type;
return object_error::success;
@@ -891,8 +902,9 @@ COFFObjectFile::getCOFFRelocation(const RelocationRef &Reloc) const {
Res = #reloc_type; \
break;
-error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const {
+std::error_code
+COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
const coff_relocation *Reloc = toRel(Rel);
StringRef Res;
switch (COFFHeader->Machine) {
@@ -966,26 +978,29 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
#undef LLVM_COFF_SWITCH_RELOC_TYPE_NAME
-error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
- SmallVectorImpl<char> &Result) const {
+std::error_code
+COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
const coff_relocation *Reloc = toRel(Rel);
const coff_symbol *Symb = nullptr;
- if (error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb)) return EC;
+ if (std::error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb))
+ return EC;
DataRefImpl Sym;
Sym.p = reinterpret_cast<uintptr_t>(Symb);
StringRef SymName;
- if (error_code EC = getSymbolName(Sym, SymName)) return EC;
+ if (std::error_code EC = getSymbolName(Sym, SymName))
+ return EC;
Result.append(SymName.begin(), SymName.end());
return object_error::success;
}
-error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData,
- LibraryRef &Result) const {
+std::error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData,
+ LibraryRef &Result) const {
report_fatal_error("getLibraryNext not implemented in COFFObjectFile");
}
-error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
- StringRef &Result) const {
+std::error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
+ StringRef &Result) const {
report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
}
@@ -998,24 +1013,25 @@ void ImportDirectoryEntryRef::moveNext() {
++Index;
}
-error_code ImportDirectoryEntryRef::
-getImportTableEntry(const import_directory_table_entry *&Result) const {
+std::error_code ImportDirectoryEntryRef::getImportTableEntry(
+ const import_directory_table_entry *&Result) const {
Result = ImportTable;
return object_error::success;
}
-error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
+std::error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (error_code EC = OwningObject->getRvaPtr(ImportTable->NameRVA, IntPtr))
+ if (std::error_code EC =
+ OwningObject->getRvaPtr(ImportTable->NameRVA, IntPtr))
return EC;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
return object_error::success;
}
-error_code ImportDirectoryEntryRef::getImportLookupEntry(
+std::error_code ImportDirectoryEntryRef::getImportLookupEntry(
const import_lookup_table_entry32 *&Result) const {
uintptr_t IntPtr = 0;
- if (error_code EC =
+ if (std::error_code EC =
OwningObject->getRvaPtr(ImportTable->ImportLookupTableRVA, IntPtr))
return EC;
Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
@@ -1033,31 +1049,33 @@ void ExportDirectoryEntryRef::moveNext() {
// Returns the name of the DLL that this export directory refers to.
-error_code ExportDirectoryEntryRef::getDllName(StringRef &Result) const {
+std::error_code ExportDirectoryEntryRef::getDllName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (error_code EC = OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr))
+ if (std::error_code EC =
+ OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr))
return EC;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
return object_error::success;
}
// Returns the starting ordinal number.
-error_code ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const {
+std::error_code
+ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const {
Result = ExportTable->OrdinalBase;
return object_error::success;
}
// Returns the export ordinal of the current export symbol.
-error_code ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const {
+std::error_code ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const {
Result = ExportTable->OrdinalBase + Index;
return object_error::success;
}
// Returns the address of the current export symbol.
-error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const {
+std::error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const {
uintptr_t IntPtr = 0;
- if (error_code EC = OwningObject->getRvaPtr(
- ExportTable->ExportAddressTableRVA, IntPtr))
+ if (std::error_code EC =
+ OwningObject->getRvaPtr(ExportTable->ExportAddressTableRVA, IntPtr))
return EC;
const export_address_table_entry *entry =
reinterpret_cast<const export_address_table_entry *>(IntPtr);
@@ -1067,10 +1085,11 @@ error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const {
// Returns the name of the current export symbol. If the symbol is exported only
// by ordinal, the empty string is set as a result.
-error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
+std::error_code
+ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (error_code EC = OwningObject->getRvaPtr(
- ExportTable->OrdinalTableRVA, IntPtr))
+ if (std::error_code EC =
+ OwningObject->getRvaPtr(ExportTable->OrdinalTableRVA, IntPtr))
return EC;
const ulittle16_t *Start = reinterpret_cast<const ulittle16_t *>(IntPtr);
@@ -1080,11 +1099,11 @@ error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
I < E; ++I, ++Offset) {
if (*I != Index)
continue;
- if (error_code EC = OwningObject->getRvaPtr(
- ExportTable->NamePointerRVA, IntPtr))
+ if (std::error_code EC =
+ OwningObject->getRvaPtr(ExportTable->NamePointerRVA, IntPtr))
return EC;
const ulittle32_t *NamePtr = reinterpret_cast<const ulittle32_t *>(IntPtr);
- if (error_code EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr))
+ if (std::error_code EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr))
return EC;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
return object_error::success;
@@ -1093,11 +1112,11 @@ error_code ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
return object_error::success;
}
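
The loop above implements the standard PE export-name lookup: the ordinal table, name-pointer table, and export address table are parallel arrays, so finding the slot whose ordinal-table entry equals Index yields the matching name pointer. The same scan over already-mapped flat arrays (a hypothetical view; the code above chases RVAs through getRvaPtr() at each step):

    #include "llvm/ADT/StringRef.h"
    #include <cstdint>

    // Find the name exported under a given unbiased ordinal Index.
    static llvm::StringRef lookupExportName(uint32_t Index,
                                            const uint16_t *OrdinalTable,
                                            const char *const *NameTable,
                                            uint32_t NumNames) {
      for (uint32_t I = 0; I != NumNames; ++I)
        if (OrdinalTable[I] == Index)
          return NameTable[I]; // parallel name-pointer slot
      return llvm::StringRef(); // exported by ordinal only
    }
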
-ErrorOr<ObjectFile *> ObjectFile::createCOFFObjectFile(MemoryBuffer *Object,
- bool BufferOwned) {
- error_code EC;
+ErrorOr<ObjectFile *>
+ObjectFile::createCOFFObjectFile(std::unique_ptr<MemoryBuffer> Object) {
+ std::error_code EC;
std::unique_ptr<COFFObjectFile> Ret(
- new COFFObjectFile(Object, EC, BufferOwned));
+ new COFFObjectFile(std::move(Object), EC));
if (EC)
return EC;
return Ret.release();
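
With the BufferOwned flag gone, buffer ownership travels by value: the caller moves its std::unique_ptr<MemoryBuffer> in, and ErrorOr carries either the new object or the std::error_code out. A caller-side sketch, assuming the ErrorOr-returning MemoryBuffer::getFile of the same era (openCOFF is hypothetical):

    #include "llvm/Object/ObjectFile.h"
    #include "llvm/Support/MemoryBuffer.h"
    using namespace llvm;
    using namespace object;

    static ErrorOr<ObjectFile *> openCOFF(StringRef Path) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
          MemoryBuffer::getFile(Path);
      if (std::error_code EC = BufOrErr.getError())
        return EC;
      // Ownership of the buffer moves into the object file.
      return ObjectFile::createCOFFObjectFile(std::move(BufOrErr.get()));
    }
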
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index a2c4df2..4f0f60b 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -17,65 +17,66 @@
namespace llvm {
using namespace object;
-ErrorOr<ObjectFile *> ObjectFile::createELFObjectFile(MemoryBuffer *Obj,
- bool BufferOwned) {
- std::pair<unsigned char, unsigned char> Ident = getElfArchType(Obj);
+ErrorOr<ObjectFile *>
+ObjectFile::createELFObjectFile(std::unique_ptr<MemoryBuffer> &Obj) {
+ std::pair<unsigned char, unsigned char> Ident =
+ getElfArchType(Obj->getBuffer());
std::size_t MaxAlignment =
1ULL << countTrailingZeros(uintptr_t(Obj->getBufferStart()));
- error_code EC;
+ std::error_code EC;
std::unique_ptr<ObjectFile> R;
if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB)
#if !LLVM_IS_UNALIGNED_ACCESS_FAST
if (MaxAlignment >= 4)
- R.reset(new ELFObjectFile<ELFType<support::little, 4, false> >(
- Obj, EC, BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::little, 4, false>>(
+ std::move(Obj), EC));
else
#endif
if (MaxAlignment >= 2)
- R.reset(new ELFObjectFile<ELFType<support::little, 2, false> >(
- Obj, EC, BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::little, 2, false>>(
+ std::move(Obj), EC));
else
- llvm_unreachable("Invalid alignment for ELF file!");
+ return object_error::parse_failed;
else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB)
#if !LLVM_IS_UNALIGNED_ACCESS_FAST
if (MaxAlignment >= 4)
- R.reset(new ELFObjectFile<ELFType<support::big, 4, false> >(Obj, EC,
- BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::big, 4, false>>(std::move(Obj),
+ EC));
else
#endif
if (MaxAlignment >= 2)
- R.reset(new ELFObjectFile<ELFType<support::big, 2, false> >(Obj, EC,
- BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::big, 2, false>>(std::move(Obj),
+ EC));
else
- llvm_unreachable("Invalid alignment for ELF file!");
+ return object_error::parse_failed;
else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB)
#if !LLVM_IS_UNALIGNED_ACCESS_FAST
if (MaxAlignment >= 8)
- R.reset(new ELFObjectFile<ELFType<support::big, 8, true> >(Obj, EC,
- BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::big, 8, true>>(std::move(Obj),
+ EC));
else
#endif
if (MaxAlignment >= 2)
- R.reset(new ELFObjectFile<ELFType<support::big, 2, true> >(Obj, EC,
- BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::big, 2, true>>(std::move(Obj),
+ EC));
else
- llvm_unreachable("Invalid alignment for ELF file!");
+ return object_error::parse_failed;
else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) {
#if !LLVM_IS_UNALIGNED_ACCESS_FAST
if (MaxAlignment >= 8)
- R.reset(new ELFObjectFile<ELFType<support::little, 8, true> >(
- Obj, EC, BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::little, 8, true>>(
+ std::move(Obj), EC));
else
#endif
if (MaxAlignment >= 2)
- R.reset(new ELFObjectFile<ELFType<support::little, 2, true> >(
- Obj, EC, BufferOwned));
+ R.reset(new ELFObjectFile<ELFType<support::little, 2, true>>(
+ std::move(Obj), EC));
else
- llvm_unreachable("Invalid alignment for ELF file!");
+ return object_error::parse_failed;
}
else
- report_fatal_error("Buffer is not an ELF object file!");
+ llvm_unreachable("Buffer is not an ELF object file!");
if (EC)
return EC;
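
Two things drive this dispatch: the ELFCLASS/ELFDATA identification bytes select the ELFType template parameters, and MaxAlignment, the largest power of two dividing the buffer's start address, selects a safe instantiation on hosts without fast unaligned access, now failing with parse_failed instead of asserting. The alignment probe in isolation (P is assumed non-null):

    #include "llvm/Support/MathExtras.h"
    #include <cstddef>
    #include <cstdint>

    // Largest power-of-two alignment of a (non-null) pointer.
    static std::size_t maxAlignment(const void *P) {
      return std::size_t(1) << llvm::countTrailingZeros(uintptr_t(P));
    }
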
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index 7d50f23..dc3d467 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -368,6 +368,16 @@ void ScalarEnumerationTraits<ELFYAML::ELF_STT>::enumeration(
#undef ECase
}
+void ScalarEnumerationTraits<ELFYAML::ELF_STV>::enumeration(
+ IO &IO, ELFYAML::ELF_STV &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+ ECase(STV_DEFAULT)
+ ECase(STV_INTERNAL)
+ ECase(STV_HIDDEN)
+ ECase(STV_PROTECTED)
+#undef ECase
+}
+
void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
IO &IO, ELFYAML::ELF_REL &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
@@ -649,6 +659,7 @@ void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) {
IO.mapOptional("Section", Symbol.Section, StringRef());
IO.mapOptional("Value", Symbol.Value, Hex64(0));
IO.mapOptional("Size", Symbol.Size, Hex64(0));
+ IO.mapOptional("Visibility", Symbol.Visibility, ELFYAML::ELF_STV(0));
}
void MappingTraits<ELFYAML::LocalGlobalWeakSymbols>::mapping(
@@ -664,7 +675,6 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) {
IO.mapOptional("Flags", Section.Flags, ELFYAML::ELF_SHF(0));
IO.mapOptional("Address", Section.Address, Hex64(0));
IO.mapOptional("Link", Section.Link, StringRef());
- IO.mapOptional("Info", Section.Info, StringRef());
IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0));
}
@@ -676,6 +686,7 @@ static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) {
static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) {
commonSectionMapping(IO, Section);
+ IO.mapOptional("Info", Section.Info, StringRef());
IO.mapOptional("Relocations", Section.Relocations);
}
diff --git a/lib/Object/Error.cpp b/lib/Object/Error.cpp
index 8e50869..9d25269 100644
--- a/lib/Object/Error.cpp
+++ b/lib/Object/Error.cpp
@@ -18,11 +18,10 @@ using namespace llvm;
using namespace object;
namespace {
-class _object_error_category : public error_category {
+class _object_error_category : public std::error_category {
public:
- const char* name() const override;
+ const char* name() const LLVM_NOEXCEPT override;
std::string message(int ev) const override;
- error_condition default_error_condition(int ev) const override;
};
}
@@ -30,8 +29,8 @@ const char *_object_error_category::name() const {
return "llvm.object";
}
-std::string _object_error_category::message(int ev) const {
- object_error::Impl E = static_cast<object_error::Impl>(ev);
+std::string _object_error_category::message(int EV) const {
+ object_error E = static_cast<object_error>(EV);
switch (E) {
case object_error::success: return "Success";
case object_error::arch_not_found:
@@ -47,13 +46,7 @@ std::string _object_error_category::message(int ev) const {
"defined.");
}
-error_condition _object_error_category::default_error_condition(int ev) const {
- if (ev == object_error::success)
- return errc::success;
- return errc::invalid_argument;
-}
-
-const error_category &object::object_category() {
+const std::error_category &object::object_category() {
static _object_error_category o;
return o;
}
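
This is the stock <system_error> pattern: a singleton class derived from std::error_category supplies name() and message(), and an enum paired with a make_error_code() overload turns its values into std::error_code. A freestanding sketch with a hypothetical my_error enum (not the LLVM one; plain noexcept stands in for the LLVM_NOEXCEPT macro used above):

    #include <string>
    #include <system_error>

    enum class my_error { success = 0, parse_failed };

    namespace {
    class my_error_category : public std::error_category {
    public:
      const char *name() const noexcept override { return "example.object"; }
      std::string message(int EV) const override {
        switch (static_cast<my_error>(EV)) {
        case my_error::success:      return "Success";
        case my_error::parse_failed: return "Invalid data was encountered";
        }
        return "Unknown error";
      }
    };
    }

    // Singleton accessor, mirroring object_category() above.
    static const std::error_category &my_category() {
      static my_error_category C;
      return C;
    }

    std::error_code make_error_code(my_error E) {
      return std::error_code(static_cast<int>(E), my_category());
    }
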
diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp
index a8aba26..5323d92 100644
--- a/lib/Object/IRObjectFile.cpp
+++ b/lib/Object/IRObjectFile.cpp
@@ -11,34 +11,119 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Object/IRObjectFile.h"
+#include "RecordStreamer.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace object;
-IRObjectFile::IRObjectFile(MemoryBuffer *Object, error_code &EC,
- LLVMContext &Context, bool BufferOwned)
- : SymbolicFile(Binary::ID_IR, Object, BufferOwned) {
- ErrorOr<Module*> MOrErr = parseBitcodeFile(Object, Context);
- if ((EC = MOrErr.getError()))
- return;
-
- M.reset(MOrErr.get());
-
+IRObjectFile::IRObjectFile(std::unique_ptr<MemoryBuffer> Object,
+ std::unique_ptr<Module> Mod)
+ : SymbolicFile(Binary::ID_IR, std::move(Object)), M(std::move(Mod)) {
// If we have a DataLayout, setup a mangler.
const DataLayout *DL = M->getDataLayout();
if (!DL)
return;
Mang.reset(new Mangler(DL));
+
+ const std::string &InlineAsm = M->getModuleInlineAsm();
+ if (InlineAsm.empty())
+ return;
+
+ StringRef Triple = M->getTargetTriple();
+ std::string Err;
+ const Target *T = TargetRegistry::lookupTarget(Triple, Err);
+ if (!T)
+ return;
+
+ std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(Triple));
+ if (!MRI)
+ return;
+
+ std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, Triple));
+ if (!MAI)
+ return;
+
+ std::unique_ptr<MCSubtargetInfo> STI(
+ T->createMCSubtargetInfo(Triple, "", ""));
+ if (!STI)
+ return;
+
+ std::unique_ptr<MCInstrInfo> MCII(T->createMCInstrInfo());
+ if (!MCII)
+ return;
+
+ MCObjectFileInfo MOFI;
+ MCContext MCCtx(MAI.get(), MRI.get(), &MOFI);
+ MOFI.InitMCObjectFileInfo(Triple, Reloc::Default, CodeModel::Default, MCCtx);
+ std::unique_ptr<RecordStreamer> Streamer(new RecordStreamer(MCCtx));
+
+ std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer(InlineAsm));
+ SourceMgr SrcMgr;
+ SrcMgr.AddNewSourceBuffer(Buffer.release(), SMLoc());
+ std::unique_ptr<MCAsmParser> Parser(
+ createMCAsmParser(SrcMgr, MCCtx, *Streamer, *MAI));
+
+ MCTargetOptions MCOptions;
+ std::unique_ptr<MCTargetAsmParser> TAP(
+ T->createMCAsmParser(*STI, *Parser, *MCII, MCOptions));
+ if (!TAP)
+ return;
+
+ Parser->setTargetParser(*TAP);
+ if (Parser->Run(false))
+ return;
+
+ for (auto &KV : *Streamer) {
+ StringRef Key = KV.first();
+ RecordStreamer::State Value = KV.second;
+ uint32_t Res = BasicSymbolRef::SF_None;
+ switch (Value) {
+ case RecordStreamer::NeverSeen:
+      llvm_unreachable("NeverSeen symbol should not be in the streamer map");
+ case RecordStreamer::DefinedGlobal:
+ Res |= BasicSymbolRef::SF_Global;
+ break;
+ case RecordStreamer::Defined:
+ break;
+ case RecordStreamer::Global:
+ case RecordStreamer::Used:
+ Res |= BasicSymbolRef::SF_Undefined;
+ Res |= BasicSymbolRef::SF_Global;
+ break;
+ }
+ AsmSymbols.push_back(
+ std::make_pair<std::string, uint32_t>(Key, std::move(Res)));
+ }
}
-static const GlobalValue &getGV(DataRefImpl &Symb) {
- return *reinterpret_cast<GlobalValue*>(Symb.p & ~uintptr_t(3));
+IRObjectFile::~IRObjectFile() {
+ GVMaterializer *GVM = M->getMaterializer();
+ if (GVM)
+ GVM->releaseBuffer();
+}
+
+static const GlobalValue *getGV(DataRefImpl &Symb) {
+ if ((Symb.p & 3) == 3)
+ return nullptr;
+
+ return reinterpret_cast<GlobalValue*>(Symb.p & ~uintptr_t(3));
}
static uintptr_t skipEmpty(Module::const_alias_iterator I, const Module &M) {
@@ -62,68 +147,109 @@ static uintptr_t skipEmpty(Module::const_iterator I, const Module &M) {
return reinterpret_cast<uintptr_t>(GV) | 0;
}
+static unsigned getAsmSymIndex(DataRefImpl Symb) {
+ assert((Symb.p & uintptr_t(3)) == 3);
+ uintptr_t Index = Symb.p & ~uintptr_t(3);
+ Index >>= 2;
+ return Index;
+}
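
IRObjectFile packs the symbol kind into the low two bits of DataRefImpl::p (0 for a function, 1 for a global variable, 2 for an alias, 3 for an inline-asm symbol whose index occupies the remaining bits), which is why getGV() masks with ~uintptr_t(3) and getAsmSymIndex() clears the tag and shifts right by two. A self-contained mirror of the scheme, with hypothetical helper names:

    #include <cassert>
    #include <cstdint>

    static uintptr_t tagGlobalValue(const void *GV, unsigned Kind /*0..2*/) {
      assert(Kind < 3 && (reinterpret_cast<uintptr_t>(GV) & 3) == 0);
      return reinterpret_cast<uintptr_t>(GV) | Kind;
    }
    static uintptr_t tagAsmSymbol(unsigned Index) {
      return (uintptr_t(Index) << 2) | 3; // kind 3 = inline-asm symbol
    }
    static unsigned untagAsmSymbol(uintptr_t P) {
      assert((P & 3) == 3);
      return unsigned((P & ~uintptr_t(3)) >> 2);
    }
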
+
void IRObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
- const GlobalValue *GV = &getGV(Symb);
- const Module &M = *GV->getParent();
+ const GlobalValue *GV = getGV(Symb);
uintptr_t Res;
+
switch (Symb.p & 3) {
case 0: {
Module::const_iterator Iter(static_cast<const Function*>(GV));
++Iter;
- Res = skipEmpty(Iter, M);
+ Res = skipEmpty(Iter, *M);
break;
}
case 1: {
Module::const_global_iterator Iter(static_cast<const GlobalVariable*>(GV));
++Iter;
- Res = skipEmpty(Iter, M);
+ Res = skipEmpty(Iter, *M);
break;
}
case 2: {
Module::const_alias_iterator Iter(static_cast<const GlobalAlias*>(GV));
++Iter;
- Res = skipEmpty(Iter, M);
+ Res = skipEmpty(Iter, *M);
+ break;
+ }
+ case 3: {
+ unsigned Index = getAsmSymIndex(Symb);
+ assert(Index < AsmSymbols.size());
+ ++Index;
+ Res = (Index << 2) | 3;
break;
}
- case 3:
- llvm_unreachable("Invalid symbol reference");
}
Symb.p = Res;
}
-error_code IRObjectFile::printSymbolName(raw_ostream &OS,
- DataRefImpl Symb) const {
- const GlobalValue &GV = getGV(Symb);
+std::error_code IRObjectFile::printSymbolName(raw_ostream &OS,
+ DataRefImpl Symb) const {
+ const GlobalValue *GV = getGV(Symb);
+ if (!GV) {
+ unsigned Index = getAsmSymIndex(Symb);
+    assert(Index < AsmSymbols.size());
+ OS << AsmSymbols[Index].first;
+    return object_error::success;
+ }
if (Mang)
- Mang->getNameWithPrefix(OS, &GV, false);
+ Mang->getNameWithPrefix(OS, GV, false);
else
- OS << GV.getName();
+ OS << GV->getName();
return object_error::success;
}
+static bool isDeclaration(const GlobalValue &V) {
+ if (V.hasAvailableExternallyLinkage())
+ return true;
+
+ if (V.isMaterializable())
+ return false;
+
+ return V.isDeclaration();
+}
+
uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
- const GlobalValue &GV = getGV(Symb);
+ const GlobalValue *GV = getGV(Symb);
+
+ if (!GV) {
+ unsigned Index = getAsmSymIndex(Symb);
+    assert(Index < AsmSymbols.size());
+ return AsmSymbols[Index].second;
+ }
uint32_t Res = BasicSymbolRef::SF_None;
- if (GV.isDeclaration() || GV.hasAvailableExternallyLinkage())
+ if (isDeclaration(*GV))
Res |= BasicSymbolRef::SF_Undefined;
- if (GV.hasPrivateLinkage())
+ if (GV->hasPrivateLinkage())
Res |= BasicSymbolRef::SF_FormatSpecific;
- if (!GV.hasLocalLinkage())
+ if (!GV->hasLocalLinkage())
Res |= BasicSymbolRef::SF_Global;
- if (GV.hasCommonLinkage())
+ if (GV->hasCommonLinkage())
Res |= BasicSymbolRef::SF_Common;
- if (GV.hasLinkOnceLinkage() || GV.hasWeakLinkage())
+ if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage())
Res |= BasicSymbolRef::SF_Weak;
+ if (GV->getName().startswith("llvm."))
+ Res |= BasicSymbolRef::SF_FormatSpecific;
+ else if (auto *Var = dyn_cast<GlobalVariable>(GV)) {
+ if (Var->getSection() == StringRef("llvm.metadata"))
+ Res |= BasicSymbolRef::SF_FormatSpecific;
+ }
+
return Res;
}
-const GlobalValue &IRObjectFile::getSymbolGV(DataRefImpl Symb) const {
- const GlobalValue &GV = getGV(Symb);
+const GlobalValue *IRObjectFile::getSymbolGV(DataRefImpl Symb) const {
+ const GlobalValue *GV = getGV(Symb);
return GV;
}
@@ -136,16 +262,18 @@ basic_symbol_iterator IRObjectFile::symbol_begin_impl() const {
basic_symbol_iterator IRObjectFile::symbol_end_impl() const {
DataRefImpl Ret;
- Ret.p = 3;
+ uint64_t NumAsm = AsmSymbols.size();
+ NumAsm <<= 2;
+ Ret.p = 3 | NumAsm;
return basic_symbol_iterator(BasicSymbolRef(Ret, this));
}
-ErrorOr<SymbolicFile *> llvm::object::SymbolicFile::createIRObjectFile(
- MemoryBuffer *Object, LLVMContext &Context, bool BufferOwned) {
- error_code EC;
- std::unique_ptr<IRObjectFile> Ret(
- new IRObjectFile(Object, EC, Context, BufferOwned));
- if (EC)
+ErrorOr<IRObjectFile *> llvm::object::IRObjectFile::createIRObjectFile(
+ std::unique_ptr<MemoryBuffer> Object, LLVMContext &Context) {
+ ErrorOr<Module *> MOrErr = getLazyBitcodeModule(Object.get(), Context);
+ if (std::error_code EC = MOrErr.getError())
return EC;
- return Ret.release();
+
+ std::unique_ptr<Module> M(MOrErr.get());
+ return new IRObjectFile(std::move(Object), std::move(M));
}
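
createIRObjectFile() now opens the module lazily through getLazyBitcodeModule(), so symbols can be enumerated without materializing function bodies; the destructor's releaseBuffer() call above keeps the materializer from freeing the buffer the IRObjectFile still owns. A usage sketch (dumpIRSymbols is hypothetical):

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/Object/IRObjectFile.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;
    using namespace object;

    static std::error_code dumpIRSymbols(std::unique_ptr<MemoryBuffer> Buf,
                                         LLVMContext &Context) {
      ErrorOr<IRObjectFile *> IROrErr =
          IRObjectFile::createIRObjectFile(std::move(Buf), Context);
      if (std::error_code EC = IROrErr.getError())
        return EC;
      std::unique_ptr<IRObjectFile> IR(IROrErr.get());
      for (const BasicSymbolRef &Sym : IR->symbols()) {
        if (std::error_code EC = Sym.printName(outs()))
          return EC; // dispatches to printSymbolName() above
        outs() << "\n";
      }
      return object_error::success;
    }
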
diff --git a/lib/Object/LLVMBuild.txt b/lib/Object/LLVMBuild.txt
index 7813832..8acacba 100644
--- a/lib/Object/LLVMBuild.txt
+++ b/lib/Object/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Library
name = Object
parent = Libraries
-required_libraries = BitReader Core Support
+required_libraries = BitReader Core Support MC MCParser
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index c6bab03..4919114 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -28,6 +28,7 @@ using namespace llvm;
using namespace object;
namespace llvm {
+
namespace object {
struct nlist_base {
@@ -43,190 +44,195 @@ struct section_base {
};
template<typename T>
-static void SwapValue(T &Value) {
- Value = sys::SwapByteOrder(Value);
-}
-
-template<typename T>
static void SwapStruct(T &Value);
template<>
void SwapStruct(MachO::any_relocation_info &H) {
- SwapValue(H.r_word0);
- SwapValue(H.r_word1);
+ sys::swapByteOrder(H.r_word0);
+ sys::swapByteOrder(H.r_word1);
}
template<>
void SwapStruct(MachO::load_command &L) {
- SwapValue(L.cmd);
- SwapValue(L.cmdsize);
+ sys::swapByteOrder(L.cmd);
+ sys::swapByteOrder(L.cmdsize);
}
template<>
void SwapStruct(nlist_base &S) {
- SwapValue(S.n_strx);
- SwapValue(S.n_desc);
+ sys::swapByteOrder(S.n_strx);
+ sys::swapByteOrder(S.n_desc);
}
template<>
void SwapStruct(MachO::section &S) {
- SwapValue(S.addr);
- SwapValue(S.size);
- SwapValue(S.offset);
- SwapValue(S.align);
- SwapValue(S.reloff);
- SwapValue(S.nreloc);
- SwapValue(S.flags);
- SwapValue(S.reserved1);
- SwapValue(S.reserved2);
+ sys::swapByteOrder(S.addr);
+ sys::swapByteOrder(S.size);
+ sys::swapByteOrder(S.offset);
+ sys::swapByteOrder(S.align);
+ sys::swapByteOrder(S.reloff);
+ sys::swapByteOrder(S.nreloc);
+ sys::swapByteOrder(S.flags);
+ sys::swapByteOrder(S.reserved1);
+ sys::swapByteOrder(S.reserved2);
}
template<>
void SwapStruct(MachO::section_64 &S) {
- SwapValue(S.addr);
- SwapValue(S.size);
- SwapValue(S.offset);
- SwapValue(S.align);
- SwapValue(S.reloff);
- SwapValue(S.nreloc);
- SwapValue(S.flags);
- SwapValue(S.reserved1);
- SwapValue(S.reserved2);
- SwapValue(S.reserved3);
+ sys::swapByteOrder(S.addr);
+ sys::swapByteOrder(S.size);
+ sys::swapByteOrder(S.offset);
+ sys::swapByteOrder(S.align);
+ sys::swapByteOrder(S.reloff);
+ sys::swapByteOrder(S.nreloc);
+ sys::swapByteOrder(S.flags);
+ sys::swapByteOrder(S.reserved1);
+ sys::swapByteOrder(S.reserved2);
+ sys::swapByteOrder(S.reserved3);
}
template<>
void SwapStruct(MachO::nlist &S) {
- SwapValue(S.n_strx);
- SwapValue(S.n_desc);
- SwapValue(S.n_value);
+ sys::swapByteOrder(S.n_strx);
+ sys::swapByteOrder(S.n_desc);
+ sys::swapByteOrder(S.n_value);
}
template<>
void SwapStruct(MachO::nlist_64 &S) {
- SwapValue(S.n_strx);
- SwapValue(S.n_desc);
- SwapValue(S.n_value);
+ sys::swapByteOrder(S.n_strx);
+ sys::swapByteOrder(S.n_desc);
+ sys::swapByteOrder(S.n_value);
}
template<>
void SwapStruct(MachO::mach_header &H) {
- SwapValue(H.magic);
- SwapValue(H.cputype);
- SwapValue(H.cpusubtype);
- SwapValue(H.filetype);
- SwapValue(H.ncmds);
- SwapValue(H.sizeofcmds);
- SwapValue(H.flags);
+ sys::swapByteOrder(H.magic);
+ sys::swapByteOrder(H.cputype);
+ sys::swapByteOrder(H.cpusubtype);
+ sys::swapByteOrder(H.filetype);
+ sys::swapByteOrder(H.ncmds);
+ sys::swapByteOrder(H.sizeofcmds);
+ sys::swapByteOrder(H.flags);
}
template<>
void SwapStruct(MachO::mach_header_64 &H) {
- SwapValue(H.magic);
- SwapValue(H.cputype);
- SwapValue(H.cpusubtype);
- SwapValue(H.filetype);
- SwapValue(H.ncmds);
- SwapValue(H.sizeofcmds);
- SwapValue(H.flags);
- SwapValue(H.reserved);
+ sys::swapByteOrder(H.magic);
+ sys::swapByteOrder(H.cputype);
+ sys::swapByteOrder(H.cpusubtype);
+ sys::swapByteOrder(H.filetype);
+ sys::swapByteOrder(H.ncmds);
+ sys::swapByteOrder(H.sizeofcmds);
+ sys::swapByteOrder(H.flags);
+ sys::swapByteOrder(H.reserved);
}
template<>
void SwapStruct(MachO::symtab_command &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.symoff);
- SwapValue(C.nsyms);
- SwapValue(C.stroff);
- SwapValue(C.strsize);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.symoff);
+ sys::swapByteOrder(C.nsyms);
+ sys::swapByteOrder(C.stroff);
+ sys::swapByteOrder(C.strsize);
}
template<>
void SwapStruct(MachO::dysymtab_command &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.ilocalsym);
- SwapValue(C.nlocalsym);
- SwapValue(C.iextdefsym);
- SwapValue(C.nextdefsym);
- SwapValue(C.iundefsym);
- SwapValue(C.nundefsym);
- SwapValue(C.tocoff);
- SwapValue(C.ntoc);
- SwapValue(C.modtaboff);
- SwapValue(C.nmodtab);
- SwapValue(C.extrefsymoff);
- SwapValue(C.nextrefsyms);
- SwapValue(C.indirectsymoff);
- SwapValue(C.nindirectsyms);
- SwapValue(C.extreloff);
- SwapValue(C.nextrel);
- SwapValue(C.locreloff);
- SwapValue(C.nlocrel);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.ilocalsym);
+ sys::swapByteOrder(C.nlocalsym);
+ sys::swapByteOrder(C.iextdefsym);
+ sys::swapByteOrder(C.nextdefsym);
+ sys::swapByteOrder(C.iundefsym);
+ sys::swapByteOrder(C.nundefsym);
+ sys::swapByteOrder(C.tocoff);
+ sys::swapByteOrder(C.ntoc);
+ sys::swapByteOrder(C.modtaboff);
+ sys::swapByteOrder(C.nmodtab);
+ sys::swapByteOrder(C.extrefsymoff);
+ sys::swapByteOrder(C.nextrefsyms);
+ sys::swapByteOrder(C.indirectsymoff);
+ sys::swapByteOrder(C.nindirectsyms);
+ sys::swapByteOrder(C.extreloff);
+ sys::swapByteOrder(C.nextrel);
+ sys::swapByteOrder(C.locreloff);
+ sys::swapByteOrder(C.nlocrel);
}
template<>
void SwapStruct(MachO::linkedit_data_command &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.dataoff);
- SwapValue(C.datasize);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.dataoff);
+ sys::swapByteOrder(C.datasize);
}
template<>
void SwapStruct(MachO::segment_command &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.vmaddr);
- SwapValue(C.vmsize);
- SwapValue(C.fileoff);
- SwapValue(C.filesize);
- SwapValue(C.maxprot);
- SwapValue(C.initprot);
- SwapValue(C.nsects);
- SwapValue(C.flags);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.vmaddr);
+ sys::swapByteOrder(C.vmsize);
+ sys::swapByteOrder(C.fileoff);
+ sys::swapByteOrder(C.filesize);
+ sys::swapByteOrder(C.maxprot);
+ sys::swapByteOrder(C.initprot);
+ sys::swapByteOrder(C.nsects);
+ sys::swapByteOrder(C.flags);
}
template<>
void SwapStruct(MachO::segment_command_64 &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.vmaddr);
- SwapValue(C.vmsize);
- SwapValue(C.fileoff);
- SwapValue(C.filesize);
- SwapValue(C.maxprot);
- SwapValue(C.initprot);
- SwapValue(C.nsects);
- SwapValue(C.flags);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.vmaddr);
+ sys::swapByteOrder(C.vmsize);
+ sys::swapByteOrder(C.fileoff);
+ sys::swapByteOrder(C.filesize);
+ sys::swapByteOrder(C.maxprot);
+ sys::swapByteOrder(C.initprot);
+ sys::swapByteOrder(C.nsects);
+ sys::swapByteOrder(C.flags);
}
template<>
void SwapStruct(uint32_t &C) {
- SwapValue(C);
+ sys::swapByteOrder(C);
}
template<>
void SwapStruct(MachO::linker_options_command &C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.count);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.count);
}
template<>
void SwapStruct(MachO::version_min_command&C) {
- SwapValue(C.cmd);
- SwapValue(C.cmdsize);
- SwapValue(C.version);
- SwapValue(C.reserved);
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.version);
+ sys::swapByteOrder(C.reserved);
+}
+
+template<>
+void SwapStruct(MachO::dylib_command&C) {
+ sys::swapByteOrder(C.cmd);
+ sys::swapByteOrder(C.cmdsize);
+ sys::swapByteOrder(C.dylib.name);
+ sys::swapByteOrder(C.dylib.timestamp);
+ sys::swapByteOrder(C.dylib.current_version);
+ sys::swapByteOrder(C.dylib.compatibility_version);
}
template<>
void SwapStruct(MachO::data_in_code_entry &C) {
- SwapValue(C.offset);
- SwapValue(C.length);
- SwapValue(C.kind);
+ sys::swapByteOrder(C.offset);
+ sys::swapByteOrder(C.length);
+ sys::swapByteOrder(C.kind);
}
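
Each SwapStruct specialization flips every multi-byte field of one on-disk struct; they are invoked from a getStruct<T>-style reader that copies the raw bytes out of the file and swaps only when the file's endianness disagrees with the host's. A simplified sketch of that read path (readStruct is hypothetical and assumes the SwapStruct overloads above are visible; the real getStruct<T> also takes the object file itself):

    #include "llvm/Support/Host.h"
    #include <cstring>

    // Copy a struct out of the file, byte-swapping on mismatched hosts.
    template <typename T>
    static T readStruct(const char *P, bool FileIsLittleEndian) {
      T Cmd;
      std::memcpy(&Cmd, P, sizeof(T));
      if (FileIsLittleEndian != llvm::sys::IsLittleEndianHost)
        SwapStruct(Cmd); // dispatches to the specializations above
      return Cmd;
    }
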
template<typename T>
@@ -306,7 +312,7 @@ static void printRelocationTargetName(const MachOObjectFile *O,
uint32_t Val = O->getPlainRelocationSymbolNum(RE);
for (const SymbolRef &Symbol : O->symbols()) {
- error_code ec;
+ std::error_code ec;
uint64_t Addr;
StringRef Name;
@@ -323,7 +329,7 @@ static void printRelocationTargetName(const MachOObjectFile *O,
// If we couldn't find a symbol that this relocation refers to, try
// to find a section beginning instead.
for (const SectionRef &Section : O->sections()) {
- error_code ec;
+ std::error_code ec;
uint64_t Addr;
StringRef Name;
@@ -416,10 +422,10 @@ static uint32_t getSectionFlags(const MachOObjectFile *O,
return Sect.flags;
}
-MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian,
- bool Is64bits, error_code &EC,
- bool BufferOwned)
- : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object, BufferOwned),
+MachOObjectFile::MachOObjectFile(std::unique_ptr<MemoryBuffer> Object,
+ bool IsLittleEndian, bool Is64bits,
+ std::error_code &EC)
+ : ObjectFile(getMachOType(IsLittleEndian, Is64bits), std::move(Object)),
SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr),
DataInCodeLoadCmd(nullptr) {
uint32_t LoadCommandCount = this->getHeader().ncmds;
@@ -443,6 +449,12 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian,
const char *Sec = getSectionPtr(this, Load, J);
Sections.push_back(Sec);
}
+ } else if (Load.C.cmd == MachO::LC_LOAD_DYLIB ||
+ Load.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
+ Load.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
+ Load.C.cmd == MachO::LC_REEXPORT_DYLIB ||
+ Load.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) {
+ Libraries.push_back(Load.Ptr);
}
if (I == LoadCommandCount - 1)
@@ -459,8 +471,8 @@ void MachOObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
Symb.p += SymbolTableEntrySize;
}
-error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
- StringRef &Res) const {
+std::error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
+ StringRef &Res) const {
StringRef StringTable = getStringTableData();
nlist_base Entry = getSymbolTableEntryBase(this, Symb);
const char *Start = &StringTable.data()[Entry.n_strx];
@@ -468,8 +480,32 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
return object_error::success;
}
-error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
- uint64_t &Res) const {
+// getIndirectName() returns the name of the aliased symbol whose string table
+// index is in the n_value field.
+std::error_code MachOObjectFile::getIndirectName(DataRefImpl Symb,
+ StringRef &Res) const {
+ StringRef StringTable = getStringTableData();
+ uint64_t NValue;
+ if (is64Bit()) {
+ MachO::nlist_64 Entry = getSymbol64TableEntry(Symb);
+ NValue = Entry.n_value;
+ if ((Entry.n_type & MachO::N_TYPE) != MachO::N_INDR)
+ return object_error::parse_failed;
+ } else {
+ MachO::nlist Entry = getSymbolTableEntry(Symb);
+ NValue = Entry.n_value;
+ if ((Entry.n_type & MachO::N_TYPE) != MachO::N_INDR)
+ return object_error::parse_failed;
+ }
+ if (NValue >= StringTable.size())
+ return object_error::parse_failed;
+ const char *Start = &StringTable.data()[NValue];
+ Res = StringRef(Start);
+ return object_error::success;
+}
+
+std::error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Res) const {
if (is64Bit()) {
MachO::nlist_64 Entry = getSymbol64TableEntry(Symb);
if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF &&
@@ -488,8 +524,8 @@ error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
return object_error::success;
}
-error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
- uint32_t &Result) const {
+std::error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
+ uint32_t &Result) const {
uint32_t flags = getSymbolFlags(DRI);
if (flags & SymbolRef::SF_Common) {
nlist_base Entry = getSymbolTableEntryBase(this, DRI);
@@ -500,8 +536,8 @@ error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
return object_error::success;
}
-error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
- uint64_t &Result) const {
+std::error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
+ uint64_t &Result) const {
uint64_t BeginOffset;
uint64_t EndOffset = 0;
uint8_t SectionIndex;
@@ -549,8 +585,8 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
return object_error::success;
}
-error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
- SymbolRef::Type &Res) const {
+std::error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
+ SymbolRef::Type &Res) const {
nlist_base Entry = getSymbolTableEntryBase(this, Symb);
uint8_t n_type = Entry.n_type;
@@ -584,6 +620,9 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF)
Result |= SymbolRef::SF_Undefined;
+ if ((MachOType & MachO::N_TYPE) == MachO::N_INDR)
+ Result |= SymbolRef::SF_Indirect;
+
if (MachOType & MachO::N_STAB)
Result |= SymbolRef::SF_FormatSpecific;
@@ -606,9 +645,8 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
return Result;
}
-error_code
-MachOObjectFile::getSymbolSection(DataRefImpl Symb,
- section_iterator &Res) const {
+std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb,
+ section_iterator &Res) const {
nlist_base Entry = getSymbolTableEntryBase(this, Symb);
uint8_t index = Entry.n_sect;
@@ -627,15 +665,15 @@ void MachOObjectFile::moveSectionNext(DataRefImpl &Sec) const {
Sec.d.a++;
}
-error_code
-MachOObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const {
+std::error_code MachOObjectFile::getSectionName(DataRefImpl Sec,
+ StringRef &Result) const {
ArrayRef<char> Raw = getSectionRawName(Sec);
Result = parseSegmentOrSectionName(Raw.data());
return object_error::success;
}
-error_code
-MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
+std::error_code MachOObjectFile::getSectionAddress(DataRefImpl Sec,
+ uint64_t &Res) const {
if (is64Bit()) {
MachO::section_64 Sect = getSection64(Sec);
Res = Sect.addr;
@@ -646,8 +684,8 @@ MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
return object_error::success;
}
-error_code
-MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const {
+std::error_code MachOObjectFile::getSectionSize(DataRefImpl Sec,
+ uint64_t &Res) const {
if (is64Bit()) {
MachO::section_64 Sect = getSection64(Sec);
Res = Sect.size;
@@ -659,8 +697,8 @@ MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const {
return object_error::success;
}
-error_code
-MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const {
+std::error_code MachOObjectFile::getSectionContents(DataRefImpl Sec,
+ StringRef &Res) const {
uint32_t Offset;
uint64_t Size;
@@ -678,8 +716,8 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const {
return object_error::success;
}
-error_code
-MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
+std::error_code MachOObjectFile::getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const {
uint32_t Align;
if (is64Bit()) {
MachO::section_64 Sect = getSection64(Sec);
@@ -693,14 +731,15 @@ MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
return object_error::success;
}
-error_code
-MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const {
+std::error_code MachOObjectFile::isSectionText(DataRefImpl Sec,
+ bool &Res) const {
uint32_t Flags = getSectionFlags(this, Sec);
Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
return object_error::success;
}
-error_code MachOObjectFile::isSectionData(DataRefImpl Sec, bool &Result) const {
+std::error_code MachOObjectFile::isSectionData(DataRefImpl Sec,
+ bool &Result) const {
uint32_t Flags = getSectionFlags(this, Sec);
unsigned SectionType = Flags & MachO::SECTION_TYPE;
Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) &&
@@ -709,7 +748,8 @@ error_code MachOObjectFile::isSectionData(DataRefImpl Sec, bool &Result) const {
return object_error::success;
}
-error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, bool &Result) const {
+std::error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec,
+ bool &Result) const {
uint32_t Flags = getSectionFlags(this, Sec);
unsigned SectionType = Flags & MachO::SECTION_TYPE;
Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) &&
@@ -718,7 +758,7 @@ error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, bool &Result) const {
return object_error::success;
}
-error_code
+std::error_code
MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
bool &Result) const {
// FIXME: Unimplemented.
@@ -726,15 +766,15 @@ MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
return object_error::success;
}
-error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
- bool &Result) const {
+std::error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
+ bool &Result) const {
// FIXME: Unimplemented.
Result = false;
return object_error::success;
}
-error_code
-MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const {
+std::error_code MachOObjectFile::isSectionZeroInit(DataRefImpl Sec,
+ bool &Res) const {
uint32_t Flags = getSectionFlags(this, Sec);
unsigned SectionType = Flags & MachO::SECTION_TYPE;
Res = SectionType == MachO::S_ZEROFILL ||
@@ -742,8 +782,8 @@ MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const {
return object_error::success;
}
-error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
- bool &Result) const {
+std::error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Result) const {
// Consider using the code from isSectionText to look for __const sections.
// Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS
// to use section attributes to distinguish code from data.
@@ -753,9 +793,9 @@ error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
return object_error::success;
}
-error_code
-MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
- bool &Result) const {
+std::error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
+ DataRefImpl Symb,
+ bool &Result) const {
SymbolRef::Type ST;
this->getSymbolType(Symb, ST);
if (ST == SymbolRef::ST_Unknown) {
@@ -803,8 +843,8 @@ void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
++Rel.d.b;
}
-error_code
-MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const {
+std::error_code MachOObjectFile::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const {
uint64_t Offset;
getRelocationOffset(Rel, Offset);
@@ -816,8 +856,8 @@ MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const {
return object_error::success;
}
-error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
- uint64_t &Res) const {
+std::error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
+ uint64_t &Res) const {
assert(getHeader().filetype == MachO::MH_OBJECT &&
"Only implemented for MH_OBJECT");
MachO::any_relocation_info RE = getRelocation(Rel);
@@ -828,6 +868,9 @@ error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
symbol_iterator
MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
MachO::any_relocation_info RE = getRelocation(Rel);
+ if (isRelocationScattered(RE))
+ return symbol_end();
+
uint32_t SymbolIdx = getPlainRelocationSymbolNum(RE);
bool isExtern = getPlainRelocationExternal(RE);
if (!isExtern)
@@ -843,14 +886,14 @@ MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
return symbol_iterator(SymbolRef(Sym, this));
}
-error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
- uint64_t &Res) const {
+std::error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
+ uint64_t &Res) const {
MachO::any_relocation_info RE = getRelocation(Rel);
Res = getAnyRelocationType(RE);
return object_error::success;
}
-error_code
+std::error_code
MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const {
StringRef res;
@@ -963,7 +1006,7 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
return object_error::success;
}
-error_code
+std::error_code
MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const {
MachO::any_relocation_info RE = getRelocation(Rel);
@@ -1139,8 +1182,8 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
return object_error::success;
}
-error_code
-MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const {
+std::error_code MachOObjectFile::getRelocationHidden(DataRefImpl Rel,
+ bool &Result) const {
unsigned Arch = getArch();
uint64_t Type;
getRelocationType(Rel, Type);
@@ -1167,16 +1210,199 @@ MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const {
return object_error::success;
}
-error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData,
- LibraryRef &Res) const {
+std::error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData,
+ LibraryRef &Res) const {
report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
}
-error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData,
- StringRef &Res) const {
+std::error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData,
+ StringRef &Res) const {
report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
}
+//
+// guessLibraryShortName() is passed the name of a dynamic library and returns
+// a guess at its short name. The name is returned as a substring of the
+// StringRef Name passed in. The name of the dynamic library is recognized as
+// a framework if it has one of the two following forms:
+//      Foo.framework/Versions/A/Foo
+//      Foo.framework/Foo
+// where A and Foo can be any string, and Foo may carry a trailing suffix
+// starting with an underbar. If the Name is recognized as a framework then
+// isFramework is set to true, else it is set to false. If the Name has a
+// suffix then Suffix is set to the substring of Name that contains the suffix,
+// else it is set to a NULL StringRef.
+//
+// The Name of the dynamic library is recognized as a library name if it has
+// one of the two following forms:
+// libFoo.A.dylib
+// libFoo.dylib
+// The library may have a suffix trailing the name Foo of the form:
+// libFoo_profile.A.dylib
+// libFoo_profile.dylib
+//
+// The Name of the dynamic library is also recognized as a library name if it
+// has the following form:
+// Foo.qtx
+//
+// If the Name of the dynamic library is none of the forms above then a NULL
+// StringRef is returned.
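+//
+// For example (illustrative): "Foo.framework/Versions/A/Foo_debug" yields
+// "Foo" with isFramework true and Suffix "_debug", while "libFoo.A.dylib"
+// yields "libFoo" with no suffix.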
+//
+StringRef MachOObjectFile::guessLibraryShortName(StringRef Name,
+ bool &isFramework,
+ StringRef &Suffix) {
+ StringRef Foo, F, DotFramework, V, Dylib, Lib, Dot, Qtx;
+ size_t a, b, c, d, Idx;
+
+ isFramework = false;
+ Suffix = StringRef();
+
+ // Pull off the last component and make Foo point to it
+ a = Name.rfind('/');
+ if (a == Name.npos || a == 0)
+ goto guess_library;
+ Foo = Name.slice(a+1, Name.npos);
+
+ // Look for a suffix starting with a '_'
+ Idx = Foo.rfind('_');
+ if (Idx != Foo.npos && Foo.size() >= 2) {
+ Suffix = Foo.slice(Idx, Foo.npos);
+ Foo = Foo.slice(0, Idx);
+ }
+
+ // First look for the form Foo.framework/Foo
+ b = Name.rfind('/', a);
+ if (b == Name.npos)
+ Idx = 0;
+ else
+ Idx = b+1;
+ F = Name.slice(Idx, Idx + Foo.size());
+ DotFramework = Name.slice(Idx + Foo.size(),
+ Idx + Foo.size() + sizeof(".framework/")-1);
+ if (F == Foo && DotFramework == ".framework/") {
+ isFramework = true;
+ return Foo;
+ }
+
+ // Next look for the form Foo.framework/Versions/A/Foo
+ if (b == Name.npos)
+ goto guess_library;
+ c = Name.rfind('/', b);
+ if (c == Name.npos || c == 0)
+ goto guess_library;
+ V = Name.slice(c+1, Name.npos);
+ if (!V.startswith("Versions/"))
+ goto guess_library;
+ d = Name.rfind('/', c);
+ if (d == Name.npos)
+ Idx = 0;
+ else
+ Idx = d+1;
+ F = Name.slice(Idx, Idx + Foo.size());
+ DotFramework = Name.slice(Idx + Foo.size(),
+ Idx + Foo.size() + sizeof(".framework/")-1);
+ if (F == Foo && DotFramework == ".framework/") {
+ isFramework = true;
+ return Foo;
+ }
+
+guess_library:
+ // Pull off the suffix after the "." and make Dylib point to it
+ a = Name.rfind('.');
+ if (a == Name.npos || a == 0)
+ return StringRef();
+ Dylib = Name.slice(a, Name.npos);
+ if (Dylib != ".dylib")
+ goto guess_qtx;
+
+ // First pull off the version letter for the form Foo.A.dylib if any.
+ if (a >= 3) {
+ Dot = Name.slice(a-2, a-1);
+ if (Dot == ".")
+ a = a - 2;
+ }
+
+ b = Name.rfind('/', a);
+ if (b == Name.npos)
+ b = 0;
+ else
+ b = b+1;
+ // ignore any suffix after an underbar like Foo_profile.A.dylib
+ Idx = Name.find('_', b);
+ if (Idx != Name.npos && Idx != b) {
+ Lib = Name.slice(b, Idx);
+ Suffix = Name.slice(Idx, a);
+ }
+ else
+ Lib = Name.slice(b, a);
+ // There are incorrect library names of the form:
+ // libATS.A_profile.dylib so check for these.
+ if (Lib.size() >= 3) {
+ Dot = Lib.slice(Lib.size()-2, Lib.size()-1);
+ if (Dot == ".")
+ Lib = Lib.slice(0, Lib.size()-2);
+ }
+ return Lib;
+
+guess_qtx:
+ Qtx = Name.slice(a, Name.npos);
+ if (Qtx != ".qtx")
+ return StringRef();
+ b = Name.rfind('/', a);
+ if (b == Name.npos)
+ Lib = Name.slice(0, a);
+ else
+ Lib = Name.slice(b+1, a);
+ // There are library names of the form: QT.A.qtx so check for these.
+ if (Lib.size() >= 3) {
+ Dot = Lib.slice(Lib.size()-2, Lib.size()-1);
+ if (Dot == ".")
+ Lib = Lib.slice(0, Lib.size()-2);
+ }
+ return Lib;
+}
+
+// getLibraryShortNameByIndex() is used to get the short name of the library
+// for an undefined symbol in a linked Mach-O binary that was linked with the
+// normal two-level namespace default (that is MH_TWOLEVEL in the header).
+// It is passed the index (0 - based) of the library as translated from
+// GET_LIBRARY_ORDINAL (1 - based).
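+// For example (illustrative), an undefined symbol whose n_desc encodes a
+// GET_LIBRARY_ORDINAL of 1 refers to the first LC_LOAD_DYLIB load command,
+// which is Index 0 here.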
+std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
+ StringRef &Res) {
+ if (Index >= Libraries.size())
+ return object_error::parse_failed;
+
+ MachO::dylib_command D =
+ getStruct<MachO::dylib_command>(this, Libraries[Index]);
+ if (D.dylib.name >= D.cmdsize)
+ return object_error::parse_failed;
+
+ // If the cache of LibrariesShortNames is not built up do that first for
+ // all the Libraries.
+ if (LibrariesShortNames.size() == 0) {
+ for (unsigned i = 0; i < Libraries.size(); i++) {
+ MachO::dylib_command D =
+ getStruct<MachO::dylib_command>(this, Libraries[i]);
+ if (D.dylib.name >= D.cmdsize) {
+ LibrariesShortNames.push_back(StringRef());
+ continue;
+ }
+ const char *P = (const char *)(Libraries[i]) + D.dylib.name;
+ StringRef Name = StringRef(P);
+ StringRef Suffix;
+ bool isFramework;
+ StringRef shortName = guessLibraryShortName(Name, isFramework, Suffix);
+ if (shortName == StringRef())
+ LibrariesShortNames.push_back(Name);
+ else
+ LibrariesShortNames.push_back(shortName);
+ }
+ }
+
+ Res = LibrariesShortNames[Index];
+ return object_error::success;
+}
+
basic_symbol_iterator MachOObjectFile::symbol_begin_impl() const {
return getSymbolByIndex(0);
}
@@ -1288,6 +1514,108 @@ Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
}
}
+Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType) {
+ switch (CPUType) {
+ case MachO::CPU_TYPE_I386:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_I386_ALL:
+ return Triple("i386-apple-darwin");
+ default:
+ return Triple();
+ }
+ case MachO::CPU_TYPE_X86_64:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_X86_64_ALL:
+ return Triple("x86_64-apple-darwin");
+ case MachO::CPU_SUBTYPE_X86_64_H:
+ return Triple("x86_64h-apple-darwin");
+ default:
+ return Triple();
+ }
+ case MachO::CPU_TYPE_ARM:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_ARM_V4T:
+ return Triple("armv4t-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V5TEJ:
+ return Triple("armv5e-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V6:
+ return Triple("armv6-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V6M:
+ return Triple("armv6m-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V7EM:
+ return Triple("armv7em-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V7K:
+ return Triple("armv7k-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V7M:
+ return Triple("armv7m-apple-darwin");
+ case MachO::CPU_SUBTYPE_ARM_V7S:
+ return Triple("armv7s-apple-darwin");
+ default:
+ return Triple();
+ }
+ case MachO::CPU_TYPE_ARM64:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_ARM64_ALL:
+ return Triple("arm64-apple-darwin");
+ default:
+ return Triple();
+ }
+ case MachO::CPU_TYPE_POWERPC:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_POWERPC_ALL:
+ return Triple("ppc-apple-darwin");
+ default:
+ return Triple();
+ }
+ case MachO::CPU_TYPE_POWERPC64:
+ switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+ case MachO::CPU_SUBTYPE_POWERPC_ALL:
+ return Triple("ppc64-apple-darwin");
+ default:
+ return Triple();
+ }
+ default:
+ return Triple();
+ }
+}
+
+Triple MachOObjectFile::getHostArch() {
+ return Triple(sys::getDefaultTargetTriple());
+}
+
+Triple MachOObjectFile::getArch(StringRef ArchFlag) {
+ if (ArchFlag == "i386")
+ return Triple("i386-apple-darwin");
+ else if (ArchFlag == "x86_64")
+ return Triple("x86_64-apple-darwin");
+ else if (ArchFlag == "x86_64h")
+ return Triple("x86_64h-apple-darwin");
+ else if (ArchFlag == "armv4t" || ArchFlag == "arm")
+ return Triple("armv4t-apple-darwin");
+ else if (ArchFlag == "armv5e")
+ return Triple("armv5e-apple-darwin");
+ else if (ArchFlag == "armv6")
+ return Triple("armv6-apple-darwin");
+ else if (ArchFlag == "armv6m")
+ return Triple("armv6m-apple-darwin");
+ else if (ArchFlag == "armv7em")
+ return Triple("armv7em-apple-darwin");
+ else if (ArchFlag == "armv7k")
+ return Triple("armv7k-apple-darwin");
+ else if (ArchFlag == "armv7k")
+ return Triple("armv7m-apple-darwin");
+ else if (ArchFlag == "armv7s")
+ return Triple("armv7s-apple-darwin");
+ else if (ArchFlag == "arm64")
+ return Triple("arm64-apple-darwin");
+ else if (ArchFlag == "ppc")
+ return Triple("ppc-apple-darwin");
+ else if (ArchFlag == "ppc64")
+ return Triple("ppc64-apple-darwin");
+ else
+ return Triple();
+}
+
unsigned MachOObjectFile::getArch() const {
return getArch(getCPUType(this));
}
@@ -1498,6 +1826,12 @@ MachOObjectFile::getVersionMinLoadCommand(const LoadCommandInfo &L) const {
return getStruct<MachO::version_min_command>(this, L.Ptr);
}
+MachO::dylib_command
+MachOObjectFile::getDylibIDLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::dylib_command>(this, L.Ptr);
+}
+
MachO::any_relocation_info
MachOObjectFile::getRelocation(DataRefImpl Rel) const {
DataRefImpl Sec;
@@ -1574,7 +1908,7 @@ StringRef MachOObjectFile::getStringTableData() const {
bool MachOObjectFile::is64Bit() const {
return getType() == getMachOType(false, true) ||
- getType() == getMachOType(true, true);
+ getType() == getMachOType(true, true);
}
void MachOObjectFile::ReadULEB128s(uint64_t Index,
@@ -1589,23 +1923,25 @@ void MachOObjectFile::ReadULEB128s(uint64_t Index,
}
}
-ErrorOr<ObjectFile *> ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer,
- bool BufferOwned) {
+const char *MachOObjectFile::getSectionPointer(DataRefImpl Rel) const {
+ return Sections[Rel.d.a];
+}
+
+ErrorOr<ObjectFile *>
+ObjectFile::createMachOObjectFile(std::unique_ptr<MemoryBuffer> &Buffer) {
StringRef Magic = Buffer->getBuffer().slice(0, 4);
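+ // The magic bytes select the variant: "\xFE\xED\xFA\xCE" is big-endian
+ // 32-bit, "\xCE\xFA\xED\xFE" little-endian 32-bit, and the "\xCF" forms
+ // their 64-bit counterparts; the two bool arguments passed below encode
+ // (little-endian?, 64-bit?) in that order.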
- error_code EC;
+ std::error_code EC;
std::unique_ptr<MachOObjectFile> Ret;
if (Magic == "\xFE\xED\xFA\xCE")
- Ret.reset(new MachOObjectFile(Buffer, false, false, EC, BufferOwned));
+ Ret.reset(new MachOObjectFile(std::move(Buffer), false, false, EC));
else if (Magic == "\xCE\xFA\xED\xFE")
- Ret.reset(new MachOObjectFile(Buffer, true, false, EC, BufferOwned));
+ Ret.reset(new MachOObjectFile(std::move(Buffer), true, false, EC));
else if (Magic == "\xFE\xED\xFA\xCF")
- Ret.reset(new MachOObjectFile(Buffer, false, true, EC, BufferOwned));
+ Ret.reset(new MachOObjectFile(std::move(Buffer), false, true, EC));
else if (Magic == "\xCF\xFA\xED\xFE")
- Ret.reset(new MachOObjectFile(Buffer, true, true, EC, BufferOwned));
- else {
- delete Buffer;
+ Ret.reset(new MachOObjectFile(std::move(Buffer), true, true, EC));
+ else
return object_error::parse_failed;
- }
if (EC)
return EC;
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index 5085efd..4ba5d96 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp
@@ -23,26 +23,21 @@ using namespace llvm;
using namespace object;
template<typename T>
-static void SwapValue(T &Value) {
- Value = sys::SwapByteOrder(Value);
-}
-
-template<typename T>
static void SwapStruct(T &Value);
template<>
void SwapStruct(MachO::fat_header &H) {
- SwapValue(H.magic);
- SwapValue(H.nfat_arch);
+ sys::swapByteOrder(H.magic);
+ sys::swapByteOrder(H.nfat_arch);
}
template<>
void SwapStruct(MachO::fat_arch &H) {
- SwapValue(H.cputype);
- SwapValue(H.cpusubtype);
- SwapValue(H.offset);
- SwapValue(H.size);
- SwapValue(H.align);
+ sys::swapByteOrder(H.cputype);
+ sys::swapByteOrder(H.cpusubtype);
+ sys::swapByteOrder(H.offset);
+ sys::swapByteOrder(H.size);
+ sys::swapByteOrder(H.align);
}
template<typename T>
@@ -58,7 +53,7 @@ static T getUniversalBinaryStruct(const char *Ptr) {
MachOUniversalBinary::ObjectForArch::ObjectForArch(
const MachOUniversalBinary *Parent, uint32_t Index)
: Parent(Parent), Index(Index) {
- if (!Parent || Index > Parent->getNumberOfObjects()) {
+ if (!Parent || Index >= Parent->getNumberOfObjects()) {
clear();
} else {
// Parse object header.
@@ -72,37 +67,29 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch(
}
}
-error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile(
- std::unique_ptr<ObjectFile> &Result) const {
+ErrorOr<std::unique_ptr<ObjectFile>>
+MachOUniversalBinary::ObjectForArch::getAsObjectFile() const {
if (Parent) {
StringRef ParentData = Parent->getData();
StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
- std::string ObjectName =
- Parent->getFileName().str() + ":" +
- Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
- MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
- ObjectData, ObjectName, false);
- ErrorOr<ObjectFile *> Obj = ObjectFile::createMachOObjectFile(ObjBuffer);
- if (error_code EC = Obj.getError())
- return EC;
- Result.reset(Obj.get());
- return object_error::success;
+ std::string ObjectName = Parent->getFileName().str();
+ std::unique_ptr<MemoryBuffer> ObjBuffer(
+ MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false));
+ return ObjectFile::createMachOObjectFile(ObjBuffer);
}
return object_error::parse_failed;
}
-error_code MachOUniversalBinary::ObjectForArch::getAsArchive(
+std::error_code MachOUniversalBinary::ObjectForArch::getAsArchive(
std::unique_ptr<Archive> &Result) const {
if (Parent) {
StringRef ParentData = Parent->getData();
StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
- std::string ObjectName =
- Parent->getFileName().str() + ":" +
- Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
- MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
- ObjectData, ObjectName, false);
- ErrorOr<Archive *> Obj = Archive::create(ObjBuffer);
- if (error_code EC = Obj.getError())
+ std::string ObjectName = Parent->getFileName().str();
+ std::unique_ptr<MemoryBuffer> ObjBuffer(
+ MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false));
+ ErrorOr<Archive *> Obj = Archive::create(std::move(ObjBuffer));
+ if (std::error_code EC = Obj.getError())
return EC;
Result.reset(Obj.get());
return object_error::success;
@@ -113,20 +100,20 @@ error_code MachOUniversalBinary::ObjectForArch::getAsArchive(
void MachOUniversalBinary::anchor() { }
ErrorOr<MachOUniversalBinary *>
-MachOUniversalBinary::create(MemoryBuffer *Source) {
- error_code EC;
+MachOUniversalBinary::create(std::unique_ptr<MemoryBuffer> Source) {
+ std::error_code EC;
std::unique_ptr<MachOUniversalBinary> Ret(
- new MachOUniversalBinary(Source, EC));
+ new MachOUniversalBinary(std::move(Source), EC));
if (EC)
return EC;
return Ret.release();
}
-MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source,
- error_code &ec)
- : Binary(Binary::ID_MachOUniversalBinary, Source),
- NumberOfObjects(0) {
- if (Source->getBufferSize() < sizeof(MachO::fat_header)) {
+MachOUniversalBinary::MachOUniversalBinary(std::unique_ptr<MemoryBuffer> Source,
+ std::error_code &ec)
+ : Binary(Binary::ID_MachOUniversalBinary, std::move(Source)),
+ NumberOfObjects(0) {
+ if (Data->getBufferSize() < sizeof(MachO::fat_header)) {
ec = object_error::invalid_file_type;
return;
}
@@ -155,14 +142,14 @@ static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) {
}
}
-error_code MachOUniversalBinary::getObjectForArch(
- Triple::ArchType Arch, std::unique_ptr<ObjectFile> &Result) const {
+ErrorOr<std::unique_ptr<ObjectFile>>
+MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch) const {
MachO::CPUType CTM;
if (!getCTMForArch(Arch, CTM))
return object_error::arch_not_found;
for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) {
if (I->getCPUType() == static_cast<uint32_t>(CTM))
- return I->getAsObjectFile(Result);
+ return I->getAsObjectFile();
}
return object_error::arch_not_found;
}
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index b0068a8..567d87f 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -59,7 +59,9 @@ wrap(const relocation_iterator *SI) {
// ObjectFile creation
LLVMObjectFileRef LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
- ErrorOr<ObjectFile*> ObjOrErr(ObjectFile::createObjectFile(unwrap(MemBuf)));
+ std::unique_ptr<MemoryBuffer> Buf(unwrap(MemBuf));
+ ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(Buf));
+ Buf.release();
ObjectFile *Obj = ObjOrErr ? ObjOrErr.get() : nullptr;
return wrap(Obj);
}
@@ -89,7 +91,7 @@ void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) {
void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect,
LLVMSymbolIteratorRef Sym) {
- if (error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect)))
+ if (std::error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect)))
report_fatal_error(ec.message());
}
@@ -115,28 +117,28 @@ void LLVMMoveToNextSymbol(LLVMSymbolIteratorRef SI) {
// SectionRef accessors
const char *LLVMGetSectionName(LLVMSectionIteratorRef SI) {
StringRef ret;
- if (error_code ec = (*unwrap(SI))->getName(ret))
+ if (std::error_code ec = (*unwrap(SI))->getName(ret))
report_fatal_error(ec.message());
return ret.data();
}
uint64_t LLVMGetSectionSize(LLVMSectionIteratorRef SI) {
uint64_t ret;
- if (error_code ec = (*unwrap(SI))->getSize(ret))
+ if (std::error_code ec = (*unwrap(SI))->getSize(ret))
report_fatal_error(ec.message());
return ret;
}
const char *LLVMGetSectionContents(LLVMSectionIteratorRef SI) {
StringRef ret;
- if (error_code ec = (*unwrap(SI))->getContents(ret))
+ if (std::error_code ec = (*unwrap(SI))->getContents(ret))
report_fatal_error(ec.message());
return ret.data();
}
uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) {
uint64_t ret;
- if (error_code ec = (*unwrap(SI))->getAddress(ret))
+ if (std::error_code ec = (*unwrap(SI))->getAddress(ret))
report_fatal_error(ec.message());
return ret;
}
@@ -144,7 +146,7 @@ uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) {
LLVMBool LLVMGetSectionContainsSymbol(LLVMSectionIteratorRef SI,
LLVMSymbolIteratorRef Sym) {
bool ret;
- if (error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret))
+ if (std::error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret))
report_fatal_error(ec.message());
return ret;
}
@@ -172,21 +174,21 @@ void LLVMMoveToNextRelocation(LLVMRelocationIteratorRef SI) {
// SymbolRef accessors
const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
StringRef ret;
- if (error_code ec = (*unwrap(SI))->getName(ret))
+ if (std::error_code ec = (*unwrap(SI))->getName(ret))
report_fatal_error(ec.message());
return ret.data();
}
uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
uint64_t ret;
- if (error_code ec = (*unwrap(SI))->getAddress(ret))
+ if (std::error_code ec = (*unwrap(SI))->getAddress(ret))
report_fatal_error(ec.message());
return ret;
}
uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
uint64_t ret;
- if (error_code ec = (*unwrap(SI))->getSize(ret))
+ if (std::error_code ec = (*unwrap(SI))->getSize(ret))
report_fatal_error(ec.message());
return ret;
}
@@ -194,14 +196,14 @@ uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
// RelocationRef accessors
uint64_t LLVMGetRelocationAddress(LLVMRelocationIteratorRef RI) {
uint64_t ret;
- if (error_code ec = (*unwrap(RI))->getAddress(ret))
+ if (std::error_code ec = (*unwrap(RI))->getAddress(ret))
report_fatal_error(ec.message());
return ret;
}
uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI) {
uint64_t ret;
- if (error_code ec = (*unwrap(RI))->getOffset(ret))
+ if (std::error_code ec = (*unwrap(RI))->getOffset(ret))
report_fatal_error(ec.message());
return ret;
}
@@ -213,7 +215,7 @@ LLVMSymbolIteratorRef LLVMGetRelocationSymbol(LLVMRelocationIteratorRef RI) {
uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) {
uint64_t ret;
- if (error_code ec = (*unwrap(RI))->getType(ret))
+ if (std::error_code ec = (*unwrap(RI))->getType(ret))
report_fatal_error(ec.message());
return ret;
}
@@ -221,7 +223,7 @@ uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) {
// NOTE: Caller takes ownership of returned string.
const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) {
SmallVector<char, 0> ret;
- if (error_code ec = (*unwrap(RI))->getTypeName(ret))
+ if (std::error_code ec = (*unwrap(RI))->getTypeName(ret))
report_fatal_error(ec.message());
char *str = static_cast<char*>(malloc(ret.size()));
@@ -232,7 +234,7 @@ const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) {
// NOTE: Caller takes ownership of returned string.
const char *LLVMGetRelocationValueString(LLVMRelocationIteratorRef RI) {
SmallVector<char, 0> ret;
- if (error_code ec = (*unwrap(RI))->getValueString(ret))
+ if (std::error_code ec = (*unwrap(RI))->getValueString(ret))
report_fatal_error(ec.message());
char *str = static_cast<char*>(malloc(ret.size()));
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index d30f0cc..f5488c6 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -16,28 +16,27 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
using namespace object;
void ObjectFile::anchor() { }
-ObjectFile::ObjectFile(unsigned int Type, MemoryBuffer *Source,
- bool BufferOwned)
- : SymbolicFile(Type, Source, BufferOwned) {}
+ObjectFile::ObjectFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
+ : SymbolicFile(Type, std::move(Source)) {}
-error_code ObjectFile::printSymbolName(raw_ostream &OS,
- DataRefImpl Symb) const {
+std::error_code ObjectFile::printSymbolName(raw_ostream &OS,
+ DataRefImpl Symb) const {
StringRef Name;
- if (error_code EC = getSymbolName(Symb, Name))
+ if (std::error_code EC = getSymbolName(Symb, Name))
return EC;
OS << Name;
return object_error::success;
}
-error_code ObjectFile::getSymbolAlignment(DataRefImpl DRI,
- uint32_t &Result) const {
+std::error_code ObjectFile::getSymbolAlignment(DataRefImpl DRI,
+ uint32_t &Result) const {
Result = 0;
return object_error::success;
}
@@ -46,9 +45,9 @@ section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
return section_iterator(SectionRef(Sec, this));
}
-ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
- bool BufferOwned,
- sys::fs::file_magic Type) {
+ErrorOr<ObjectFile *>
+ObjectFile::createObjectFile(std::unique_ptr<MemoryBuffer> &Object,
+ sys::fs::file_magic Type) {
if (Type == sys::fs::file_magic::unknown)
Type = sys::fs::identify_magic(Object->getBuffer());
@@ -58,14 +57,12 @@ ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
case sys::fs::file_magic::archive:
case sys::fs::file_magic::macho_universal_binary:
case sys::fs::file_magic::windows_resource:
- if (BufferOwned)
- delete Object;
return object_error::invalid_file_type;
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::elf_executable:
case sys::fs::file_magic::elf_shared_object:
case sys::fs::file_magic::elf_core:
- return createELFObjectFile(Object, BufferOwned);
+ return createELFObjectFile(Object);
case sys::fs::file_magic::macho_object:
case sys::fs::file_magic::macho_executable:
case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
@@ -76,18 +73,19 @@ ErrorOr<ObjectFile *> ObjectFile::createObjectFile(MemoryBuffer *Object,
case sys::fs::file_magic::macho_bundle:
case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
case sys::fs::file_magic::macho_dsym_companion:
- return createMachOObjectFile(Object, BufferOwned);
+ return createMachOObjectFile(Object);
case sys::fs::file_magic::coff_object:
case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
- return createCOFFObjectFile(Object, BufferOwned);
+ return createCOFFObjectFile(std::move(Object));
}
llvm_unreachable("Unexpected Object File Type");
}
ErrorOr<ObjectFile *> ObjectFile::createObjectFile(StringRef ObjectPath) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code EC = MemoryBuffer::getFile(ObjectPath, File))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFile(ObjectPath);
+ if (std::error_code EC = FileOrErr.getError())
return EC;
- return createObjectFile(File.release());
+ return createObjectFile(FileOrErr.get());
}
diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp
new file mode 100644
index 0000000..081fadd
--- /dev/null
+++ b/lib/Object/RecordStreamer.cpp
@@ -0,0 +1,100 @@
+//===-- RecordStreamer.cpp - Record asm defined and used symbols ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RecordStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+void RecordStreamer::markDefined(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Global:
+ S = DefinedGlobal;
+ break;
+ case NeverSeen:
+ case Defined:
+ case Used:
+ S = Defined;
+ break;
+ }
+}
+
+void RecordStreamer::markGlobal(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Defined:
+ S = DefinedGlobal;
+ break;
+
+ case NeverSeen:
+ case Global:
+ case Used:
+ S = Global;
+ break;
+ }
+}
+
+void RecordStreamer::markUsed(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Defined:
+ case Global:
+ break;
+
+ case NeverSeen:
+ case Used:
+ S = Used;
+ break;
+ }
+}
+
+void RecordStreamer::visitUsedSymbol(const MCSymbol &Sym) { markUsed(Sym); }
+
+RecordStreamer::const_iterator RecordStreamer::begin() {
+ return Symbols.begin();
+}
+
+RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); }
+
+RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {}
+
+void RecordStreamer::EmitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ MCStreamer::EmitInstruction(Inst, STI);
+}
+
+void RecordStreamer::EmitLabel(MCSymbol *Symbol) {
+ MCStreamer::EmitLabel(Symbol);
+ markDefined(*Symbol);
+}
+
+void RecordStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+ markDefined(*Symbol);
+ MCStreamer::EmitAssignment(Symbol, Value);
+}
+
+bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+ MCSymbolAttr Attribute) {
+ if (Attribute == MCSA_Global)
+ markGlobal(*Symbol);
+ return true;
+}
+
+void RecordStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {
+ markDefined(*Symbol);
+}
+
+void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
+ markDefined(*Symbol);
+}
diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h
new file mode 100644
index 0000000..10e70ef
--- /dev/null
+++ b/lib/Object/RecordStreamer.h
@@ -0,0 +1,42 @@
+//===-- RecordStreamer.h - Record asm defined and used symbols ---*- C++ -*===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_RECORD_STREAMER
+#define LLVM_OBJECT_RECORD_STREAMER
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class RecordStreamer : public MCStreamer {
+public:
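+ // Seen-state for each symbol, maintained by the mark* methods in
+ // RecordStreamer.cpp: symbols start at NeverSeen; a definition moves a
+ // symbol to Defined, a .globl attribute to Global, both together to
+ // DefinedGlobal, and a bare reference to Used.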
+ enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
+
+private:
+ StringMap<State> Symbols;
+ void markDefined(const MCSymbol &Symbol);
+ void markGlobal(const MCSymbol &Symbol);
+ void markUsed(const MCSymbol &Symbol);
+ void visitUsedSymbol(const MCSymbol &Sym) override;
+
+public:
+ typedef StringMap<State>::const_iterator const_iterator;
+ const_iterator begin();
+ const_iterator end();
+ RecordStreamer(MCContext &Context);
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void EmitLabel(MCSymbol *Symbol) override;
+ void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
+ void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) override;
+ void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) override;
+};
+}
+#endif
diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp
index 495f0b6..30cf1a0 100644
--- a/lib/Object/SymbolicFile.cpp
+++ b/lib/Object/SymbolicFile.cpp
@@ -19,14 +19,14 @@
using namespace llvm;
using namespace object;
-SymbolicFile::SymbolicFile(unsigned int Type, MemoryBuffer *Source,
- bool BufferOwned)
- : Binary(Type, Source, BufferOwned) {}
+SymbolicFile::SymbolicFile(unsigned int Type,
+ std::unique_ptr<MemoryBuffer> Source)
+ : Binary(Type, std::move(Source)) {}
SymbolicFile::~SymbolicFile() {}
ErrorOr<SymbolicFile *>
-SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned,
+SymbolicFile::createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object,
sys::fs::file_magic Type,
LLVMContext *Context) {
if (Type == sys::fs::file_magic::unknown)
@@ -35,14 +35,12 @@ SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned,
switch (Type) {
case sys::fs::file_magic::bitcode:
if (Context)
- return IRObjectFile::createIRObjectFile(Object, *Context, BufferOwned);
+ return IRObjectFile::createIRObjectFile(std::move(Object), *Context);
// Fallthrough
case sys::fs::file_magic::unknown:
case sys::fs::file_magic::archive:
case sys::fs::file_magic::macho_universal_binary:
case sys::fs::file_magic::windows_resource:
- if (BufferOwned)
- delete Object;
return object_error::invalid_file_type;
case sys::fs::file_magic::elf_relocatable:
case sys::fs::file_magic::elf_executable:
@@ -61,7 +59,7 @@ SymbolicFile::createSymbolicFile(MemoryBuffer *Object, bool BufferOwned,
case sys::fs::file_magic::coff_object:
case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
- return ObjectFile::createObjectFile(Object, BufferOwned, Type);
+ return ObjectFile::createObjectFile(Object, Type);
}
llvm_unreachable("Unexpected Binary File Type");
}
diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp
index a5ab8d7..5848bb1 100644
--- a/lib/Option/ArgList.cpp
+++ b/lib/Option/ArgList.cpp
@@ -234,44 +234,40 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0,
void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0,
OptSpecifier Id1, OptSpecifier Id2) const {
- for (arg_iterator it = filtered_begin(Id0, Id1, Id2),
- ie = filtered_end(); it != ie; ++it) {
- (*it)->claim();
- (*it)->render(*this, Output);
+ for (auto Arg : filtered(Id0, Id1, Id2)) {
+ Arg->claim();
+ Arg->render(*this, Output);
}
}
void ArgList::AddAllArgValues(ArgStringList &Output, OptSpecifier Id0,
OptSpecifier Id1, OptSpecifier Id2) const {
- for (arg_iterator it = filtered_begin(Id0, Id1, Id2),
- ie = filtered_end(); it != ie; ++it) {
- (*it)->claim();
- for (unsigned i = 0, e = (*it)->getNumValues(); i != e; ++i)
- Output.push_back((*it)->getValue(i));
+ for (auto Arg : filtered(Id0, Id1, Id2)) {
+ Arg->claim();
+ for (unsigned i = 0, e = Arg->getNumValues(); i != e; ++i)
+ Output.push_back(Arg->getValue(i));
}
}
void ArgList::AddAllArgsTranslated(ArgStringList &Output, OptSpecifier Id0,
const char *Translation,
bool Joined) const {
- for (arg_iterator it = filtered_begin(Id0),
- ie = filtered_end(); it != ie; ++it) {
- (*it)->claim();
+ for (auto Arg : filtered(Id0)) {
+ Arg->claim();
if (Joined) {
Output.push_back(MakeArgString(StringRef(Translation) +
- (*it)->getValue(0)));
+ Arg->getValue(0)));
} else {
Output.push_back(Translation);
- Output.push_back((*it)->getValue(0));
+ Output.push_back(Arg->getValue(0));
}
}
}
void ArgList::ClaimAllArgs(OptSpecifier Id0) const {
- for (arg_iterator it = filtered_begin(Id0),
- ie = filtered_end(); it != ie; ++it)
- (*it)->claim();
+ for (auto Arg : filtered(Id0))
+ Arg->claim();
}
void ArgList::ClaimAllArgs() const {
@@ -350,30 +346,27 @@ void DerivedArgList::AddSynthesizedArg(Arg *A) {
}
Arg *DerivedArgList::MakeFlagArg(const Arg *BaseArg, const Option Opt) const {
- SynthesizedArgs.push_back(make_unique<Arg>(
- Opt,
- ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())),
- BaseArgs.MakeIndex(Opt.getName()), BaseArg));
+ SynthesizedArgs.push_back(
+ make_unique<Arg>(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()),
+ BaseArgs.MakeIndex(Opt.getName()), BaseArg));
return SynthesizedArgs.back().get();
}
Arg *DerivedArgList::MakePositionalArg(const Arg *BaseArg, const Option Opt,
StringRef Value) const {
unsigned Index = BaseArgs.MakeIndex(Value);
- SynthesizedArgs.push_back(make_unique<Arg>(
- Opt,
- ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())),
- Index, BaseArgs.getArgString(Index), BaseArg));
+ SynthesizedArgs.push_back(
+ make_unique<Arg>(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()),
+ Index, BaseArgs.getArgString(Index), BaseArg));
return SynthesizedArgs.back().get();
}
Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt,
StringRef Value) const {
unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value);
- SynthesizedArgs.push_back(make_unique<Arg>(
- Opt,
- ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())),
- Index, BaseArgs.getArgString(Index + 1), BaseArg));
+ SynthesizedArgs.push_back(
+ make_unique<Arg>(Opt, MakeArgString(Opt.getPrefix() + Opt.getName()),
+ Index, BaseArgs.getArgString(Index + 1), BaseArg));
return SynthesizedArgs.back().get();
}
@@ -381,8 +374,7 @@ Arg *DerivedArgList::MakeJoinedArg(const Arg *BaseArg, const Option Opt,
StringRef Value) const {
unsigned Index = BaseArgs.MakeIndex(Opt.getName().str() + Value.str());
SynthesizedArgs.push_back(make_unique<Arg>(
- Opt,
- ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())),
- Index, BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg));
+ Opt, MakeArgString(Opt.getPrefix() + Opt.getName()), Index,
+ BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg));
return SynthesizedArgs.back().get();
}
diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index de2b13d..0121222 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -18,10 +18,10 @@
using namespace llvm;
namespace {
-class InstrProfErrorCategoryType : public error_category {
- const char *name() const override { return "llvm.instrprof"; }
+class InstrProfErrorCategoryType : public std::error_category {
+ const char *name() const LLVM_NOEXCEPT override { return "llvm.instrprof"; }
std::string message(int IE) const override {
- instrprof_error::ErrorType E = static_cast<instrprof_error::ErrorType>(IE);
+ instrprof_error E = static_cast<instrprof_error>(IE);
switch (E) {
case instrprof_error::success:
return "Success";
@@ -52,15 +52,10 @@ class InstrProfErrorCategoryType : public error_category {
}
llvm_unreachable("A value of instrprof_error has no message.");
}
- error_condition default_error_condition(int EV) const override {
- if (EV == instrprof_error::success)
- return errc::success;
- return errc::invalid_argument;
- }
};
}
-const error_category &llvm::instrprof_category() {
+const std::error_category &llvm::instrprof_category() {
static InstrProfErrorCategoryType C;
return C;
}
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 7014f5e..0b36728 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -21,10 +21,13 @@
using namespace llvm;
-static error_code setupMemoryBuffer(std::string Path,
- std::unique_ptr<MemoryBuffer> &Buffer) {
- if (error_code EC = MemoryBuffer::getFileOrSTDIN(Path, Buffer))
+static std::error_code
+setupMemoryBuffer(std::string Path, std::unique_ptr<MemoryBuffer> &Buffer) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFileOrSTDIN(Path);
+ if (std::error_code EC = BufferOrErr.getError())
return EC;
+ Buffer = std::move(BufferOrErr.get());
// Sanity check the file.
if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
@@ -32,15 +35,16 @@ static error_code setupMemoryBuffer(std::string Path,
return instrprof_error::success;
}
-static error_code initializeReader(InstrProfReader &Reader) {
+static std::error_code initializeReader(InstrProfReader &Reader) {
return Reader.readHeader();
}
-error_code InstrProfReader::create(std::string Path,
- std::unique_ptr<InstrProfReader> &Result) {
+std::error_code
+InstrProfReader::create(std::string Path,
+ std::unique_ptr<InstrProfReader> &Result) {
// Set up the buffer to read.
std::unique_ptr<MemoryBuffer> Buffer;
- if (error_code EC = setupMemoryBuffer(Path, Buffer))
+ if (std::error_code EC = setupMemoryBuffer(Path, Buffer))
return EC;
// Create the reader.
@@ -57,11 +61,11 @@ error_code InstrProfReader::create(std::string Path,
return initializeReader(*Result);
}
-error_code IndexedInstrProfReader::create(
+std::error_code IndexedInstrProfReader::create(
std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result) {
// Set up the buffer to read.
std::unique_ptr<MemoryBuffer> Buffer;
- if (error_code EC = setupMemoryBuffer(Path, Buffer))
+ if (std::error_code EC = setupMemoryBuffer(Path, Buffer))
return EC;
// Create the reader.
@@ -78,7 +82,7 @@ void InstrProfIterator::Increment() {
*this = InstrProfIterator();
}
-error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
+std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
// Skip empty lines.
while (!Line.is_at_end() && Line->empty())
++Line;
@@ -157,11 +161,11 @@ bool RawInstrProfReader<IntPtrT>::hasFormat(const MemoryBuffer &DataBuffer) {
uint64_t Magic =
*reinterpret_cast<const uint64_t *>(DataBuffer.getBufferStart());
return getRawMagic<IntPtrT>() == Magic ||
- sys::SwapByteOrder(getRawMagic<IntPtrT>()) == Magic;
+ sys::getSwappedBytes(getRawMagic<IntPtrT>()) == Magic;
}
template <class IntPtrT>
-error_code RawInstrProfReader<IntPtrT>::readHeader() {
+std::error_code RawInstrProfReader<IntPtrT>::readHeader() {
if (!hasFormat(*DataBuffer))
return error(instrprof_error::bad_magic);
if (DataBuffer->getBufferSize() < sizeof(RawHeader))
@@ -173,7 +177,8 @@ error_code RawInstrProfReader<IntPtrT>::readHeader() {
}
template <class IntPtrT>
-error_code RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
+std::error_code
+RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
const char *End = DataBuffer->getBufferEnd();
// Skip zero padding between profiles.
while (CurrentPos != End && *CurrentPos == 0)
@@ -200,7 +205,8 @@ static uint64_t getRawVersion() {
}
template <class IntPtrT>
-error_code RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) {
+std::error_code
+RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) {
if (swap(Header.Version) != getRawVersion())
return error(instrprof_error::unsupported_version);
@@ -229,10 +235,10 @@ error_code RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) {
}
template <class IntPtrT>
-error_code
+std::error_code
RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
if (Data == DataEnd)
- if (error_code EC = readNextHeader(ProfileEnd))
+ if (std::error_code EC = readNextHeader(ProfileEnd))
return EC;
// Get the raw data.
@@ -286,7 +292,7 @@ bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
return Magic == IndexedInstrProf::Magic;
}
-error_code IndexedInstrProfReader::readHeader() {
+std::error_code IndexedInstrProfReader::readHeader() {
const unsigned char *Start =
(const unsigned char *)DataBuffer->getBufferStart();
const unsigned char *Cur = Start;
@@ -324,7 +330,7 @@ error_code IndexedInstrProfReader::readHeader() {
return success();
}
-error_code IndexedInstrProfReader::getFunctionCounts(
+std::error_code IndexedInstrProfReader::getFunctionCounts(
StringRef FuncName, uint64_t &FuncHash, std::vector<uint64_t> &Counts) {
const auto &Iter = Index->find(FuncName);
if (Iter == Index->end())
@@ -339,7 +345,8 @@ error_code IndexedInstrProfReader::getFunctionCounts(
return success();
}
-error_code IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) {
+std::error_code
+IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) {
// Are we out of records?
if (RecordIterator == Index->data_end())
return error(instrprof_error::eof);
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index 83c41d9..e55c299 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -66,9 +66,10 @@ public:
};
}
-error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName,
- uint64_t FunctionHash,
- ArrayRef<uint64_t> Counters) {
+std::error_code
+InstrProfWriter::addFunctionCounts(StringRef FunctionName,
+ uint64_t FunctionHash,
+ ArrayRef<uint64_t> Counters) {
auto Where = FunctionData.find(FunctionName);
if (Where == FunctionData.end()) {
// If this is the first time we've seen this function, just add it.
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index f9fe095..7989e30 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -1372,7 +1372,9 @@ APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract)
case PackCategoriesIntoKey(fcZero, fcNaN):
case PackCategoriesIntoKey(fcNormal, fcNaN):
case PackCategoriesIntoKey(fcInfinity, fcNaN):
- sign = false;
+ // We need to be sure to flip the sign here for subtraction because we
+ // don't have a separate negate operation so -NaN becomes 0 - NaN here.
+ sign = rhs.sign ^ subtract;
category = fcNaN;
copySignificand(rhs);
return opOK;
diff --git a/lib/Support/ARMWinEH.cpp b/lib/Support/ARMWinEH.cpp
new file mode 100644
index 0000000..03c150f
--- /dev/null
+++ b/lib/Support/ARMWinEH.cpp
@@ -0,0 +1,38 @@
+//===-- ARMWinEH.cpp - Windows on ARM EH Support Functions ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ARMWinEH.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace ARM {
+namespace WinEH {
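+// Computes the GPR and VFP save masks implied by a packed unwind record:
+// Reg() counts the saved registers, R() selects VFP vs. integer registers,
+// L() adds LR, and C() adds the frame-chain register. As an illustrative
+// example, Reg()=3, R()=0, L()=1, C()=0 with no prologue folding gives a
+// GPR mask of 0x40F0, i.e. r4-r7 plus LR.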
+std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF) {
+ uint8_t NumRegisters = RF.Reg();
+ uint8_t RegistersVFP = RF.R();
+ uint8_t LinkRegister = RF.L();
+ uint8_t ChainedFrame = RF.C();
+
+ uint16_t GPRMask = (ChainedFrame << 11) | (LinkRegister << 14);
+ uint32_t VFPMask = 0;
+
+ if (RegistersVFP)
+ VFPMask |= (((1 << ((NumRegisters + 1) % 8)) - 1) << 8);
+ else
+ GPRMask |= (((1 << (NumRegisters + 1)) - 1) << 4);
+
+ if (PrologueFolding(RF))
+ GPRMask |= (((1 << (NumRegisters + 1)) - 1) << (~RF.StackAdjust() & 0x3));
+
+ return std::make_pair(GPRMask, VFPMask);
+}
+}
+}
+}
diff --git a/lib/Support/Android.mk b/lib/Support/Android.mk
index 6efccf5..5de8d3f 100644
--- a/lib/Support/Android.mk
+++ b/lib/Support/Android.mk
@@ -6,6 +6,7 @@ support_SRC_FILES := \
APInt.cpp \
APSInt.cpp \
ARMBuildAttrs.cpp \
+ ARMWinEH.cpp \
Atomic.cpp \
BlockFrequency.cpp \
BranchProbability.cpp \
@@ -49,13 +50,16 @@ support_SRC_FILES := \
PrettyStackTrace.cpp \
Process.cpp \
Program.cpp \
+ RandomNumberGenerator.cpp \
Regex.cpp \
RWMutex.cpp \
+ ScaledNumber.cpp \
SearchForAddressOfSpecialSymbol.cpp \
Signals.cpp \
SmallPtrSet.cpp \
SmallVector.cpp \
SourceMgr.cpp \
+ SpecialCaseList.cpp \
Statistic.cpp \
StreamableMemoryObject.cpp \
StringExtras.cpp \
@@ -84,8 +88,7 @@ support_SRC_FILES := \
regerror.c \
regexec.c \
regfree.c \
- regstrlcpy.c \
- system_error.cpp
+ regstrlcpy.c
# For the host
diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp
index 2ef32b0..ac4ff3e 100644
--- a/lib/Support/Atomic.cpp
+++ b/lib/Support/Atomic.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This header file implements atomic operations.
+// This file implements atomic operations.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index b4c674d..9ecd559 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMSupport
APInt.cpp
APSInt.cpp
ARMBuildAttrs.cpp
+ ARMWinEH.cpp
Allocator.cpp
BlockFrequency.cpp
BranchProbability.cpp
@@ -40,10 +41,13 @@ add_llvm_library(LLVMSupport
MD5.cpp
PluginLoader.cpp
PrettyStackTrace.cpp
+ RandomNumberGenerator.cpp
Regex.cpp
+ ScaledNumber.cpp
SmallPtrSet.cpp
SmallVector.cpp
SourceMgr.cpp
+ SpecialCaseList.cpp
Statistic.cpp
StreamableMemoryObject.cpp
StringExtras.cpp
@@ -82,7 +86,6 @@ add_llvm_library(LLVMSupport
RWMutex.cpp
SearchForAddressOfSpecialSymbol.cpp
Signals.cpp
- system_error.cpp
TargetRegistry.cpp
ThreadLocal.cpp
Threading.cpp
@@ -99,7 +102,6 @@ add_llvm_library(LLVMSupport
Unix/Program.inc
Unix/RWMutex.inc
Unix/Signals.inc
- Unix/system_error.inc
Unix/ThreadLocal.inc
Unix/TimeValue.inc
Unix/Watchdog.inc
@@ -112,7 +114,6 @@ add_llvm_library(LLVMSupport
Windows/Program.inc
Windows/RWMutex.inc
Windows/Signals.inc
- Windows/system_error.inc
Windows/ThreadLocal.inc
Windows/TimeValue.inc
Windows/Watchdog.inc
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 37bbf48..87348f7 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -31,10 +31,10 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <cerrno>
#include <cstdlib>
#include <map>
+#include <system_error>
using namespace llvm;
using namespace cl;
@@ -145,6 +145,7 @@ void OptionCategory::registerCategory() {
static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
SmallVectorImpl<Option*> &SinkOpts,
StringMap<Option*> &OptionsMap) {
+ bool HadErrors = false;
SmallVector<const char*, 16> OptionNames;
Option *CAOpt = nullptr; // The ConsumeAfter option if it exists.
for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) {
@@ -158,8 +159,9 @@ static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
// Add argument to the argument map!
if (OptionsMap.GetOrCreateValue(OptionNames[i], O).second != O) {
- errs() << ProgramName << ": CommandLine Error: Argument '"
- << OptionNames[i] << "' defined more than once!\n";
+ errs() << ProgramName << ": CommandLine Error: Option '"
+ << OptionNames[i] << "' registered more than once!\n";
+ HadErrors = true;
}
}
@@ -171,8 +173,10 @@ static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
else if (O->getMiscFlags() & cl::Sink) // Remember sink options
SinkOpts.push_back(O);
else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) {
- if (CAOpt)
+ if (CAOpt) {
O->error("Cannot specify more than one option with cl::ConsumeAfter!");
+ HadErrors = true;
+ }
CAOpt = O;
}
}
@@ -182,6 +186,12 @@ static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
// Make sure that they are in order of registration not backwards.
std::reverse(PositionalOpts.begin(), PositionalOpts.end());
+
+ // Fail hard if there were errors. These are strictly unrecoverable and
+ // indicate serious issues such as conflicting option names or an incorrectly
+ // linked LLVM distribution.
+ if (HadErrors)
+ report_fatal_error("inconsistency in registered CommandLine options");
}
@@ -621,9 +631,11 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
TokenizerCallback Tokenizer,
SmallVectorImpl<const char *> &NewArgv) {
- std::unique_ptr<MemoryBuffer> MemBuf;
- if (MemoryBuffer::getFile(FName, MemBuf))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr =
+ MemoryBuffer::getFile(FName);
+ if (!MemBufOrErr)
return false;
+ std::unique_ptr<MemoryBuffer> MemBuf = std::move(MemBufOrErr.get());
StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize());
// If we have a UTF-16 byte order mark, convert to UTF-8 for parsing.
@@ -1699,7 +1711,7 @@ public:
OS << "LLVM (http://llvm.org/):\n"
<< " " << PACKAGE_NAME << " version " << PACKAGE_VERSION;
#ifdef LLVM_VERSION_INFO
- OS << LLVM_VERSION_INFO;
+ OS << " " << LLVM_VERSION_INFO;
#endif
OS << "\n ";
#ifndef __OPTIMIZE__
diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c
index 23f17ca..128459a 100644
--- a/lib/Support/ConvertUTF.c
+++ b/lib/Support/ConvertUTF.c
@@ -51,6 +51,7 @@
#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
+#include <assert.h>
static const int halfShift = 10; /* used for shifting by 10 bits */
@@ -392,6 +393,99 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
/* --------------------------------------------------------------------- */
+static unsigned
+findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
+ const UTF8 *sourceEnd) {
+ UTF8 b1, b2, b3;
+
+ assert(!isLegalUTF8Sequence(source, sourceEnd));
+
+ /*
+ * Unicode 6.3.0, D93b:
+ *
+ * Maximal subpart of an ill-formed subsequence: The longest code unit
+ * subsequence starting at an unconvertible offset that is either:
+ * a. the initial subsequence of a well-formed code unit sequence, or
+ * b. a subsequence of length one.
+ */
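+
+ /*
+ * Examples (illustrative, following the definition above): in <C2 41> the
+ * maximal subpart at offset 0 is <C2> (length 1); in <E1 80 41> it is
+ * <E1 80> (length 2); and in <F1 80 80 41> it is <F1 80 80> (length 3).
+ */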
+
+ if (source == sourceEnd)
+ return 0;
+
+ /*
+ * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
+ * Byte Sequences.
+ */
+
+ b1 = *source;
+ ++source;
+ if (b1 >= 0xC2 && b1 <= 0xDF) {
+ /*
+ * First byte is valid, but we know that this code unit sequence is
+ * invalid, so the maximal subpart has to end after the first byte.
+ */
+ return 1;
+ }
+
+ if (source == sourceEnd)
+ return 1;
+
+ b2 = *source;
+ ++source;
+
+ if (b1 == 0xE0) {
+ return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 >= 0xE1 && b1 <= 0xEC) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 == 0xED) {
+ return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
+ }
+ if (b1 >= 0xEE && b1 <= 0xEF) {
+ return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
+ }
+ if (b1 == 0xF0) {
+ if (b2 >= 0x90 && b2 <= 0xBF) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+ if (b1 >= 0xF1 && b1 <= 0xF3) {
+ if (b2 >= 0x80 && b2 <= 0xBF) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+ if (b1 == 0xF4) {
+ if (b2 >= 0x80 && b2 <= 0x8F) {
+ if (source == sourceEnd)
+ return 2;
+
+ b3 = *source;
+ return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
+ }
+ return 1;
+ }
+
+ assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
+ /*
+ * There are no valid sequences that start with these bytes. Maximal subpart
+ * is defined to have length 1 in these cases.
+ */
+ return 1;
+}
+
+/* --------------------------------------------------------------------- */
+
/*
* Exported function to return the total number of bytes in a codepoint
* represented in UTF-8, given the value of the first byte.
@@ -491,9 +585,10 @@ ConversionResult ConvertUTF8toUTF16 (
/* --------------------------------------------------------------------- */
-ConversionResult ConvertUTF8toUTF32 (
+static ConversionResult ConvertUTF8toUTF32Impl(
const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
+ Boolean InputIsPartial) {
ConversionResult result = conversionOK;
const UTF8* source = *sourceStart;
UTF32* target = *targetStart;
@@ -501,12 +596,42 @@ ConversionResult ConvertUTF8toUTF32 (
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (extraBytesToRead >= sourceEnd - source) {
- result = sourceExhausted; break;
+ if (flags == strictConversion || InputIsPartial) {
+ result = sourceExhausted;
+ break;
+ } else {
+ result = sourceIllegal;
+
+ /*
+ * Replace the maximal subpart of ill-formed sequence with
+ * replacement character.
+ */
+ source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
+ sourceEnd);
+ *target++ = UNI_REPLACEMENT_CHAR;
+ continue;
+ }
}
+ if (target >= targetEnd) {
+ result = targetExhausted; break;
+ }
+
/* Do this check whether lenient or strict */
if (!isLegalUTF8(source, extraBytesToRead+1)) {
result = sourceIllegal;
- break;
+ if (flags == strictConversion) {
+ /* Abort conversion. */
+ break;
+ } else {
+ /*
+ * Replace the maximal subpart of ill-formed sequence with
+ * replacement character.
+ */
+ source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
+ sourceEnd);
+ *target++ = UNI_REPLACEMENT_CHAR;
+ continue;
+ }
}
/*
* The cases all fall through. See "Note A" below.
@@ -521,10 +646,6 @@ ConversionResult ConvertUTF8toUTF32 (
}
ch -= offsetsFromUTF8[extraBytesToRead];
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up the source pointer! */
- result = targetExhausted; break;
- }
if (ch <= UNI_MAX_LEGAL_UTF32) {
/*
* UTF-16 surrogate values are illegal in UTF-32, and anything
@@ -551,6 +672,22 @@ ConversionResult ConvertUTF8toUTF32 (
return result;
}
+ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
+ const UTF8 *sourceEnd,
+ UTF32 **targetStart,
+ UTF32 *targetEnd,
+ ConversionFlags flags) {
+ return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
+ flags, /*InputIsPartial=*/true);
+}
+
+ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
+ const UTF8 *sourceEnd, UTF32 **targetStart,
+ UTF32 *targetEnd, ConversionFlags flags) {
+ return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
+ flags, /*InputIsPartial=*/false);
+}
+
/* ---------------------------------------------------------------------
Note A.
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index a426377..9b0e443 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -22,7 +22,8 @@ namespace {
struct CrashRecoveryContextImpl;
-static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext;
+static ManagedStatic<
+ sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext;
struct CrashRecoveryContextImpl {
CrashRecoveryContext *CRC;
@@ -231,7 +232,8 @@ void CrashRecoveryContext::Disable() {
#include <signal.h>
-static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
+static const int Signals[] =
+ { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]);
static struct sigaction PrevActions[NumSignals];
@@ -330,12 +332,26 @@ const std::string &CrashRecoveryContext::getBacktrace() const {
return CRC->Backtrace;
}
-//
+// FIXME: Portability.
+static void setThreadBackgroundPriority() {
+#ifdef __APPLE__
+ setpriority(PRIO_DARWIN_THREAD, 0, PRIO_DARWIN_BG);
+#endif
+}
+
+static bool hasThreadBackgroundPriority() {
+#ifdef __APPLE__
+ return getpriority(PRIO_DARWIN_THREAD, 0) == 1;
+#else
+ return false;
+#endif
+}
namespace {
struct RunSafelyOnThreadInfo {
function_ref<void()> Fn;
CrashRecoveryContext *CRC;
+ bool UseBackgroundPriority;
bool Result;
};
}
@@ -343,11 +359,16 @@ struct RunSafelyOnThreadInfo {
static void RunSafelyOnThread_Dispatch(void *UserData) {
RunSafelyOnThreadInfo *Info =
reinterpret_cast<RunSafelyOnThreadInfo*>(UserData);
+
+ if (Info->UseBackgroundPriority)
+ setThreadBackgroundPriority();
+
Info->Result = Info->CRC->RunSafely(Info->Fn);
}
bool CrashRecoveryContext::RunSafelyOnThread(function_ref<void()> Fn,
unsigned RequestedStackSize) {
- RunSafelyOnThreadInfo Info = { Fn, this, false };
+ bool UseBackgroundPriority = hasThreadBackgroundPriority();
+ RunSafelyOnThreadInfo Info = { Fn, this, UseBackgroundPriority, false };
llvm_execute_on_thread(RunSafelyOnThread_Dispatch, &Info, RequestedStackSize);
if (CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *)Impl)
CRC->setSwitchedThread();
diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp
index 7b82921..5d6d60a 100644
--- a/lib/Support/DataExtractor.cpp
+++ b/lib/Support/DataExtractor.cpp
@@ -21,7 +21,7 @@ static T getU(uint32_t *offset_ptr, const DataExtractor *de,
if (de->isValidOffsetForDataOfSize(offset, sizeof(val))) {
std::memcpy(&val, &Data[offset], sizeof(val));
if (sys::IsLittleEndianHost != isLittleEndian)
- val = sys::SwapByteOrder(val);
+ sys::swapByteOrder(val);
// Advance the offset
*offset_ptr += sizeof(val);
diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp
index eec8584..32653de 100644
--- a/lib/Support/DataStream.cpp
+++ b/lib/Support/DataStream.cpp
@@ -18,10 +18,10 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/system_error.h"
#include <cerrno>
#include <cstdio>
#include <string>
+#include <system_error>
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
#else
@@ -64,11 +64,11 @@ public:
return read(Fd, buf, len);
}
- error_code OpenFile(const std::string &Filename) {
+ std::error_code OpenFile(const std::string &Filename) {
if (Filename == "-") {
Fd = 0;
sys::ChangeStdinToBinary();
- return error_code::success();
+ return std::error_code();
}
return sys::fs::openFileForRead(Filename, Fd);
@@ -81,7 +81,7 @@ namespace llvm {
DataStreamer *getDataFileStreamer(const std::string &Filename,
std::string *StrError) {
DataFileStreamer *s = new DataFileStreamer();
- if (error_code e = s->OpenFile(Filename)) {
+ if (std::error_code e = s->OpenFile(Filename)) {
*StrError = std::string("Could not open ") + Filename + ": " +
e.message() + "\n";
return nullptr;
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index 82d7c0c..d2b551e 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This header file implements the operating system DynamicLibrary concept.
+// This file implements the operating system DynamicLibrary concept.
//
// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
//
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 342c4f0..c36007f 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -18,8 +18,12 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
#include "llvm/Support/Threading.h"
+#include "llvm/Support/WindowsError.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdlib>
@@ -37,17 +41,20 @@ using namespace llvm;
static fatal_error_handler_t ErrorHandler = nullptr;
static void *ErrorHandlerUserData = nullptr;
+static sys::Mutex ErrorHandlerMutex;
+
void llvm::install_fatal_error_handler(fatal_error_handler_t handler,
void *user_data) {
- assert(!llvm_is_multithreaded() &&
- "Cannot register error handlers after starting multithreaded mode!\n");
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
assert(!ErrorHandler && "Error handler already registered!\n");
ErrorHandler = handler;
ErrorHandlerUserData = user_data;
}
void llvm::remove_fatal_error_handler() {
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
ErrorHandler = nullptr;
+ ErrorHandlerUserData = nullptr;
}
void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) {
@@ -63,8 +70,18 @@ void llvm::report_fatal_error(StringRef Reason, bool GenCrashDiag) {
}
void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
- if (ErrorHandler) {
- ErrorHandler(ErrorHandlerUserData, Reason.str(), GenCrashDiag);
+ llvm::fatal_error_handler_t handler = nullptr;
+ void* handlerData = nullptr;
+ {
+ // Only acquire the mutex while reading the handler, so as not to invoke a
+ // user-supplied callback under a lock.
+ llvm::MutexGuard Lock(ErrorHandlerMutex);
+ handler = ErrorHandler;
+ handlerData = ErrorHandlerUserData;
+ }
+
+ if (handler) {
+ handler(handlerData, Reason.str(), GenCrashDiag);
} else {
// Blast the result out to stderr. We don't try hard to make sure this
// succeeds (e.g. handling EINTR) and we can't use errs() here because
@@ -119,3 +136,70 @@ void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) {
void LLVMResetFatalErrorHandler() {
remove_fatal_error_handler();
}
+
+#ifdef LLVM_ON_WIN32
+
+#include <winerror.h>
+
+// I'd rather not double the line count of the following.
+#define MAP_ERR_TO_COND(x, y) \
+ case x: \
+ return make_error_code(errc::y)
+
+std::error_code llvm::mapWindowsError(unsigned EV) {
+ switch (EV) {
+ MAP_ERR_TO_COND(ERROR_ACCESS_DENIED, permission_denied);
+ MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS, file_exists);
+ MAP_ERR_TO_COND(ERROR_BAD_UNIT, no_such_device);
+ MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW, filename_too_long);
+ MAP_ERR_TO_COND(ERROR_BUSY, device_or_resource_busy);
+ MAP_ERR_TO_COND(ERROR_BUSY_DRIVE, device_or_resource_busy);
+ MAP_ERR_TO_COND(ERROR_CANNOT_MAKE, permission_denied);
+ MAP_ERR_TO_COND(ERROR_CANTOPEN, io_error);
+ MAP_ERR_TO_COND(ERROR_CANTREAD, io_error);
+ MAP_ERR_TO_COND(ERROR_CANTWRITE, io_error);
+ MAP_ERR_TO_COND(ERROR_CURRENT_DIRECTORY, permission_denied);
+ MAP_ERR_TO_COND(ERROR_DEV_NOT_EXIST, no_such_device);
+ MAP_ERR_TO_COND(ERROR_DEVICE_IN_USE, device_or_resource_busy);
+ MAP_ERR_TO_COND(ERROR_DIR_NOT_EMPTY, directory_not_empty);
+ MAP_ERR_TO_COND(ERROR_DIRECTORY, invalid_argument);
+ MAP_ERR_TO_COND(ERROR_DISK_FULL, no_space_on_device);
+ MAP_ERR_TO_COND(ERROR_FILE_EXISTS, file_exists);
+ MAP_ERR_TO_COND(ERROR_FILE_NOT_FOUND, no_such_file_or_directory);
+ MAP_ERR_TO_COND(ERROR_HANDLE_DISK_FULL, no_space_on_device);
+ MAP_ERR_TO_COND(ERROR_INVALID_ACCESS, permission_denied);
+ MAP_ERR_TO_COND(ERROR_INVALID_DRIVE, no_such_device);
+ MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION, function_not_supported);
+ MAP_ERR_TO_COND(ERROR_INVALID_HANDLE, invalid_argument);
+ MAP_ERR_TO_COND(ERROR_INVALID_NAME, invalid_argument);
+ MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION, no_lock_available);
+ MAP_ERR_TO_COND(ERROR_LOCKED, no_lock_available);
+ MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK, invalid_argument);
+ MAP_ERR_TO_COND(ERROR_NOACCESS, permission_denied);
+ MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY, not_enough_memory);
+ MAP_ERR_TO_COND(ERROR_NOT_READY, resource_unavailable_try_again);
+ MAP_ERR_TO_COND(ERROR_OPEN_FAILED, io_error);
+ MAP_ERR_TO_COND(ERROR_OPEN_FILES, device_or_resource_busy);
+ MAP_ERR_TO_COND(ERROR_OUTOFMEMORY, not_enough_memory);
+ MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND, no_such_file_or_directory);
+ MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory);
+ MAP_ERR_TO_COND(ERROR_READ_FAULT, io_error);
+ MAP_ERR_TO_COND(ERROR_RETRY, resource_unavailable_try_again);
+ MAP_ERR_TO_COND(ERROR_SEEK, io_error);
+ MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION, permission_denied);
+ MAP_ERR_TO_COND(ERROR_TOO_MANY_OPEN_FILES, too_many_files_open);
+ MAP_ERR_TO_COND(ERROR_WRITE_FAULT, io_error);
+ MAP_ERR_TO_COND(ERROR_WRITE_PROTECT, permission_denied);
+ MAP_ERR_TO_COND(WSAEACCES, permission_denied);
+ MAP_ERR_TO_COND(WSAEBADF, bad_file_descriptor);
+ MAP_ERR_TO_COND(WSAEFAULT, bad_address);
+ MAP_ERR_TO_COND(WSAEINTR, interrupted);
+ MAP_ERR_TO_COND(WSAEINVAL, invalid_argument);
+ MAP_ERR_TO_COND(WSAEMFILE, too_many_files_open);
+ MAP_ERR_TO_COND(WSAENAMETOOLONG, filename_too_long);
+ default:
+ return std::error_code(EV, std::system_category());
+ }
+}
+
+#endif
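A hypothetical Windows-only caller of the mapping above; GetLastError() is the Win32 API, while the wrapper name is made up:

#ifdef LLVM_ON_WIN32
#include <windows.h>

static std::error_code lastErrorAsErrorCode() {
  return llvm::mapWindowsError(::GetLastError());
}
#endif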
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 49311c2..2e740ca 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -11,10 +11,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using llvm::sys::fs::mapped_file_region;
@@ -30,13 +31,13 @@ FileOutputBuffer::~FileOutputBuffer() {
sys::fs::remove(Twine(TempPath));
}
-error_code FileOutputBuffer::create(StringRef FilePath,
- size_t Size,
- std::unique_ptr<FileOutputBuffer> &Result,
- unsigned Flags) {
+std::error_code
+FileOutputBuffer::create(StringRef FilePath, size_t Size,
+ std::unique_ptr<FileOutputBuffer> &Result,
+ unsigned Flags) {
// If file already exists, it must be a regular file (to be mappable).
sys::fs::file_status Stat;
- error_code EC = sys::fs::status(FilePath, Stat);
+ std::error_code EC = sys::fs::status(FilePath, Stat);
switch (Stat.type()) {
case sys::fs::file_type::file_not_found:
// If file does not exist, we'll create one.
@@ -81,16 +82,16 @@ error_code FileOutputBuffer::create(StringRef FilePath,
if (Result)
MappedFile.release();
- return error_code::success();
+ return std::error_code();
}
-error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
+std::error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
// Unmap buffer, letting OS flush dirty pages to file on disk.
Region.reset(nullptr);
// If requested, resize file as part of commit.
if ( NewSmallerSize != -1 ) {
- error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize);
+ std::error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize);
if (EC)
return EC;
}
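Sketch of the updated calling convention, assuming the existing getBufferStart() and commit() members; the wrapper and its arguments are made up:

#include <cstring>

static void writeBlob(llvm::StringRef Path, llvm::StringRef Bytes) {
  std::unique_ptr<llvm::FileOutputBuffer> Buf;
  if (std::error_code EC =
          llvm::FileOutputBuffer::create(Path, Bytes.size(), Buf))
    return; // real code would report EC.message()
  std::memcpy(Buf->getBufferStart(), Bytes.data(), Bytes.size());
  Buf->commit();
}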
diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
index b2dc47d..8a23491 100644
--- a/lib/Support/FileUtilities.cpp
+++ b/lib/Support/FileUtilities.cpp
@@ -17,10 +17,10 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <cctype>
#include <cstdlib>
#include <cstring>
+#include <system_error>
using namespace llvm;
static bool isSignedChar(char C) {
@@ -176,18 +176,21 @@ int llvm::DiffFilesWithTolerance(StringRef NameA,
std::string *Error) {
// Now it's safe to mmap the files into memory because both files

// have a non-zero size.
- std::unique_ptr<MemoryBuffer> F1;
- if (error_code ec = MemoryBuffer::getFile(NameA, F1)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> F1OrErr = MemoryBuffer::getFile(NameA);
+ if (std::error_code EC = F1OrErr.getError()) {
if (Error)
- *Error = ec.message();
+ *Error = EC.message();
return 2;
}
- std::unique_ptr<MemoryBuffer> F2;
- if (error_code ec = MemoryBuffer::getFile(NameB, F2)) {
+ std::unique_ptr<MemoryBuffer> F1 = std::move(F1OrErr.get());
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> F2OrErr = MemoryBuffer::getFile(NameB);
+ if (std::error_code EC = F2OrErr.getError()) {
if (Error)
- *Error = ec.message();
+ *Error = EC.message();
return 2;
}
+ std::unique_ptr<MemoryBuffer> F2 = std::move(F2OrErr.get());
// Okay, now that we opened the files, scan them for the first difference.
const char *File1Start = F1->getBufferStart();
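The shape of the ErrorOr idiom this hunk introduces, reduced to a single buffer (dumpFileSize is a made-up name):

static void dumpFileSize(llvm::StringRef Name) {
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BufOrErr =
      llvm::MemoryBuffer::getFile(Name);
  if (std::error_code EC = BufOrErr.getError()) {
    llvm::errs() << Name << ": " << EC.message() << "\n";
    return;
  }
  llvm::errs() << Name << ": " << BufOrErr.get()->getBufferSize() << " bytes\n";
}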
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index f5b2943..e68ee43 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp
@@ -68,7 +68,7 @@ StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
FD = -1;
SmallString<128> Filename;
- error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename);
+ std::error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename);
if (EC) {
errs() << "Error: " << EC.message() << "\n";
return "";
@@ -78,148 +78,165 @@ std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
return Filename.str();
}
-// Execute the graph viewer. Return true if successful.
-static bool LLVM_ATTRIBUTE_UNUSED
-ExecGraphViewer(StringRef ExecPath, std::vector<const char*> &args,
- StringRef Filename, bool wait, std::string &ErrMsg) {
+// Execute the graph viewer. Return true if there were errors.
+static bool ExecGraphViewer(StringRef ExecPath, std::vector<const char *> &args,
+ StringRef Filename, bool wait,
+ std::string &ErrMsg) {
+ assert(args.back() == nullptr);
if (wait) {
- if (sys::ExecuteAndWait(ExecPath, &args[0],nullptr,nullptr,0,0,&ErrMsg)) {
+ if (sys::ExecuteAndWait(ExecPath, args.data(), nullptr, nullptr, 0, 0,
+ &ErrMsg)) {
errs() << "Error: " << ErrMsg << "\n";
- return false;
+ return true;
}
sys::fs::remove(Filename);
errs() << " done. \n";
- }
- else {
- sys::ExecuteNoWait(ExecPath, &args[0],nullptr,nullptr,0,&ErrMsg);
+ } else {
+ sys::ExecuteNoWait(ExecPath, args.data(), nullptr, nullptr, 0, &ErrMsg);
errs() << "Remember to erase graph file: " << Filename.str() << "\n";
}
- return true;
+ return false;
+}
+
+struct GraphSession {
+ std::string LogBuffer;
+ bool TryFindProgram(StringRef Names, std::string &ProgramPath) {
+ raw_string_ostream Log(LogBuffer);
+ SmallVector<StringRef, 8> parts;
+ Names.split(parts, "|");
+ for (auto Name : parts) {
+ ProgramPath = sys::FindProgramByName(Name);
+ if (!ProgramPath.empty())
+ return true;
+ Log << " Tried '" << Name << "'\n";
+ }
+ return false;
+ }
+};
+
+static const char *getProgramName(GraphProgram::Name program) {
+ switch (program) {
+ case GraphProgram::DOT:
+ return "dot";
+ case GraphProgram::FDP:
+ return "fdp";
+ case GraphProgram::NEATO:
+ return "neato";
+ case GraphProgram::TWOPI:
+ return "twopi";
+ case GraphProgram::CIRCO:
+ return "circo";
+ }
+ llvm_unreachable("bad kind");
}
-void llvm::DisplayGraph(StringRef FilenameRef, bool wait,
+bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
GraphProgram::Name program) {
std::string Filename = FilenameRef;
wait &= !ViewBackground;
std::string ErrMsg;
-#if HAVE_GRAPHVIZ
- std::string Graphviz(LLVM_PATH_GRAPHVIZ);
-
- std::vector<const char*> args;
- args.push_back(Graphviz.c_str());
- args.push_back(Filename.c_str());
- args.push_back(nullptr);
-
- errs() << "Running 'Graphviz' program... ";
- if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg))
- return;
-
-#elif HAVE_XDOT
- std::vector<const char*> args;
- args.push_back(LLVM_PATH_XDOT);
- args.push_back(Filename.c_str());
-
- switch (program) {
- case GraphProgram::DOT: args.push_back("-f"); args.push_back("dot"); break;
- case GraphProgram::FDP: args.push_back("-f"); args.push_back("fdp"); break;
- case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break;
- case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break;
- case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break;
+ std::string ViewerPath;
+ GraphSession S;
+
+ // Graphviz
+ if (S.TryFindProgram("Graphviz", ViewerPath)) {
+ std::vector<const char *> args;
+ args.push_back(ViewerPath.c_str());
+ args.push_back(Filename.c_str());
+ args.push_back(nullptr);
+
+ errs() << "Running 'Graphviz' program... ";
+ return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg);
}
- args.push_back(0);
+ // xdot
+ if (S.TryFindProgram("xdot|xdot.py", ViewerPath)) {
+ std::vector<const char *> args;
+ args.push_back(ViewerPath.c_str());
+ args.push_back(Filename.c_str());
- errs() << "Running 'xdot.py' program... ";
- if (!ExecGraphViewer(LLVM_PATH_XDOT, args, Filename, wait, ErrMsg))
- return;
+ args.push_back("-f");
+ args.push_back(getProgramName(program));
-#elif (HAVE_GV && (HAVE_DOT || HAVE_FDP || HAVE_NEATO || \
- HAVE_TWOPI || HAVE_CIRCO))
- std::string PSFilename = Filename + ".ps";
- std::string prog;
+ args.push_back(nullptr);
- // Set default grapher
-#if HAVE_CIRCO
- prog = LLVM_PATH_CIRCO;
-#endif
-#if HAVE_TWOPI
- prog = LLVM_PATH_TWOPI;
-#endif
-#if HAVE_NEATO
- prog = LLVM_PATH_NEATO;
-#endif
-#if HAVE_FDP
- prog = LLVM_PATH_FDP;
-#endif
-#if HAVE_DOT
- prog = LLVM_PATH_DOT;
-#endif
+ errs() << "Running 'xdot.py' program... ";
+ return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg);
+ }
- // Find which program the user wants
-#if HAVE_DOT
- if (program == GraphProgram::DOT)
- prog = LLVM_PATH_DOT;
-#endif
-#if (HAVE_FDP)
- if (program == GraphProgram::FDP)
- prog = LLVM_PATH_FDP;
-#endif
-#if (HAVE_NEATO)
- if (program == GraphProgram::NEATO)
- prog = LLVM_PATH_NEATO;
-#endif
-#if (HAVE_TWOPI)
- if (program == GraphProgram::TWOPI)
- prog = LLVM_PATH_TWOPI;
-#endif
-#if (HAVE_CIRCO)
- if (program == GraphProgram::CIRCO)
- prog = LLVM_PATH_CIRCO;
+ enum PSViewerKind { PSV_None, PSV_OSXOpen, PSV_XDGOpen, PSV_Ghostview };
+ PSViewerKind PSViewer = PSV_None;
+#ifdef __APPLE__
+ if (!PSViewer && S.TryFindProgram("open", ViewerPath))
+ PSViewer = PSV_OSXOpen;
#endif
+ if (!PSViewer && S.TryFindProgram("gv", ViewerPath))
+ PSViewer = PSV_Ghostview;
+ if (!PSViewer && S.TryFindProgram("xdg-open", ViewerPath))
+ PSViewer = PSV_XDGOpen;
+
+ // PostScript graph generator + PostScript viewer
+ std::string GeneratorPath;
+ if (PSViewer &&
+ (S.TryFindProgram(getProgramName(program), GeneratorPath) ||
+ S.TryFindProgram("dot|fdp|neato|twopi|circo", GeneratorPath))) {
+ std::string PSFilename = Filename + ".ps";
+
+ std::vector<const char *> args;
+ args.push_back(GeneratorPath.c_str());
+ args.push_back("-Tps");
+ args.push_back("-Nfontname=Courier");
+ args.push_back("-Gsize=7.5,10");
+ args.push_back(Filename.c_str());
+ args.push_back("-o");
+ args.push_back(PSFilename.c_str());
+ args.push_back(nullptr);
+
+ errs() << "Running '" << GeneratorPath << "' program... ";
+
+ if (ExecGraphViewer(GeneratorPath, args, Filename, wait, ErrMsg))
+ return true;
+
+ args.clear();
+ args.push_back(ViewerPath.c_str());
+ switch (PSViewer) {
+ case PSV_OSXOpen:
+ args.push_back("-W");
+ args.push_back(PSFilename.c_str());
+ break;
+ case PSV_XDGOpen:
+ wait = false;
+ args.push_back(PSFilename.c_str());
+ break;
+ case PSV_Ghostview:
+ args.push_back("--spartan");
+ args.push_back(PSFilename.c_str());
+ break;
+ case PSV_None:
+ llvm_unreachable("Invalid viewer");
+ }
+ args.push_back(nullptr);
- std::vector<const char*> args;
- args.push_back(prog.c_str());
- args.push_back("-Tps");
- args.push_back("-Nfontname=Courier");
- args.push_back("-Gsize=7.5,10");
- args.push_back(Filename.c_str());
- args.push_back("-o");
- args.push_back(PSFilename.c_str());
- args.push_back(0);
-
- errs() << "Running '" << prog << "' program... ";
-
- if (!ExecGraphViewer(prog, args, Filename, wait, ErrMsg))
- return;
-
- std::string gv(LLVM_PATH_GV);
- args.clear();
- args.push_back(gv.c_str());
- args.push_back(PSFilename.c_str());
- args.push_back("--spartan");
- args.push_back(0);
-
- ErrMsg.clear();
- if (!ExecGraphViewer(gv, args, PSFilename, wait, ErrMsg))
- return;
-
-#elif HAVE_DOTTY
- std::string dotty(LLVM_PATH_DOTTY);
+ ErrMsg.clear();
+ return ExecGraphViewer(ViewerPath, args, PSFilename, wait, ErrMsg);
+ }
- std::vector<const char*> args;
- args.push_back(dotty.c_str());
- args.push_back(Filename.c_str());
- args.push_back(0);
+ // dotty
+ if (S.TryFindProgram("dotty", ViewerPath)) {
+ std::vector<const char *> args;
+ args.push_back(ViewerPath.c_str());
+ args.push_back(Filename.c_str());
+ args.push_back(nullptr);
// Dotty spawns another app and doesn't wait until it returns
-#if defined (__MINGW32__) || defined (_WINDOWS)
- wait = false;
-#endif
- errs() << "Running 'dotty' program... ";
- if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg))
- return;
-#else
- (void)Filename;
- (void)ErrMsg;
+#ifdef LLVM_ON_WIN32
+ wait = false;
#endif
+ errs() << "Running 'dotty' program... ";
+ return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg);
+ }
+
+ errs() << "Error: Couldn't find a usable graph viewer program:\n";
+ errs() << S.LogBuffer << "\n";
+ return true;
}
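The rewrite replaces compile-time HAVE_* selection with run-time lookup. A standalone sketch of that lookup strategy (findFirstProgram and the Find parameter are illustrative; the patch itself uses sys::FindProgramByName and a "|"-separated name list):

#include <string>
#include <vector>

static bool findFirstProgram(const std::vector<std::string> &Names,
                             std::string &Path,
                             std::string (*Find)(const std::string &)) {
  for (const auto &Name : Names) {
    Path = Find(Name); // e.g. a PATH search
    if (!Path.empty())
      return true;
  }
  return false;
}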
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index fd0472e..e2dd6d5 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This header file implements the operating system Host concept.
+// This file implements the operating system Host concept.
//
//===----------------------------------------------------------------------===//
@@ -570,6 +570,8 @@ StringRef sys::getHostCPUName() {
.Case("A2", "a2")
.Case("POWER6", "pwr6")
.Case("POWER7", "pwr7")
+ .Case("POWER8", "pwr8")
+ .Case("POWER8E", "pwr8")
.Default(generic);
}
#elif defined(__linux__) && defined(__arm__)
@@ -744,7 +746,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
.Default("");
#if defined(__aarch64__)
- // We need to check crypto seperately since we need all of the crypto
+ // We need to check crypto separately since we need all of the crypto
// extensions to enable the subtarget feature
if (CPUFeatures[I] == "aes")
crypto |= CAP_AES;
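A trimmed-down illustration of the StringSwitch mapping the POWER8 lines extend; the real table matches many more CPU names:

#include "llvm/ADT/StringSwitch.h"

static const char *normalizePowerCPU(llvm::StringRef Name) {
  return llvm::StringSwitch<const char *>(Name)
      .Case("POWER7", "pwr7")
      .Case("POWER8", "pwr8")
      .Case("POWER8E", "pwr8")
      .Default("generic");
}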
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 9b4bfbe..3f224e0 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -9,6 +9,7 @@
#include "llvm/Support/LockFileManager.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -32,11 +33,13 @@ Optional<std::pair<std::string, int> >
LockFileManager::readLockFile(StringRef LockFileName) {
// Read the owning host and PID out of the lock file. If it appears that the
// owning process is dead, the lock file is invalid.
- std::unique_ptr<MemoryBuffer> MB;
- if (MemoryBuffer::getFile(LockFileName, MB)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(LockFileName);
+ if (!MBOrErr) {
sys::fs::remove(LockFileName);
return None;
}
+ std::unique_ptr<MemoryBuffer> MB = std::move(MBOrErr.get());
StringRef Hostname;
StringRef PIDStr;
@@ -71,7 +74,7 @@ bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) {
LockFileManager::LockFileManager(StringRef FileName)
{
this->FileName = FileName;
- if (error_code EC = sys::fs::make_absolute(this->FileName)) {
+ if (std::error_code EC = sys::fs::make_absolute(this->FileName)) {
Error = EC;
return;
}
@@ -87,10 +90,8 @@ LockFileManager::LockFileManager(StringRef FileName)
UniqueLockFileName = LockFileName;
UniqueLockFileName += "-%%%%%%%%";
int UniqueLockFileID;
- if (error_code EC
- = sys::fs::createUniqueFile(UniqueLockFileName.str(),
- UniqueLockFileID,
- UniqueLockFileName)) {
+ if (std::error_code EC = sys::fs::createUniqueFile(
+ UniqueLockFileName.str(), UniqueLockFileID, UniqueLockFileName)) {
Error = EC;
return;
}
@@ -122,9 +123,9 @@ LockFileManager::LockFileManager(StringRef FileName)
while (1) {
// Create a link from the lock file name. If this succeeds, we're done.
- error_code EC =
+ std::error_code EC =
sys::fs::create_link(UniqueLockFileName.str(), LockFileName.str());
- if (EC == errc::success)
+ if (!EC)
return;
if (EC != errc::file_exists) {
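Sketch of the comparison idiom the new code relies on: a std::error_code can be tested against an errc condition, and a zero code is falsy (isFileExists is a made-up helper; the patch compares against LLVM's errc, shown here with std::errc):

#include <system_error>

static bool isFileExists(std::error_code EC) {
  return EC == std::errc::file_exists;
}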
diff --git a/lib/Support/Makefile b/lib/Support/Makefile
index 4a2185d..39426aa 100644
--- a/lib/Support/Makefile
+++ b/lib/Support/Makefile
@@ -17,3 +17,7 @@ include $(LEVEL)/Makefile.common
CompileCommonOpts := $(filter-out -pedantic,$(CompileCommonOpts))
CompileCommonOpts := $(filter-out -Wno-long-long,$(CompileCommonOpts))
+
+ifdef LLVM_VERSION_INFO
+CompileCommonOpts += -DLLVM_VERSION_INFO='"$(LLVM_VERSION_INFO)"'
+endif
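Hypothetical consumer of the -DLLVM_VERSION_INFO define added above; when set at build time, the macro expands to a quoted string:

#ifdef LLVM_VERSION_INFO
static const char VersionInfo[] = LLVM_VERSION_INFO;
#else
static const char VersionInfo[] = "";
#endif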
diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp
index 6a1c2a5..b8fb284 100644
--- a/lib/Support/ManagedStatic.cpp
+++ b/lib/Support/ManagedStatic.cpp
@@ -14,16 +14,26 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
#include <cassert>
using namespace llvm;
static const ManagedStaticBase *StaticList = nullptr;
+static sys::Mutex& getManagedStaticMutex() {
+ // We need to use a function local static here, since this can get called
+ // during a static constructor and we need to guarantee that it's initialized
+ // correctly.
+ static sys::Mutex ManagedStaticMutex;
+ return ManagedStaticMutex;
+}
+
void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(),
void (*Deleter)(void*)) const {
assert(Creator);
if (llvm_is_multithreaded()) {
- llvm_acquire_global_lock();
+ MutexGuard Lock(getManagedStaticMutex());
if (!Ptr) {
void* tmp = Creator();
@@ -43,8 +53,6 @@ void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(),
Next = StaticList;
StaticList = this;
}
-
- llvm_release_global_lock();
} else {
assert(!Ptr && !DeleterFn && !Next &&
"Partially initialized ManagedStatic!?");
@@ -75,8 +83,8 @@ void ManagedStaticBase::destroy() const {
/// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
void llvm::llvm_shutdown() {
+ MutexGuard Lock(getManagedStaticMutex());
+
while (StaticList)
StaticList->destroy();
-
- if (llvm_is_multithreaded()) llvm_stop_multithreaded();
}
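The getManagedStaticMutex() pattern in isolation: a function-local static is constructed on first use, so it is safe to reach from other static constructors, unlike a namespace-scope mutex (sketch with std::mutex; the patch uses sys::Mutex):

#include <mutex>

static std::mutex &getRegistryMutex() {
  static std::mutex M; // constructed the first time any caller gets here
  return M;
}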
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 629d885..5f4b7da 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -14,19 +14,20 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Errno.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/system_error.h"
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <new>
#include <sys/types.h>
+#include <system_error>
#if !defined(_MSC_VER) && !defined(__MINGW32__)
#include <unistd.h>
#else
@@ -151,17 +152,11 @@ MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
return SB;
}
-
-/// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin
-/// if the Filename is "-". If an error occurs, this returns null and fills
-/// in *ErrStr with a reason. If stdin is empty, this API (unlike getSTDIN)
-/// returns an empty buffer.
-error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- int64_t FileSize) {
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFileOrSTDIN(StringRef Filename, int64_t FileSize) {
if (Filename == "-")
- return getSTDIN(Result);
- return getFile(Filename, Result, FileSize);
+ return getSTDIN();
+ return getFile(Filename, FileSize);
}
@@ -190,7 +185,7 @@ class MemoryBufferMMapFile : public MemoryBuffer {
public:
MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
- uint64_t Offset, error_code EC)
+ uint64_t Offset, std::error_code EC)
: MFR(FD, false, sys::fs::mapped_file_region::readonly,
getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
if (!EC) {
@@ -210,9 +205,8 @@ public:
};
}
-static error_code getMemoryBufferForStream(int FD,
- StringRef BufferName,
- std::unique_ptr<MemoryBuffer> &Result) {
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getMemoryBufferForStream(int FD, StringRef BufferName) {
const ssize_t ChunkSize = 4096*4;
SmallString<ChunkSize> Buffer;
ssize_t ReadBytes;
@@ -222,52 +216,48 @@ static error_code getMemoryBufferForStream(int FD,
ReadBytes = read(FD, Buffer.end(), ChunkSize);
if (ReadBytes == -1) {
if (errno == EINTR) continue;
- return error_code(errno, posix_category());
+ return std::error_code(errno, std::generic_category());
}
Buffer.set_size(Buffer.size() + ReadBytes);
} while (ReadBytes != 0);
- Result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
- return error_code::success();
+ std::unique_ptr<MemoryBuffer> Ret(
+ MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
+ return std::move(Ret);
}
-static error_code getFileAux(const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- int64_t FileSize,
- bool RequiresNullTerminator,
- bool IsVolatileSize);
-
-error_code MemoryBuffer::getFile(Twine Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- int64_t FileSize,
- bool RequiresNullTerminator,
- bool IsVolatileSize) {
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator,
+ bool IsVolatileSize);
+
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFile(Twine Filename, int64_t FileSize,
+ bool RequiresNullTerminator, bool IsVolatileSize) {
// Ensure the path is null terminated.
SmallString<256> PathBuf;
StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf);
- return getFileAux(NullTerminatedName.data(), Result, FileSize,
- RequiresNullTerminator, IsVolatileSize);
+ return getFileAux(NullTerminatedName.data(), FileSize, RequiresNullTerminator,
+ IsVolatileSize);
}
-static error_code getOpenFileImpl(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t FileSize, uint64_t MapSize,
- int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize);
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize,
+ uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
+ bool IsVolatileSize);
-static error_code getFileAux(const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result, int64_t FileSize,
- bool RequiresNullTerminator,
- bool IsVolatileSize) {
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator,
+ bool IsVolatileSize) {
int FD;
- error_code EC = sys::fs::openFileForRead(Filename, FD);
+ std::error_code EC = sys::fs::openFileForRead(Filename, FD);
if (EC)
return EC;
- error_code ret = getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatileSize);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
+ getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
+ RequiresNullTerminator, IsVolatileSize);
close(FD);
- return ret;
+ return Ret;
}
static bool shouldUseMmap(int FD,
@@ -318,11 +308,10 @@ static bool shouldUseMmap(int FD,
return true;
}
-static error_code getOpenFileImpl(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t FileSize, uint64_t MapSize,
- int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize) {
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize,
+ uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
+ bool IsVolatileSize) {
static int PageSize = sys::process::get_self()->page_size();
// Default is to map the full file.
@@ -331,7 +320,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
// file descriptor is cheaper than stat on a random path.
if (FileSize == uint64_t(-1)) {
sys::fs::file_status Status;
- error_code EC = sys::fs::status(FD, Status);
+ std::error_code EC = sys::fs::status(FD, Status);
if (EC)
return EC;
@@ -341,7 +330,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
sys::fs::file_type Type = Status.type();
if (Type != sys::fs::file_type::regular_file &&
Type != sys::fs::file_type::block_file)
- return getMemoryBufferForStream(FD, Filename, Result);
+ return getMemoryBufferForStream(FD, Filename);
FileSize = Status.getSize();
}
@@ -350,11 +339,12 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
PageSize, IsVolatileSize)) {
- error_code EC;
- Result.reset(new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile(
- RequiresNullTerminator, FD, MapSize, Offset, EC));
+ std::error_code EC;
+ std::unique_ptr<MemoryBuffer> Result(
+ new (NamedBufferAlloc(Filename))
+ MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC));
if (!EC)
- return error_code::success();
+ return std::move(Result);
}
MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
@@ -370,7 +360,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
size_t BytesLeft = MapSize;
#ifndef HAVE_PREAD
if (lseek(FD, Offset, SEEK_SET) == -1)
- return error_code(errno, posix_category());
+ return std::error_code(errno, std::generic_category());
#endif
while (BytesLeft) {
@@ -383,7 +373,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
if (errno == EINTR)
continue;
// Error while reading.
- return error_code(errno, posix_category());
+ return std::error_code(errno, std::generic_category());
}
if (NumRead == 0) {
memset(BufPtr, 0, BytesLeft); // zero-initialize rest of the buffer.
@@ -393,37 +383,29 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
BufPtr += NumRead;
}
- Result.swap(SB);
- return error_code::success();
+ return std::move(SB);
}
-error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t FileSize,
- bool RequiresNullTerminator,
- bool IsVolatileSize) {
- return getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0,
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getOpenFile(int FD, const char *Filename, uint64_t FileSize,
+ bool RequiresNullTerminator, bool IsVolatileSize) {
+ return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
RequiresNullTerminator, IsVolatileSize);
}
-error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename,
- std::unique_ptr<MemoryBuffer> &Result,
- uint64_t MapSize, int64_t Offset,
- bool IsVolatileSize) {
- return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false,
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize,
+ int64_t Offset, bool IsVolatileSize) {
+ return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
IsVolatileSize);
}
-//===----------------------------------------------------------------------===//
-// MemoryBuffer::getSTDIN implementation.
-//===----------------------------------------------------------------------===//
-
-error_code MemoryBuffer::getSTDIN(std::unique_ptr<MemoryBuffer> &Result) {
+ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
// Read in all of the data from stdin, we cannot mmap stdin.
//
// FIXME: That isn't necessarily true; we should try to mmap stdin and
// fall back if it fails.
sys::ChangeStdinToBinary();
- return getMemoryBufferForStream(0, "<stdin>", Result);
+ return getMemoryBufferForStream(0, "<stdin>");
}
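Sketch of the value-returning API after this hunk; "-" maps to stdin exactly as in getFileOrSTDIN, and countBytes is a made-up name:

static int countBytes(llvm::StringRef Path) {
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> BufOrErr =
      llvm::MemoryBuffer::getFileOrSTDIN(Path);
  if (!BufOrErr)
    return -1;
  return static_cast<int>(BufOrErr.get()->getBufferSize());
}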
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index b8d676f..d5a0ec5 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
@@ -164,12 +165,12 @@ enum FSEntity {
};
// Implemented in Unix/Path.inc and Windows/Path.inc.
-static error_code TempDir(SmallVectorImpl<char> &result);
+static std::error_code TempDir(SmallVectorImpl<char> &result);
-static error_code createUniqueEntity(const Twine &Model, int &ResultFD,
- SmallVectorImpl<char> &ResultPath,
- bool MakeAbsolute, unsigned Mode,
- FSEntity Type) {
+static std::error_code createUniqueEntity(const Twine &Model, int &ResultFD,
+ SmallVectorImpl<char> &ResultPath,
+ bool MakeAbsolute, unsigned Mode,
+ FSEntity Type) {
SmallString<128> ModelStorage;
Model.toVector(ModelStorage);
@@ -177,7 +178,7 @@ static error_code createUniqueEntity(const Twine &Model, int &ResultFD,
// Make model absolute by prepending a temp directory if it's not already.
if (!sys::path::is_absolute(Twine(ModelStorage))) {
SmallString<128> TDir;
- if (error_code EC = TempDir(TDir))
+ if (std::error_code EC = TempDir(TDir))
return EC;
sys::path::append(TDir, Twine(ModelStorage));
ModelStorage.swap(TDir);
@@ -201,7 +202,7 @@ retry_random_path:
// Try to open + create the file.
switch (Type) {
case FS_File: {
- if (error_code EC =
+ if (std::error_code EC =
sys::fs::openFileForWrite(Twine(ResultPath.begin()), ResultFD,
sys::fs::F_RW | sys::fs::F_Excl, Mode)) {
if (EC == errc::file_exists)
@@ -209,26 +210,27 @@ retry_random_path:
return EC;
}
- return error_code::success();
+ return std::error_code();
}
case FS_Name: {
bool Exists;
- error_code EC = sys::fs::exists(ResultPath.begin(), Exists);
+ std::error_code EC = sys::fs::exists(ResultPath.begin(), Exists);
if (EC)
return EC;
if (Exists)
goto retry_random_path;
- return error_code::success();
+ return std::error_code();
}
case FS_Dir: {
- if (error_code EC = sys::fs::create_directory(ResultPath.begin(), false)) {
+ if (std::error_code EC =
+ sys::fs::create_directory(ResultPath.begin(), false)) {
if (EC == errc::file_exists)
goto retry_random_path;
return EC;
}
- return error_code::success();
+ return std::error_code();
}
}
llvm_unreachable("Invalid Type");
@@ -705,29 +707,30 @@ bool is_relative(const Twine &path) {
namespace fs {
-error_code getUniqueID(const Twine Path, UniqueID &Result) {
+std::error_code getUniqueID(const Twine Path, UniqueID &Result) {
file_status Status;
- error_code EC = status(Path, Status);
+ std::error_code EC = status(Path, Status);
if (EC)
return EC;
Result = Status.getUniqueID();
- return error_code::success();
+ return std::error_code();
}
-error_code createUniqueFile(const Twine &Model, int &ResultFd,
- SmallVectorImpl<char> &ResultPath, unsigned Mode) {
+std::error_code createUniqueFile(const Twine &Model, int &ResultFd,
+ SmallVectorImpl<char> &ResultPath,
+ unsigned Mode) {
return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File);
}
-error_code createUniqueFile(const Twine &Model,
- SmallVectorImpl<char> &ResultPath) {
+std::error_code createUniqueFile(const Twine &Model,
+ SmallVectorImpl<char> &ResultPath) {
int Dummy;
return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name);
}
-static error_code createTemporaryFile(const Twine &Model, int &ResultFD,
- llvm::SmallVectorImpl<char> &ResultPath,
- FSEntity Type) {
+static std::error_code
+createTemporaryFile(const Twine &Model, int &ResultFD,
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
SmallString<128> Storage;
StringRef P = Model.toNullTerminatedStringRef(Storage);
assert(P.find_first_of(separators) == StringRef::npos &&
@@ -737,24 +740,22 @@ static error_code createTemporaryFile(const Twine &Model, int &ResultFD,
true, owner_read | owner_write, Type);
}
-static error_code
+static std::error_code
createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD,
- llvm::SmallVectorImpl<char> &ResultPath,
- FSEntity Type) {
+ llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
const char *Middle = Suffix.empty() ? "-%%%%%%" : "-%%%%%%.";
return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath,
Type);
}
-
-error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
- int &ResultFD,
- SmallVectorImpl<char> &ResultPath) {
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ int &ResultFD,
+ SmallVectorImpl<char> &ResultPath) {
return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File);
}
-error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
- SmallVectorImpl<char> &ResultPath) {
+std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+ SmallVectorImpl<char> &ResultPath) {
int Dummy;
return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
}
@@ -762,14 +763,14 @@ error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
// This is a mkdtemp with a different pattern. We use createUniqueEntity mostly
// for consistency. We should try using mkdtemp.
-error_code createUniqueDirectory(const Twine &Prefix,
- SmallVectorImpl<char> &ResultPath) {
+std::error_code createUniqueDirectory(const Twine &Prefix,
+ SmallVectorImpl<char> &ResultPath) {
int Dummy;
return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath,
true, 0, FS_Dir);
}
-error_code make_absolute(SmallVectorImpl<char> &path) {
+std::error_code make_absolute(SmallVectorImpl<char> &path) {
StringRef p(path.data(), path.size());
bool rootDirectory = path::has_root_directory(p),
@@ -781,11 +782,12 @@ error_code make_absolute(SmallVectorImpl<char> &path) {
// Already absolute.
if (rootName && rootDirectory)
- return error_code::success();
+ return std::error_code();
// All of the following conditions will need the current directory.
SmallString<128> current_dir;
- if (error_code ec = current_path(current_dir)) return ec;
+ if (std::error_code ec = current_path(current_dir))
+ return ec;
// Relative path. Prepend the current directory.
if (!rootName && !rootDirectory) {
@@ -793,7 +795,7 @@ error_code make_absolute(SmallVectorImpl<char> &path) {
path::append(current_dir, p);
// Set path to the result.
path.swap(current_dir);
- return error_code::success();
+ return std::error_code();
}
if (!rootName && rootDirectory) {
@@ -802,7 +804,7 @@ error_code make_absolute(SmallVectorImpl<char> &path) {
path::append(curDirRootName, p);
// Set path to the result.
path.swap(curDirRootName);
- return error_code::success();
+ return std::error_code();
}
if (rootName && !rootDirectory) {
@@ -814,19 +816,19 @@ error_code make_absolute(SmallVectorImpl<char> &path) {
SmallString<128> res;
path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
path.swap(res);
- return error_code::success();
+ return std::error_code();
}
llvm_unreachable("All rootName and rootDirectory combinations should have "
"occurred above!");
}
-error_code create_directories(const Twine &Path, bool IgnoreExisting) {
+std::error_code create_directories(const Twine &Path, bool IgnoreExisting) {
SmallString<128> PathStorage;
StringRef P = Path.toStringRef(PathStorage);
// Be optimistic and try to create the directory
- error_code EC = create_directory(P, IgnoreExisting);
+ std::error_code EC = create_directory(P, IgnoreExisting);
// If we succeeded, or had any error other than the parent not existing, just
// return it.
if (EC != errc::no_such_file_or_directory)
@@ -844,6 +846,40 @@ error_code create_directories(const Twine &Path, bool IgnoreExisting) {
return create_directory(P, IgnoreExisting);
}
+std::error_code copy_file(const Twine &From, const Twine &To) {
+ int ReadFD, WriteFD;
+ if (std::error_code EC = openFileForRead(From, ReadFD))
+ return EC;
+ if (std::error_code EC = openFileForWrite(To, WriteFD, F_None)) {
+ close(ReadFD);
+ return EC;
+ }
+
+ const size_t BufSize = 4096;
+ char *Buf = new char[BufSize];
+ int BytesRead = 0, BytesWritten = 0;
+ for (;;) {
+ BytesRead = read(ReadFD, Buf, BufSize);
+ if (BytesRead <= 0)
+ break;
+ while (BytesRead) {
+ BytesWritten = write(WriteFD, Buf, BytesRead);
+ if (BytesWritten < 0)
+ break;
+ BytesRead -= BytesWritten;
+ }
+ if (BytesWritten < 0)
+ break;
+ }
+ close(ReadFD);
+ close(WriteFD);
+ delete[] Buf;
+
+ if (BytesRead < 0 || BytesWritten < 0)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
bool exists(file_status status) {
return status_known(status) && status.type() != file_type::file_not_found;
}
@@ -856,24 +892,24 @@ bool is_directory(file_status status) {
return status.type() == file_type::directory_file;
}
-error_code is_directory(const Twine &path, bool &result) {
+std::error_code is_directory(const Twine &path, bool &result) {
file_status st;
- if (error_code ec = status(path, st))
+ if (std::error_code ec = status(path, st))
return ec;
result = is_directory(st);
- return error_code::success();
+ return std::error_code();
}
bool is_regular_file(file_status status) {
return status.type() == file_type::regular_file;
}
-error_code is_regular_file(const Twine &path, bool &result) {
+std::error_code is_regular_file(const Twine &path, bool &result) {
file_status st;
- if (error_code ec = status(path, st))
+ if (std::error_code ec = status(path, st))
return ec;
result = is_regular_file(st);
- return error_code::success();
+ return std::error_code();
}
bool is_other(file_status status) {
@@ -890,26 +926,8 @@ void directory_entry::replace_filename(const Twine &filename, file_status st) {
Status = st;
}
-error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
- SmallString<32> MagicStorage;
- StringRef Magic = magic.toStringRef(MagicStorage);
- SmallString<32> Buffer;
-
- if (error_code ec = get_magic(path, Magic.size(), Buffer)) {
- if (ec == errc::value_too_large) {
- // Magic.size() > file_size(Path).
- result = false;
- return error_code::success();
- }
- return ec;
- }
-
- result = Magic == Buffer;
- return error_code::success();
-}
-
/// @brief Identify the magic in magic.
- file_magic identify_magic(StringRef Magic) {
+file_magic identify_magic(StringRef Magic) {
if (Magic.size() < 4)
return file_magic::unknown;
switch ((unsigned char)Magic[0]) {
@@ -1040,17 +1058,21 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
return file_magic::unknown;
}
-error_code identify_magic(const Twine &path, file_magic &result) {
- SmallString<32> Magic;
- error_code ec = get_magic(path, Magic.capacity(), Magic);
- if (ec && ec != errc::value_too_large)
- return ec;
+std::error_code identify_magic(const Twine &Path, file_magic &Result) {
+ int FD;
+ if (std::error_code EC = openFileForRead(Path, FD))
+ return EC;
+
+ char Buffer[32];
+ int Length = read(FD, Buffer, sizeof(Buffer));
+ if (close(FD) != 0 || Length < 0)
+ return std::error_code(errno, std::generic_category());
- result = identify_magic(Magic);
- return error_code::success();
+ Result = identify_magic(StringRef(Buffer, Length));
+ return std::error_code();
}
-error_code directory_entry::status(file_status &result) const {
+std::error_code directory_entry::status(file_status &result) const {
return fs::status(Path, result);
}
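Hypothetical driver for the rewritten identify_magic above (classify is a made-up name; file_magic::bitcode is one of the existing enumerators):

static void classify(const llvm::Twine &Path) {
  llvm::sys::fs::file_magic Magic;
  if (std::error_code EC = llvm::sys::fs::identify_magic(Path, Magic))
    return; // real code would report EC.message()
  if (Magic == llvm::sys::fs::file_magic::bitcode)
    llvm::errs() << "LLVM bitcode\n";
}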
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 0380ed9..0d42e0e 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -7,13 +7,16 @@
//
//===----------------------------------------------------------------------===//
//
-// This header file implements the operating system Process concept.
+// This file implements the operating system Process concept.
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
using namespace llvm;
using namespace sys;
@@ -66,6 +69,33 @@ TimeValue self_process::get_wall_time() const {
return getElapsedWallTime();
}
+Optional<std::string> Process::FindInEnvPath(const std::string& EnvName,
+ const std::string& FileName)
+{
+ Optional<std::string> FoundPath;
+ Optional<std::string> OptPath = Process::GetEnv(EnvName);
+ if (!OptPath.hasValue())
+ return FoundPath;
+
+ const char EnvPathSeparatorStr[] = {EnvPathSeparator, '\0'};
+ SmallVector<StringRef, 8> Dirs;
+ SplitString(OptPath.getValue(), Dirs, EnvPathSeparatorStr);
+
+ for (const auto &Dir : Dirs) {
+ if (Dir.empty())
+ continue;
+
+ SmallString<128> FilePath(Dir);
+ path::append(FilePath, FileName);
+ if (fs::exists(Twine(FilePath))) {
+ FoundPath = FilePath.str();
+ break;
+ }
+ }
+
+ return FoundPath;
+}
+
#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
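Made-up example of the new helper: look for a tool in each directory listed in a PATH-style environment variable (both the variable and tool names are illustrative):

#include "llvm/Support/Process.h"

static void locateTool() {
  llvm::Optional<std::string> P =
      llvm::sys::Process::FindInEnvPath("TOOLS", "clang-format");
  if (P.hasValue())
    llvm::errs() << "found: " << P.getValue() << "\n";
}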
diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp
index 83f2ec4..b84b82b 100644
--- a/lib/Support/Program.cpp
+++ b/lib/Support/Program.cpp
@@ -7,13 +7,13 @@
//
//===----------------------------------------------------------------------===//
//
-// This header file implements the operating system Program concept.
+// This file implements the operating system Program concept.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/Program.h"
#include "llvm/Config/config.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
using namespace sys;
@@ -34,7 +34,8 @@ int sys::ExecuteAndWait(StringRef Program, const char **args, const char **envp,
if (Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
if (ExecutionFailed)
*ExecutionFailed = false;
- ProcessInfo Result = Wait(PI, secondsToWait, true, ErrMsg);
+ ProcessInfo Result = Wait(
+ PI, secondsToWait, /*WaitUntilTerminates=*/secondsToWait == 0, ErrMsg);
return Result.ReturnCode;
}
diff --git a/lib/Support/RandomNumberGenerator.cpp b/lib/Support/RandomNumberGenerator.cpp
new file mode 100644
index 0000000..c50e7cb
--- /dev/null
+++ b/lib/Support/RandomNumberGenerator.cpp
@@ -0,0 +1,61 @@
+//===-- RandomNumberGenerator.cpp - Implement RNG class -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements random number generation (RNG).
+// The current implementation is NOT cryptographically secure as it uses
+// the C++11 <random> facilities.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "rng"
+#include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// Tracking BUG: 19665
+// http://llvm.org/bugs/show_bug.cgi?id=19665
+//
+// Do not change to cl::opt<uint64_t> since this silently breaks argument parsing.
+static cl::opt<unsigned long long>
+Seed("rng-seed", cl::value_desc("seed"),
+ cl::desc("Seed for the random number generator"), cl::init(0));
+
+RandomNumberGenerator::RandomNumberGenerator(StringRef Salt) {
+ DEBUG(
+ if (Seed == 0)
+ errs() << "Warning! Using unseeded random number generator.\n"
+ );
+
+ // Combine seed and salt using std::seed_seq.
+ // Entropy: Seed-low, Seed-high, Salt...
+ std::vector<uint32_t> Data;
+ Data.reserve(2 + Salt.size()/4 + 1);
+ Data.push_back(Seed);
+ Data.push_back(Seed >> 32);
+
+ uint32_t Pack = 0;
+ for (size_t I = 0; I < Salt.size(); ++I) {
+ Pack <<= 8;
+ Pack += Salt[I];
+
+ if (I%4 == 3)
+ Data.push_back(Pack);
+ }
+ Data.push_back(Pack);
+
+ std::seed_seq SeedSeq(Data.begin(), Data.end());
+ Generator.seed(SeedSeq);
+}
+
+uint64_t RandomNumberGenerator::next(uint64_t Max) {
+ std::uniform_int_distribution<uint64_t> distribution(0, Max - 1);
+ return distribution(Generator);
+}
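Hedged usage sketch for the new class: construct with a pass-specific salt and draw values below a bound; with -rng-seed left at 0 the stream is fixed across runs (the salt string and pickSlot are made up):

#include "llvm/Support/RandomNumberGenerator.h"

static uint64_t pickSlot(unsigned NumSlots) {
  llvm::RandomNumberGenerator RNG("my-pass-salt");
  return RNG.next(NumSlots); // uniform in [0, NumSlots)
}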
diff --git a/lib/Support/ScaledNumber.cpp b/lib/Support/ScaledNumber.cpp
new file mode 100644
index 0000000..3fe027b
--- /dev/null
+++ b/lib/Support/ScaledNumber.cpp
@@ -0,0 +1,319 @@
+//==- lib/Support/ScaledNumber.cpp - Support for scaled numbers -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of some scaled number algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ScaledNumber.h"
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::ScaledNumbers;
+
+std::pair<uint64_t, int16_t> ScaledNumbers::multiply64(uint64_t LHS,
+ uint64_t RHS) {
+ // Separate into two 32-bit digits (U.L).
+ auto getU = [](uint64_t N) { return N >> 32; };
+ auto getL = [](uint64_t N) { return N & UINT32_MAX; };
+ uint64_t UL = getU(LHS), LL = getL(LHS), UR = getU(RHS), LR = getL(RHS);
+
+ // Compute cross products.
+ uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
+
+ // Sum into two 64-bit digits.
+ uint64_t Upper = P1, Lower = P4;
+ auto addWithCarry = [&](uint64_t N) {
+ uint64_t NewLower = Lower + (getL(N) << 32);
+ Upper += getU(N) + (NewLower < Lower);
+ Lower = NewLower;
+ };
+ addWithCarry(P2);
+ addWithCarry(P3);
+
+ // Check whether the upper digit is empty.
+ if (!Upper)
+ return std::make_pair(Lower, 0);
+
+ // Shift as little as possible to maximize precision.
+ unsigned LeadingZeros = countLeadingZeros(Upper);
+ int Shift = 64 - LeadingZeros;
+ if (LeadingZeros)
+ Upper = Upper << LeadingZeros | Lower >> Shift;
+ return getRounded(Upper, Shift,
+ Shift && (Lower & UINT64_C(1) << (Shift - 1)));
+}
+
+static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
+
+std::pair<uint32_t, int16_t> ScaledNumbers::divide32(uint32_t Dividend,
+ uint32_t Divisor) {
+ assert(Dividend && "expected non-zero dividend");
+ assert(Divisor && "expected non-zero divisor");
+
+ // Use 64-bit math and canonicalize the dividend to gain precision.
+ uint64_t Dividend64 = Dividend;
+ int Shift = 0;
+ if (int Zeros = countLeadingZeros(Dividend64)) {
+ Shift -= Zeros;
+ Dividend64 <<= Zeros;
+ }
+ uint64_t Quotient = Dividend64 / Divisor;
+ uint64_t Remainder = Dividend64 % Divisor;
+
+ // If Quotient needs to be shifted, leave the rounding to getAdjusted().
+ if (Quotient > UINT32_MAX)
+ return getAdjusted<uint32_t>(Quotient, Shift);
+
+ // Round based on the value of the next bit.
+ return getRounded<uint32_t>(Quotient, Shift, Remainder >= getHalf(Divisor));
+}
+
+std::pair<uint64_t, int16_t> ScaledNumbers::divide64(uint64_t Dividend,
+ uint64_t Divisor) {
+ assert(Dividend && "expected non-zero dividend");
+ assert(Divisor && "expected non-zero divisor");
+
+ // Minimize size of divisor.
+ int Shift = 0;
+ if (int Zeros = countTrailingZeros(Divisor)) {
+ Shift -= Zeros;
+ Divisor >>= Zeros;
+ }
+
+ // Check for powers of two.
+ if (Divisor == 1)
+ return std::make_pair(Dividend, Shift);
+
+ // Maximize size of dividend.
+ if (int Zeros = countLeadingZeros(Dividend)) {
+ Shift -= Zeros;
+ Dividend <<= Zeros;
+ }
+
+ // Start with the result of a divide.
+ uint64_t Quotient = Dividend / Divisor;
+ Dividend %= Divisor;
+
+ // Continue building the quotient with long division.
+ while (!(Quotient >> 63) && Dividend) {
+ // Shift Dividend and check for overflow.
+ bool IsOverflow = Dividend >> 63;
+ Dividend <<= 1;
+ --Shift;
+
+ // Get the next bit of Quotient.
+ Quotient <<= 1;
+ if (IsOverflow || Divisor <= Dividend) {
+ Quotient |= 1;
+ Dividend -= Divisor;
+ }
+ }
+
+ return getRounded(Quotient, Shift, Dividend >= getHalf(Divisor));
+}
+
+int ScaledNumbers::compareImpl(uint64_t L, uint64_t R, int ScaleDiff) {
+ assert(ScaleDiff >= 0 && "wrong argument order");
+ assert(ScaleDiff < 64 && "numbers too far apart");
+
+ uint64_t L_adjusted = L >> ScaleDiff;
+ if (L_adjusted < R)
+ return -1;
+ if (L_adjusted > R)
+ return 1;
+
+ return L > L_adjusted << ScaleDiff ? 1 : 0;
+}
+
+static void appendDigit(std::string &Str, unsigned D) {
+ assert(D < 10);
+ Str += '0' + D % 10;
+}
+
+static void appendNumber(std::string &Str, uint64_t N) {
+ while (N) {
+ appendDigit(Str, N % 10);
+ N /= 10;
+ }
+}
+
+static bool doesRoundUp(char Digit) {
+ switch (Digit) {
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ return true;
+ default:
+ return false;
+ }
+}
+
+static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
+ assert(E >= ScaledNumbers::MinScale);
+ assert(E <= ScaledNumbers::MaxScale);
+
+ // Find a new E, but don't let it increase past MaxScale.
+ int LeadingZeros = ScaledNumberBase::countLeadingZeros64(D);
+ int NewE = std::min(ScaledNumbers::MaxScale, E + 63 - LeadingZeros);
+ int Shift = 63 - (NewE - E);
+ assert(Shift <= LeadingZeros);
+ assert(Shift == LeadingZeros || NewE == ScaledNumbers::MaxScale);
+ D <<= Shift;
+ E = NewE;
+
+ // Check for a denormal.
+ unsigned AdjustedE = E + 16383;
+ if (!(D >> 63)) {
+ assert(E == ScaledNumbers::MaxScale);
+ AdjustedE = 0;
+ }
+
+ // Build the float and print it.
+ uint64_t RawBits[2] = {D, AdjustedE};
+ APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
+ SmallVector<char, 24> Chars;
+ Float.toString(Chars, Precision, 0);
+ return std::string(Chars.begin(), Chars.end());
+}
+
+static std::string stripTrailingZeros(const std::string &Float) {
+ size_t NonZero = Float.find_last_not_of('0');
+ assert(NonZero != std::string::npos && "no . in floating point string");
+
+ if (Float[NonZero] == '.')
+ ++NonZero;
+
+ return Float.substr(0, NonZero + 1);
+}
+
+std::string ScaledNumberBase::toString(uint64_t D, int16_t E, int Width,
+ unsigned Precision) {
+ if (!D)
+ return "0.0";
+
+ // Canonicalize exponent and digits.
+ uint64_t Above0 = 0;
+ uint64_t Below0 = 0;
+ uint64_t Extra = 0;
+ int ExtraShift = 0;
+ if (E == 0) {
+ Above0 = D;
+ } else if (E > 0) {
+ if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
+ D <<= Shift;
+ E -= Shift;
+
+ if (!E)
+ Above0 = D;
+ }
+ } else if (E > -64) {
+ Above0 = D >> -E;
+ Below0 = D << (64 + E);
+ } else if (E > -120) {
+ Below0 = D >> (-E - 64);
+ Extra = D << (128 + E);
+ ExtraShift = -64 - E;
+ }
+
+ // Fall back on APFloat for very small and very large numbers.
+ if (!Above0 && !Below0)
+ return toStringAPFloat(D, E, Precision);
+
+ // Append the digits before the decimal.
+ std::string Str;
+ size_t DigitsOut = 0;
+ if (Above0) {
+ appendNumber(Str, Above0);
+ DigitsOut = Str.size();
+ } else
+ appendDigit(Str, 0);
+ std::reverse(Str.begin(), Str.end());
+
+ // Return early if there's nothing after the decimal.
+ if (!Below0)
+ return Str + ".0";
+
+ // Append the decimal and beyond.
+ Str += '.';
+ uint64_t Error = UINT64_C(1) << (64 - Width);
+
+ // We need to shift Below0 to the right to make space for calculating
+ // digits. Save the precision we're losing in Extra.
+ Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
+ Below0 >>= 4;
+ size_t SinceDot = 0;
+ size_t AfterDot = Str.size();
+ do {
+ if (ExtraShift) {
+ --ExtraShift;
+ Error *= 5;
+ } else
+ Error *= 10;
+
+ Below0 *= 10;
+ Extra *= 10;
+ Below0 += (Extra >> 60);
+ Extra = Extra & (UINT64_MAX >> 4);
+ appendDigit(Str, Below0 >> 60);
+ Below0 = Below0 & (UINT64_MAX >> 4);
+ if (DigitsOut || Str.back() != '0')
+ ++DigitsOut;
+ ++SinceDot;
+ } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
+ (!Precision || DigitsOut <= Precision || SinceDot < 2));
+
+ // Return early for maximum precision.
+ if (!Precision || DigitsOut <= Precision)
+ return stripTrailingZeros(Str);
+
+ // Find where to truncate.
+ size_t Truncate =
+ std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
+
+ // Check if there's anything to truncate.
+ if (Truncate >= Str.size())
+ return stripTrailingZeros(Str);
+
+ bool Carry = doesRoundUp(Str[Truncate]);
+ if (!Carry)
+ return stripTrailingZeros(Str.substr(0, Truncate));
+
+ // Round with the first truncated digit.
+ for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
+ I != E; ++I) {
+ if (*I == '.')
+ continue;
+ if (*I == '9') {
+ *I = '0';
+ continue;
+ }
+
+ ++*I;
+ Carry = false;
+ break;
+ }
+
+ // Add "1" in front if we still need to carry.
+ return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
+}
+
+raw_ostream &ScaledNumberBase::print(raw_ostream &OS, uint64_t D, int16_t E,
+ int Width, unsigned Precision) {
+ return OS << toString(D, E, Width, Precision);
+}
+
+void ScaledNumberBase::dump(uint64_t D, int16_t E, int Width) {
+ print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
+ << "]";
+}
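
The carry loop at the end of toString() walks the digit string backwards from the truncation point, turning '9's into '0's until some digit can absorb the carry, and prepends a '1' if the carry survives past the front. A minimal standalone sketch of the same propagation (roundDecimal is a hypothetical helper, not part of this patch):

    #include <cassert>
    #include <string>

    // Round a decimal string at position Truncate, mirroring the carry
    // propagation in ScaledNumberBase::toString() above.
    static std::string roundDecimal(std::string Str, size_t Truncate) {
      assert(Truncate < Str.size() && "nothing to truncate");
      bool Carry = Str[Truncate] >= '5';
      Str.resize(Truncate);
      if (!Carry)
        return Str;
      for (auto I = Str.rbegin(), E = Str.rend(); I != E; ++I) {
        if (*I == '.')
          continue; // Skip over the decimal point.
        if (*I == '9') {
          *I = '0'; // 9 + carry wraps to 0; keep carrying.
          continue;
        }
        ++*I; // This digit absorbs the carry.
        Carry = false;
        break;
      }
      return Carry ? "1" + Str : Str;
    }

For example, roundDecimal("9.97", 3) yields "10.0", the same answer the loop above produces when the carry survives every digit.
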
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index acd75fb..003cb56 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -20,14 +20,14 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
static const size_t TabStop = 8;
namespace {
struct LineNoCacheTy {
- int LastQueryBufferID;
+ unsigned LastQueryBufferID;
const char *LastQuery;
unsigned LineNoOfQuery;
};
@@ -49,48 +49,44 @@ SourceMgr::~SourceMgr() {
}
}
-/// AddIncludeFile - Search for a file with the specified name in the current
-/// directory or in one of the IncludeDirs. If no file is found, this returns
-/// ~0, otherwise it returns the buffer ID of the stacked file.
-size_t SourceMgr::AddIncludeFile(const std::string &Filename,
- SMLoc IncludeLoc,
- std::string &IncludedFile) {
- std::unique_ptr<MemoryBuffer> NewBuf;
+unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
+ SMLoc IncludeLoc,
+ std::string &IncludedFile) {
IncludedFile = Filename;
- MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
+ MemoryBuffer::getFile(IncludedFile.c_str());
// If the file didn't exist directly, see if it's in an include path.
- for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
- IncludedFile = IncludeDirectories[i] + sys::path::get_separator().data() + Filename;
- MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
+ for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBufOrErr;
+ ++i) {
+ IncludedFile =
+ IncludeDirectories[i] + sys::path::get_separator().data() + Filename;
+ NewBufOrErr = MemoryBuffer::getFile(IncludedFile.c_str());
}
- if (!NewBuf) return ~0U;
+ if (!NewBufOrErr)
+ return 0;
- return AddNewSourceBuffer(NewBuf.release(), IncludeLoc);
+ return AddNewSourceBuffer(NewBufOrErr.get().release(), IncludeLoc);
}
-
-/// FindBufferContainingLoc - Return the ID of the buffer containing the
-/// specified location, returning -1 if not found.
-int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
+unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
for (unsigned i = 0, e = Buffers.size(); i != e; ++i)
if (Loc.getPointer() >= Buffers[i].Buffer->getBufferStart() &&
// Use <= here so that a pointer to the null at the end of the buffer
// is included as part of the buffer.
Loc.getPointer() <= Buffers[i].Buffer->getBufferEnd())
- return i;
- return -1;
+ return i + 1;
+ return 0;
}
-/// getLineAndColumn - Find the line and column number for the specified
-/// location in the specified file. This is not a fast method.
std::pair<unsigned, unsigned>
-SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const {
- if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc);
- assert(BufferID != -1 && "Invalid Location!");
+SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
+ if (!BufferID)
+ BufferID = FindBufferContainingLoc(Loc);
+ assert(BufferID && "Invalid Location!");
- MemoryBuffer *Buff = getBufferInfo(BufferID).Buffer;
+ const MemoryBuffer *Buff = getMemoryBuffer(BufferID);
// Count the number of \n's between the start of the file and the specified
// location.
@@ -132,8 +128,8 @@ SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const {
void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
if (IncludeLoc == SMLoc()) return; // Top of stack.
- int CurBuf = FindBufferContainingLoc(IncludeLoc);
- assert(CurBuf != -1 && "Invalid or unspecified location!");
+ unsigned CurBuf = FindBufferContainingLoc(IncludeLoc);
+ assert(CurBuf && "Invalid or unspecified location!");
PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
@@ -143,11 +139,6 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
}
-/// GetMessage - Return an SMDiagnostic at the specified location with the
-/// specified string.
-///
-/// @param Type - If non-null, the kind of message (e.g., "error") which is
-/// prefixed to the message.
SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg,
ArrayRef<SMRange> Ranges,
@@ -161,10 +152,10 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
std::string LineStr;
if (Loc.isValid()) {
- int CurBuf = FindBufferContainingLoc(Loc);
- assert(CurBuf != -1 && "Invalid or unspecified location!");
+ unsigned CurBuf = FindBufferContainingLoc(Loc);
+ assert(CurBuf && "Invalid or unspecified location!");
- MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer;
+ const MemoryBuffer *CurMB = getMemoryBuffer(CurBuf);
BufferID = CurMB->getBufferIdentifier();
// Scan backward to find the start of the line.
@@ -211,27 +202,30 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
LineStr, ColRanges, FixIts);
}
-void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
- SourceMgr::DiagKind Kind,
- const Twine &Msg, ArrayRef<SMRange> Ranges,
- ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
- SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts);
-
+void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
+ bool ShowColors) const {
// Report the message with the diagnostic handler if present.
if (DiagHandler) {
DiagHandler(Diagnostic, DiagContext);
return;
}
- if (Loc != SMLoc()) {
- int CurBuf = FindBufferContainingLoc(Loc);
- assert(CurBuf != -1 && "Invalid or unspecified location!");
+ if (Diagnostic.getLoc().isValid()) {
+ unsigned CurBuf = FindBufferContainingLoc(Diagnostic.getLoc());
+ assert(CurBuf && "Invalid or unspecified location!");
PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
}
Diagnostic.print(nullptr, OS, ShowColors);
}
+void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
+ SourceMgr::DiagKind Kind,
+ const Twine &Msg, ArrayRef<SMRange> Ranges,
+ ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+ PrintMessage(OS, GetMessage(Loc, Kind, Msg, Ranges, FixIts), ShowColors);
+}
+
void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
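
The SourceMgr changes above replace signed buffer IDs, with -1 meaning "not found", by unsigned 1-based IDs where 0 is the sentinel, so a lookup result tests naturally in a condition. A standalone sketch of the convention, using a hypothetical MiniSourceMgr rather than the real class:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct MiniSourceMgr {
      std::vector<std::string> Buffers;

      // Returns i + 1 on a match and 0 on failure, like the new
      // FindBufferContainingLoc() above.
      unsigned findBuffer(const std::string &Needle) const {
        for (unsigned i = 0, e = Buffers.size(); i != e; ++i)
          if (Buffers[i].find(Needle) != std::string::npos)
            return i + 1;
        return 0;
      }

      // Valid IDs are 1-based, so subtract one to index the table.
      const std::string &getBuffer(unsigned ID) const { return Buffers[ID - 1]; }
    };

    int main() {
      MiniSourceMgr SM;
      SM.Buffers = {"int main() {}", "void f();"};
      if (unsigned ID = SM.findBuffer("main"))
        std::printf("found in buffer %u: %s\n", ID, SM.getBuffer(ID).c_str());
    }
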
diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index 2c6fcd1..21e43c5 100644
--- a/lib/Transforms/Utils/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp
@@ -14,20 +14,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
+#include "llvm/Support/SpecialCaseList.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <string>
+#include <system_error>
#include <utility>
namespace llvm {
@@ -38,10 +34,12 @@ namespace llvm {
/// reason for doing so is efficiency; StringSet is much faster at matching
/// literal strings than Regex.
struct SpecialCaseList::Entry {
- StringSet<> Strings;
- Regex *RegEx;
+ Entry() {}
+ Entry(Entry &&Other)
+ : Strings(std::move(Other.Strings)), RegEx(std::move(Other.RegEx)) {}
- Entry() : RegEx(nullptr) {}
+ StringSet<> Strings;
+ std::unique_ptr<Regex> RegEx;
bool match(StringRef Query) const {
return Strings.count(Query) || (RegEx && RegEx->match(Query));
@@ -54,12 +52,13 @@ SpecialCaseList *SpecialCaseList::create(
const StringRef Path, std::string &Error) {
if (Path.empty())
return new SpecialCaseList();
- std::unique_ptr<MemoryBuffer> File;
- if (error_code EC = MemoryBuffer::getFile(Path, File)) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFile(Path);
+ if (std::error_code EC = FileOrErr.getError()) {
Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str();
return nullptr;
}
- return create(File.get(), Error);
+ return create(FileOrErr.get().get(), Error);
}
SpecialCaseList *SpecialCaseList::create(
@@ -150,66 +149,16 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
for (StringMap<std::string>::const_iterator II = I->second.begin(),
IE = I->second.end();
II != IE; ++II) {
- Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue());
+ Entries[I->getKey()][II->getKey()].RegEx.reset(new Regex(II->getValue()));
}
}
return true;
}
-SpecialCaseList::~SpecialCaseList() {
- for (StringMap<StringMap<Entry> >::iterator I = Entries.begin(),
- E = Entries.end();
- I != E; ++I) {
- for (StringMap<Entry>::const_iterator II = I->second.begin(),
- IE = I->second.end();
- II != IE; ++II) {
- delete II->second.RegEx;
- }
- }
-}
-
-bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const {
- return isIn(*F.getParent(), Category) ||
- inSectionCategory("fun", F.getName(), Category);
-}
-
-static StringRef GetGlobalTypeString(const GlobalValue &G) {
- // Types of GlobalVariables are always pointer types.
- Type *GType = G.getType()->getElementType();
- // For now we support blacklisting struct types only.
- if (StructType *SGType = dyn_cast<StructType>(GType)) {
- if (!SGType->isLiteral())
- return SGType->getName();
- }
- return "<unknown type>";
-}
-
-bool SpecialCaseList::isIn(const GlobalVariable &G,
- const StringRef Category) const {
- return isIn(*G.getParent(), Category) ||
- inSectionCategory("global", G.getName(), Category) ||
- inSectionCategory("type", GetGlobalTypeString(G), Category);
-}
-
-bool SpecialCaseList::isIn(const GlobalAlias &GA,
- const StringRef Category) const {
- if (isIn(*GA.getParent(), Category))
- return true;
-
- if (isa<FunctionType>(GA.getType()->getElementType()))
- return inSectionCategory("fun", GA.getName(), Category);
-
- return inSectionCategory("global", GA.getName(), Category) ||
- inSectionCategory("type", GetGlobalTypeString(GA), Category);
-}
-
-bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const {
- return inSectionCategory("src", M.getModuleIdentifier(), Category);
-}
+SpecialCaseList::~SpecialCaseList() {}
-bool SpecialCaseList::inSectionCategory(const StringRef Section,
- const StringRef Query,
- const StringRef Category) const {
+bool SpecialCaseList::inSection(const StringRef Section, const StringRef Query,
+ const StringRef Category) const {
StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
if (I == Entries.end()) return false;
StringMap<Entry>::const_iterator II = I->second.find(Category);
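
Above, SpecialCaseList moves from Transforms/Utils into Support, sheds its IR dependencies, and hands ownership of each compiled regex to a std::unique_ptr, which is why the explicit destructor loop disappears. A self-contained sketch of that ownership pattern, with MiniRegex standing in for llvm::Regex:

    #include <memory>
    #include <string>
    #include <utility>

    struct MiniRegex {
      explicit MiniRegex(std::string P) : Pattern(std::move(P)) {}
      std::string Pattern;
    };

    // unique_ptr makes Entry move-only; the explicit move constructor matches
    // the one added in the patch, and no hand-written destructor is needed.
    struct Entry {
      Entry() {}
      Entry(Entry &&Other) : RegEx(std::move(Other.RegEx)) {}
      std::unique_ptr<MiniRegex> RegEx;
    };

    int main() {
      Entry E;
      E.RegEx.reset(new MiniRegex("foo.*bar"));
      Entry Moved = std::move(E); // Ownership transfers; nothing leaks.
      return Moved.RegEx ? 0 : 1;
    }
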
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index 72a6d82..ddb7349 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -181,7 +181,7 @@ StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) {
/// RehashTable - Grow the table, redistributing values into the buckets with
/// the appropriate mod-of-hashtable-size.
-void StringMapImpl::RehashTable() {
+unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
unsigned NewSize;
unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
@@ -193,9 +193,10 @@ void StringMapImpl::RehashTable() {
} else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) {
NewSize = NumBuckets;
} else {
- return;
+ return BucketNo;
}
+ unsigned NewBucketNo = BucketNo;
// Allocate one extra bucket which will always be non-empty. This allows the
// iterators to stop at end.
StringMapEntryBase **NewTableArray =
@@ -215,6 +216,8 @@ void StringMapImpl::RehashTable() {
if (!NewTableArray[NewBucket]) {
NewTableArray[FullHash & (NewSize-1)] = Bucket;
NewHashArray[FullHash & (NewSize-1)] = FullHash;
+ if (I == BucketNo)
+ NewBucketNo = NewBucket;
continue;
}
@@ -227,6 +230,8 @@ void StringMapImpl::RehashTable() {
// Finally found a slot. Fill it in.
NewTableArray[NewBucket] = Bucket;
NewHashArray[NewBucket] = FullHash;
+ if (I == BucketNo)
+ NewBucketNo = NewBucket;
}
}
@@ -235,4 +240,5 @@ void StringMapImpl::RehashTable() {
TheTable = NewTableArray;
NumBuckets = NewSize;
NumTombstones = 0;
+ return NewBucketNo;
}
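
RehashTable() now takes the bucket the caller just probed and returns its post-rehash position, so insertion code does not have to re-hash the key after the table grows. A standalone sketch of the idea on a toy open-addressed table (power-of-two sizes assumed, and stored values double as their own hashes; this is not StringMapImpl itself):

    #include <vector>

    // Grow Table, relocating entries, and report where bucket BucketNo moved.
    static unsigned rehash(std::vector<int> &Table, unsigned BucketNo) {
      unsigned NewSize = Table.size() * 2; // Power-of-two sizes assumed.
      std::vector<int> New(NewSize, -1);
      unsigned NewBucketNo = BucketNo;
      for (unsigned I = 0, E = Table.size(); I != E; ++I) {
        if (Table[I] == -1)
          continue; // Empty slot.
        unsigned NewBucket = unsigned(Table[I]) & (NewSize - 1);
        while (New[NewBucket] != -1) // Linear-probe to a free slot.
          NewBucket = (NewBucket + 1) & (NewSize - 1);
        New[NewBucket] = Table[I];
        if (I == BucketNo)
          NewBucketNo = NewBucket; // Track the watched bucket.
      }
      Table.swap(New);
      return NewBucketNo;
    }
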
diff --git a/lib/Support/StringPool.cpp b/lib/Support/StringPool.cpp
index ff607cf..76faabc 100644
--- a/lib/Support/StringPool.cpp
+++ b/lib/Support/StringPool.cpp
@@ -27,7 +27,7 @@ PooledStringPtr StringPool::intern(StringRef Key) {
if (I != InternTable.end())
return PooledStringPtr(&*I);
- entry_t *S = entry_t::Create(Key.begin(), Key.end());
+ entry_t *S = entry_t::Create(Key);
S->getValue().Pool = this;
InternTable.insert(S);
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index a008831..f691883 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -116,17 +116,6 @@ void TargetRegistry::RegisterTarget(Target &T,
T.HasJIT = HasJIT;
}
-const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
- const Target *TheTarget = lookupTarget(sys::getDefaultTargetTriple(), Error);
-
- if (TheTarget && !TheTarget->hasJIT()) {
- Error = "No JIT compatible target available for this host";
- return nullptr;
- }
-
- return TheTarget;
-}
-
static int TargetArraySortFn(const std::pair<StringRef, const Target *> *LHS,
const std::pair<StringRef, const Target *> *RHS) {
return LHS->first.compare(RHS->first);
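
The deleted getClosestTargetForJIT() was a thin wrapper, and a caller can inline the same two steps. A sketch of the equivalent call site, assuming the LLVM headers of this tree; the body mirrors the removed function rather than any new API:

    #include "llvm/Support/Host.h"           // sys::getDefaultTargetTriple()
    #include "llvm/Support/TargetRegistry.h" // TargetRegistry, Target

    static const llvm::Target *closestTargetForJIT(std::string &Error) {
      using namespace llvm;
      const Target *T =
          TargetRegistry::lookupTarget(sys::getDefaultTargetTriple(), Error);
      if (T && !T->hasJIT()) {
        Error = "No JIT compatible target available for this host";
        return nullptr;
      }
      return T;
    }
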
diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp
index 1acfa79..ca7f3f6 100644
--- a/lib/Support/Threading.cpp
+++ b/lib/Support/Threading.cpp
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements llvm_start_multithreaded() and friends.
+// This file defines helper functions for running LLVM in a multi-threaded
+// environment.
//
//===----------------------------------------------------------------------===//
@@ -19,50 +20,14 @@
using namespace llvm;
-static bool multithreaded_mode = false;
-
-static sys::Mutex* global_lock = nullptr;
-
-bool llvm::llvm_start_multithreaded() {
+bool llvm::llvm_is_multithreaded() {
#if LLVM_ENABLE_THREADS != 0
- assert(!multithreaded_mode && "Already multithreaded!");
- multithreaded_mode = true;
- global_lock = new sys::Mutex(true);
-
- // We fence here to ensure that all initialization is complete BEFORE we
- // return from llvm_start_multithreaded().
- sys::MemoryFence();
return true;
#else
return false;
#endif
}
-void llvm::llvm_stop_multithreaded() {
-#if LLVM_ENABLE_THREADS != 0
- assert(multithreaded_mode && "Not currently multithreaded!");
-
- // We fence here to insure that all threaded operations are complete BEFORE we
- // return from llvm_stop_multithreaded().
- sys::MemoryFence();
-
- multithreaded_mode = false;
- delete global_lock;
-#endif
-}
-
-bool llvm::llvm_is_multithreaded() {
- return multithreaded_mode;
-}
-
-void llvm::llvm_acquire_global_lock() {
- if (multithreaded_mode) global_lock->acquire();
-}
-
-void llvm::llvm_release_global_lock() {
- if (multithreaded_mode) global_lock->release();
-}
-
#if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
#include <pthread.h>
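
With the mutable multithreaded_mode flag gone, whether threading is available is decided entirely at build time by LLVM_ENABLE_THREADS, and llvm_is_multithreaded() collapses to a constant. A standalone sketch of the pattern (the macro default here is illustrative only):

    #include <cstdio>

    #ifndef LLVM_ENABLE_THREADS
    #define LLVM_ENABLE_THREADS 1
    #endif

    // Compile-time answer: no global state, no lock, nothing to start or stop.
    static bool is_multithreaded() {
    #if LLVM_ENABLE_THREADS != 0
      return true;
    #else
      return false;
    #endif
    }

    int main() { std::printf("threads: %d\n", is_multithreaded()); }
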
diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp
index bd8af17..4a70797 100644
--- a/lib/Support/TimeValue.cpp
+++ b/lib/Support/TimeValue.cpp
@@ -53,7 +53,7 @@ TimeValue::normalize( void ) {
}
-/// Include the platform specific portion of TimeValue class
+/// Include the platform-specific portion of TimeValue class
#ifdef LLVM_ON_UNIX
#include "Unix/TimeValue.inc"
#endif
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 61465ae..210bda7 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -84,14 +85,13 @@ static TimerGroup *getDefaultTimerGroup() {
sys::MemoryFence();
if (tmp) return tmp;
- llvm_acquire_global_lock();
+ sys::SmartScopedLock<true> Lock(*TimerLock);
tmp = DefaultTimerGroup;
if (!tmp) {
tmp = new TimerGroup("Miscellaneous Ungrouped Timers");
sys::MemoryFence();
DefaultTimerGroup = tmp;
}
- llvm_release_global_lock();
return tmp;
}
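
The Timer change above swaps the paired llvm_acquire_global_lock()/llvm_release_global_lock() calls for an RAII SmartScopedLock, so the mutex is released on every exit path. A standalone sketch of the same double-checked initialization, using std::mutex and std::atomic in place of the sys:: primitives:

    #include <atomic>
    #include <mutex>

    static std::mutex InitLock;
    static std::atomic<int *> Singleton;

    static int *getSingleton() {
      if (int *Tmp = Singleton.load()) // Fast path, no lock taken.
        return Tmp;
      std::lock_guard<std::mutex> Lock(InitLock); // RAII: released on all paths.
      if (!Singleton.load())
        Singleton.store(new int(42)); // First caller initializes.
      return Singleton.load();
    }
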
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index b3d48fb..b74ee13 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -50,6 +50,7 @@ const char *Triple::getArchTypeName(ArchType Kind) {
case amdil: return "amdil";
case spir: return "spir";
case spir64: return "spir64";
+ case kalimba: return "kalimba";
}
llvm_unreachable("Invalid ArchType!");
@@ -101,6 +102,7 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case amdil: return "amdil";
case spir: return "spir";
case spir64: return "spir";
+ case kalimba: return "kalimba";
}
}
@@ -115,7 +117,9 @@ const char *Triple::getVendorTypeName(VendorType Kind) {
case BGQ: return "bgq";
case Freescale: return "fsl";
case IBM: return "ibm";
+ case ImaginationTechnologies: return "img";
case NVIDIA: return "nvidia";
+ case CSR: return "csr";
}
llvm_unreachable("Invalid VendorType!");
@@ -207,6 +211,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("amdil", amdil)
.Case("spir", spir)
.Case("spir64", spir64)
+ .Case("kalimba", kalimba)
.Default(UnknownArch);
}
@@ -280,6 +285,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Case("amdil", Triple::amdil)
.Case("spir", Triple::spir)
.Case("spir64", Triple::spir64)
+ .Case("kalimba", Triple::kalimba)
.Default(Triple::UnknownArch);
}
@@ -292,7 +298,9 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("bgq", Triple::BGQ)
.Case("fsl", Triple::Freescale)
.Case("ibm", Triple::IBM)
+ .Case("img", Triple::ImaginationTechnologies)
.Case("nvidia", Triple::NVIDIA)
+ .Case("csr", Triple::CSR)
.Default(Triple::UnknownVendor);
}
@@ -737,9 +745,8 @@ void Triple::setObjectFormat(ObjectFormatType Kind) {
if (Environment == UnknownEnvironment)
return setEnvironmentName(getObjectFormatTypeName(Kind));
- Twine Env = getEnvironmentTypeName(Environment) + Twine("-") +
- getObjectFormatTypeName(Kind);
- setEnvironmentName(Env.str());
+ setEnvironmentName((getEnvironmentTypeName(Environment) + Twine("-") +
+ getObjectFormatTypeName(Kind)).str());
}
void Triple::setArchName(StringRef Str) {
@@ -799,6 +806,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::x86:
case llvm::Triple::xcore:
case llvm::Triple::spir:
+ case llvm::Triple::kalimba:
return 32;
case llvm::Triple::arm64:
@@ -850,6 +858,7 @@ Triple Triple::get32BitArchVariant() const {
case Triple::arm:
case Triple::armeb:
case Triple::hexagon:
+ case Triple::kalimba:
case Triple::le32:
case Triple::mips:
case Triple::mipsel:
@@ -884,6 +893,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::arm:
case Triple::armeb:
case Triple::hexagon:
+ case Triple::kalimba:
case Triple::le32:
case Triple::msp430:
case Triple::r600:
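
The Triple changes illustrate what wiring up a new architecture (kalimba) and new vendors (img, csr) requires: every name table, parser, and pointer-width switch that enumerates the type must gain a case, or the UnknownArch defaults and llvm_unreachable paths fire. A toy sketch of the pattern, with a stand-in enum rather than the real Triple:

    #include <cstring>

    enum ArchType { UnknownArch, spir64, kalimba };

    static const char *getArchTypeName(ArchType Kind) {
      switch (Kind) {
      case UnknownArch: return "unknown";
      case spir64:      return "spir64";
      case kalimba:     return "kalimba"; // The newly added case.
      }
      return "unknown";
    }

    static ArchType parseArch(const char *Name) {
      if (std::strcmp(Name, "spir64") == 0)  return spir64;
      if (std::strcmp(Name, "kalimba") == 0) return kalimba; // Parser entry.
      return UnknownArch;
    }
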
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index 23b49b7..c9d89a8 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -83,8 +83,8 @@ MemoryBlock
Memory::allocateMappedMemory(size_t NumBytes,
const MemoryBlock *const NearBlock,
unsigned PFlags,
- error_code &EC) {
- EC = error_code::success();
+ std::error_code &EC) {
+ EC = std::error_code();
if (NumBytes == 0)
return MemoryBlock();
@@ -95,7 +95,7 @@ Memory::allocateMappedMemory(size_t NumBytes,
#ifdef NEED_DEV_ZERO_FOR_MMAP
static int zero_fd = open("/dev/zero", O_RDWR);
if (zero_fd == -1) {
- EC = error_code(errno, system_category());
+ EC = std::error_code(errno, std::generic_category());
return MemoryBlock();
}
fd = zero_fd;
@@ -123,7 +123,7 @@ Memory::allocateMappedMemory(size_t NumBytes,
if (NearBlock) //Try again without a near hint
return allocateMappedMemory(NumBytes, nullptr, PFlags, EC);
- EC = error_code(errno, system_category());
+ EC = std::error_code(errno, std::generic_category());
return MemoryBlock();
}
@@ -137,38 +137,38 @@ Memory::allocateMappedMemory(size_t NumBytes,
return Result;
}
-error_code
+std::error_code
Memory::releaseMappedMemory(MemoryBlock &M) {
if (M.Address == nullptr || M.Size == 0)
- return error_code::success();
+ return std::error_code();
if (0 != ::munmap(M.Address, M.Size))
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
M.Address = nullptr;
M.Size = 0;
- return error_code::success();
+ return std::error_code();
}
-error_code
+std::error_code
Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) {
if (M.Address == nullptr || M.Size == 0)
- return error_code::success();
+ return std::error_code();
if (!Flags)
- return error_code(EINVAL, generic_category());
+ return std::error_code(EINVAL, std::generic_category());
int Protect = getPosixProtectionFlags(Flags);
int Result = ::mprotect(M.Address, M.Size, Protect);
if (Result != 0)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
if (Flags & MF_EXEC)
Memory::InvalidateInstructionCache(M.Address, M.Size);
- return error_code::success();
+ return std::error_code();
}
/// AllocateRWX - Allocate a slab of memory with read/write/execute
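
The pattern applied throughout this file: failures wrap errno in std::generic_category(), and success is a default-constructed std::error_code, which tests false. A minimal POSIX sketch (closeFd is a hypothetical helper):

    #include <cerrno>
    #include <system_error>
    #include <unistd.h>

    static std::error_code closeFd(int FD) {
      if (::close(FD) != 0)
        return std::error_code(errno, std::generic_category());
      return std::error_code(); // Success: converts to false.
    }

A caller then writes "if (std::error_code EC = closeFd(FD))" and reaches the error path, with EC.message() available, only on failure.
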
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 519a016..623547a 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -87,7 +87,7 @@ namespace {
};
}
-static error_code TempDir(SmallVectorImpl<char> &result) {
+static std::error_code TempDir(SmallVectorImpl<char> &result) {
// FIXME: Don't use TMPDIR if program is SUID or SGID enabled.
const char *dir = nullptr;
(dir = std::getenv("TMPDIR")) || (dir = std::getenv("TMP")) ||
@@ -100,7 +100,7 @@ static error_code TempDir(SmallVectorImpl<char> &result) {
result.clear();
StringRef d(dir);
result.append(d.begin(), d.end());
- return error_code::success();
+ return std::error_code();
}
namespace llvm {
@@ -225,7 +225,7 @@ UniqueID file_status::getUniqueID() const {
return UniqueID(fs_st_dev, fs_st_ino);
}
-error_code current_path(SmallVectorImpl<char> &result) {
+std::error_code current_path(SmallVectorImpl<char> &result) {
result.clear();
const char *pwd = ::getenv("PWD");
@@ -235,7 +235,7 @@ error_code current_path(SmallVectorImpl<char> &result) {
!llvm::sys::fs::status(".", DotStatus) &&
PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
result.append(pwd, pwd + strlen(pwd));
- return error_code::success();
+ return std::error_code();
}
#ifdef MAXPATHLEN
@@ -248,8 +248,8 @@ error_code current_path(SmallVectorImpl<char> &result) {
while (true) {
if (::getcwd(result.data(), result.capacity()) == nullptr) {
// See if there was a real error.
- if (errno != errc::not_enough_memory)
- return error_code(errno, system_category());
+ if (errno != ENOMEM)
+ return std::error_code(errno, std::generic_category());
// Otherwise there just wasn't enough space.
result.reserve(result.capacity() * 2);
} else
@@ -257,22 +257,22 @@ error_code current_path(SmallVectorImpl<char> &result) {
}
result.set_size(strlen(result.data()));
- return error_code::success();
+ return std::error_code();
}
-error_code create_directory(const Twine &path, bool IgnoreExisting) {
+std::error_code create_directory(const Twine &path, bool IgnoreExisting) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) {
- if (errno != errc::file_exists || !IgnoreExisting)
- return error_code(errno, system_category());
+ if (errno != EEXIST || !IgnoreExisting)
+ return std::error_code(errno, std::generic_category());
}
- return error_code::success();
+ return std::error_code();
}
-error_code normalize_separators(SmallVectorImpl<char> &Path) {
+std::error_code normalize_separators(SmallVectorImpl<char> &Path) {
for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
if (*PI == '\\') {
auto PN = PI + 1;
@@ -282,12 +282,12 @@ error_code normalize_separators(SmallVectorImpl<char> &Path) {
*PI = '/';
}
}
- return error_code::success();
+ return std::error_code();
}
// Note that we are using symbolic link because hard links are not supported by
// all filesystems (SMB doesn't).
-error_code create_link(const Twine &to, const Twine &from) {
+std::error_code create_link(const Twine &to, const Twine &from) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
@@ -295,20 +295,20 @@ error_code create_link(const Twine &to, const Twine &from) {
StringRef t = to.toNullTerminatedStringRef(to_storage);
if (::symlink(t.begin(), f.begin()) == -1)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
- return error_code::success();
+ return std::error_code();
}
-error_code remove(const Twine &path, bool IgnoreNonExisting) {
+std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
struct stat buf;
if (lstat(p.begin(), &buf) != 0) {
- if (errno != errc::no_such_file_or_directory || !IgnoreNonExisting)
- return error_code(errno, system_category());
- return error_code::success();
+ if (errno != ENOENT || !IgnoreNonExisting)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
}
// Note: this check catches strange situations. In all cases, LLVM should
@@ -320,14 +320,14 @@ error_code remove(const Twine &path, bool IgnoreNonExisting) {
return make_error_code(errc::operation_not_permitted);
if (::remove(p.begin()) == -1) {
- if (errno != errc::no_such_file_or_directory || !IgnoreNonExisting)
- return error_code(errno, system_category());
+ if (errno != ENOENT || !IgnoreNonExisting)
+ return std::error_code(errno, std::generic_category());
}
- return error_code::success();
+ return std::error_code();
}
-error_code rename(const Twine &from, const Twine &to) {
+std::error_code rename(const Twine &from, const Twine &to) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
@@ -335,33 +335,33 @@ error_code rename(const Twine &from, const Twine &to) {
StringRef t = to.toNullTerminatedStringRef(to_storage);
if (::rename(f.begin(), t.begin()) == -1)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
- return error_code::success();
+ return std::error_code();
}
-error_code resize_file(const Twine &path, uint64_t size) {
+std::error_code resize_file(const Twine &path, uint64_t size) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
if (::truncate(p.begin(), size) == -1)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
- return error_code::success();
+ return std::error_code();
}
-error_code exists(const Twine &path, bool &result) {
+std::error_code exists(const Twine &path, bool &result) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
if (::access(p.begin(), F_OK) == -1) {
- if (errno != errc::no_such_file_or_directory)
- return error_code(errno, system_category());
+ if (errno != ENOENT)
+ return std::error_code(errno, std::generic_category());
result = false;
} else
result = true;
- return error_code::success();
+ return std::error_code();
}
bool can_write(const Twine &Path) {
@@ -390,18 +390,20 @@ bool equivalent(file_status A, file_status B) {
A.fs_st_ino == B.fs_st_ino;
}
-error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
file_status fsA, fsB;
- if (error_code ec = status(A, fsA)) return ec;
- if (error_code ec = status(B, fsB)) return ec;
+ if (std::error_code ec = status(A, fsA))
+ return ec;
+ if (std::error_code ec = status(B, fsB))
+ return ec;
result = equivalent(fsA, fsB);
- return error_code::success();
+ return std::error_code();
}
-static error_code fillStatus(int StatRet, const struct stat &Status,
+static std::error_code fillStatus(int StatRet, const struct stat &Status,
file_status &Result) {
if (StatRet != 0) {
- error_code ec(errno, system_category());
+ std::error_code ec(errno, std::generic_category());
if (ec == errc::no_such_file_or_directory)
Result = file_status(file_type::file_not_found);
else
@@ -429,10 +431,10 @@ static error_code fillStatus(int StatRet, const struct stat &Status,
file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_mtime,
Status.st_uid, Status.st_gid, Status.st_size);
- return error_code::success();
+ return std::error_code();
}
-error_code status(const Twine &Path, file_status &Result) {
+std::error_code status(const Twine &Path, file_status &Result) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
@@ -441,36 +443,36 @@ error_code status(const Twine &Path, file_status &Result) {
return fillStatus(StatRet, Status, Result);
}
-error_code status(int FD, file_status &Result) {
+std::error_code status(int FD, file_status &Result) {
struct stat Status;
int StatRet = ::fstat(FD, &Status);
return fillStatus(StatRet, Status, Result);
}
-error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
+std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
#if defined(HAVE_FUTIMENS)
timespec Times[2];
Times[0].tv_sec = Time.toEpochTime();
Times[0].tv_nsec = 0;
Times[1] = Times[0];
if (::futimens(FD, Times))
- return error_code(errno, system_category());
- return error_code::success();
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
#elif defined(HAVE_FUTIMES)
timeval Times[2];
Times[0].tv_sec = Time.toEpochTime();
Times[0].tv_usec = 0;
Times[1] = Times[0];
if (::futimes(FD, Times))
- return error_code(errno, system_category());
- return error_code::success();
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
#else
#warning Missing futimes() and futimens()
- return make_error_code(errc::not_supported);
+ return make_error_code(errc::function_not_supported);
#endif
}
-error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
AutoFD ScopedFD(FD);
if (!CloseFD)
ScopedFD.take();
@@ -478,7 +480,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
// Figure out how large the file is.
struct stat FileInfo;
if (fstat(FD, &FileInfo) == -1)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
uint64_t FileSize = FileInfo.st_size;
if (Size == 0)
@@ -486,7 +488,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
else if (FileSize < Size) {
// We need to grow the file.
if (ftruncate(FD, Size) == -1)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
}
int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
@@ -496,15 +498,15 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
#endif
Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
if (Mapping == MAP_FAILED)
- return error_code(errno, system_category());
- return error_code::success();
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
}
mapped_file_region::mapped_file_region(const Twine &path,
mapmode mode,
uint64_t length,
uint64_t offset,
- error_code &ec)
+ std::error_code &ec)
: Mode(mode)
, Size(length)
, Mapping() {
@@ -519,7 +521,7 @@ mapped_file_region::mapped_file_region(const Twine &path,
int oflags = (mode == readonly) ? O_RDONLY : O_RDWR;
int ofd = ::open(name.begin(), oflags);
if (ofd == -1) {
- ec = error_code(errno, system_category());
+ ec = std::error_code(errno, std::generic_category());
return;
}
@@ -533,7 +535,7 @@ mapped_file_region::mapped_file_region(int fd,
mapmode mode,
uint64_t length,
uint64_t offset,
- error_code &ec)
+ std::error_code &ec)
: Mode(mode)
, Size(length)
, Mapping() {
@@ -583,12 +585,12 @@ int mapped_file_region::alignment() {
return process::get_self()->page_size();
}
-error_code detail::directory_iterator_construct(detail::DirIterState &it,
+std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
StringRef path){
SmallString<128> path_null(path);
DIR *directory = ::opendir(path_null.c_str());
if (!directory)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
it.IterationHandle = reinterpret_cast<intptr_t>(directory);
// Add something for replace_filename to replace.
@@ -597,19 +599,19 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it,
return directory_iterator_increment(it);
}
-error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
if (it.IterationHandle)
::closedir(reinterpret_cast<DIR *>(it.IterationHandle));
it.IterationHandle = 0;
it.CurrentEntry = directory_entry();
- return error_code::success();
+ return std::error_code();
}
-error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
errno = 0;
dirent *cur_dir = ::readdir(reinterpret_cast<DIR *>(it.IterationHandle));
if (cur_dir == nullptr && errno != 0) {
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
} else if (cur_dir != nullptr) {
StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
if ((name.size() == 1 && name[0] == '.') ||
@@ -619,80 +621,20 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) {
} else
return directory_iterator_destruct(it);
- return error_code::success();
-}
-
-error_code get_magic(const Twine &path, uint32_t len,
- SmallVectorImpl<char> &result) {
- SmallString<128> PathStorage;
- StringRef Path = path.toNullTerminatedStringRef(PathStorage);
- result.set_size(0);
-
- // Open path.
- std::FILE *file = std::fopen(Path.data(), "rb");
- if (!file)
- return error_code(errno, system_category());
-
- // Reserve storage.
- result.reserve(len);
-
- // Read magic!
- size_t size = std::fread(result.data(), 1, len, file);
- if (std::ferror(file) != 0) {
- std::fclose(file);
- return error_code(errno, system_category());
- } else if (size != len) {
- if (std::feof(file) != 0) {
- std::fclose(file);
- result.set_size(size);
- return make_error_code(errc::value_too_large);
- }
- }
- std::fclose(file);
- result.set_size(size);
- return error_code::success();
-}
-
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
- bool map_writable, void *&result) {
- SmallString<128> path_storage;
- StringRef name = path.toNullTerminatedStringRef(path_storage);
- int oflags = map_writable ? O_RDWR : O_RDONLY;
- int ofd = ::open(name.begin(), oflags);
- if ( ofd == -1 )
- return error_code(errno, system_category());
- AutoFD fd(ofd);
- int flags = map_writable ? MAP_SHARED : MAP_PRIVATE;
- int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ;
-#ifdef MAP_FILE
- flags |= MAP_FILE;
-#endif
- result = ::mmap(nullptr, size, prot, flags, fd, file_offset);
- if (result == MAP_FAILED) {
- return error_code(errno, system_category());
- }
-
- return error_code::success();
-}
-
-error_code unmap_file_pages(void *base, size_t size) {
- if ( ::munmap(base, size) == -1 )
- return error_code(errno, system_category());
-
- return error_code::success();
+ return std::error_code();
}
-error_code openFileForRead(const Twine &Name, int &ResultFD) {
+std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
if (errno != EINTR)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
}
- return error_code::success();
+ return std::error_code();
}
-error_code openFileForWrite(const Twine &Name, int &ResultFD,
+std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
sys::fs::OpenFlags Flags, unsigned Mode) {
// Verify that we don't have both "append" and "excl".
assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
@@ -717,9 +659,9 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
StringRef P = Name.toNullTerminatedStringRef(Storage);
while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) {
if (errno != EINTR)
- return error_code(errno, system_category());
+ return std::error_code(errno, std::generic_category());
}
- return error_code::success();
+ return std::error_code();
}
} // end namespace fs
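
Two comparison styles appear above: raw errno values are now checked against the E* macros directly (errno != ENOENT), while an already-constructed std::error_code can still be compared against portable errc conditions. A standalone sketch of the latter:

    #include <cerrno>
    #include <system_error>

    static bool isMissingFile(const std::error_code &EC) {
      // error_code == errc goes through the category's condition mapping.
      return EC == std::errc::no_such_file_or_directory;
    }

    int main() {
      std::error_code EC(ENOENT, std::generic_category());
      return isMissingFile(EC) ? 0 : 1; // ENOENT maps to the condition above.
    }
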
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 8faa638..d2c5dbc 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -47,7 +47,6 @@
using namespace llvm;
using namespace sys;
-
process::id_type self_process::get_id() {
return getpid();
}
@@ -190,12 +189,13 @@ Optional<std::string> Process::GetEnv(StringRef Name) {
return std::string(Val);
}
-error_code Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
- ArrayRef<const char *> ArgsIn,
- SpecificBumpPtrAllocator<char> &) {
+std::error_code
+Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
+ ArrayRef<const char *> ArgsIn,
+ SpecificBumpPtrAllocator<char> &) {
ArgsOut.append(ArgsIn.begin(), ArgsIn.end());
- return error_code::success();
+ return std::error_code();
}
bool Process::StandardInIsUserInput() {
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index 1225a9c..06a33cd 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -48,6 +48,7 @@
#endif
namespace llvm {
+
using namespace sys;
ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
@@ -349,7 +350,11 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
// Parent process: Wait for the child process to terminate.
int status;
ProcessInfo WaitResult;
- WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+
+ do {
+ WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+ } while (WaitUntilTerminates && WaitResult.Pid == -1 && errno == EINTR);
+
if (WaitResult.Pid != PI.Pid) {
if (WaitResult.Pid == 0) {
// Non-blocking wait.
@@ -425,14 +430,14 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
return WaitResult;
}
-error_code sys::ChangeStdinToBinary(){
+std::error_code sys::ChangeStdinToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
- return make_error_code(errc::success);
+ return std::error_code();
}
-error_code sys::ChangeStdoutToBinary(){
+std::error_code sys::ChangeStdoutToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
- return make_error_code(errc::success);
+ return std::error_code();
}
bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
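
The Wait() change above is the classic EINTR retry: a blocking waitpid() interrupted by a signal returns -1 with errno == EINTR and should simply be reissued. A standalone sketch of the loop:

    #include <cerrno>
    #include <sys/wait.h>

    static pid_t waitRetryingOnEINTR(pid_t Child, int *Status) {
      pid_t Ret;
      do {
        Ret = ::waitpid(Child, Status, 0);
      } while (Ret == -1 && errno == EINTR); // Interrupted, not failed: retry.
      return Ret;
    }
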
diff --git a/lib/Support/Unix/system_error.inc b/lib/Support/Unix/system_error.inc
deleted file mode 100644
index 681e919..0000000
--- a/lib/Support/Unix/system_error.inc
+++ /dev/null
@@ -1,34 +0,0 @@
-//===- llvm/Support/Unix/system_error.inc - Unix error_code ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides the Unix specific implementation of the error_code
-// and error_condition classes.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic UNIX code that
-//=== is guaranteed to work on *all* UNIX variants.
-//===----------------------------------------------------------------------===//
-
-using namespace llvm;
-
-std::string
-_system_error_category::message(int ev) const {
- return _do_message::message(ev);
-}
-
-error_condition
-_system_error_category::default_error_condition(int ev) const {
-#ifdef ELAST
- if (ev > ELAST)
- return error_condition(ev, system_category());
-#endif // ELAST
- return error_condition(ev, generic_category());
-}
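
The deleted Unix system_error.inc existed only to map raw errno values onto generic error conditions; <system_error> ships that mapping in std::generic_category(), so the hand-rolled category can go. A standalone sketch of what such a category override looks like against the standard interface (DemoCategory is illustrative only):

    #include <string>
    #include <system_error>

    struct DemoCategory : std::error_category {
      const char *name() const noexcept override { return "demo"; }
      std::string message(int EV) const override {
        return std::generic_category().message(EV);
      }
      std::error_condition default_error_condition(int EV) const noexcept override {
        return std::error_condition(EV, std::generic_category());
      }
    };
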
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 5d0278f..5ed0b70 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -85,7 +85,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
}
SmallVector<wchar_t, MAX_PATH> filenameUnicode;
- if (error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
SetLastError(ec.value());
MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: ");
return DynamicLibrary();
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index ebe7878..ae8371a 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -15,6 +15,7 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/WindowsError.h"
// The Windows.h header must be the last one included.
#include "WindowsSupport.h"
@@ -69,8 +70,8 @@ namespace sys {
MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
const MemoryBlock *const NearBlock,
unsigned Flags,
- error_code &EC) {
- EC = error_code::success();
+ std::error_code &EC) {
+ EC = std::error_code();
if (NumBytes == 0)
return MemoryBlock();
@@ -99,7 +100,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
// Try again without the NearBlock hint
return allocateMappedMemory(NumBytes, NULL, Flags, EC);
}
- EC = error_code(::GetLastError(), system_category());
+ EC = mapWindowsError(::GetLastError());
return MemoryBlock();
}
@@ -113,34 +114,34 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
return Result;
}
-error_code Memory::releaseMappedMemory(MemoryBlock &M) {
+std::error_code Memory::releaseMappedMemory(MemoryBlock &M) {
if (M.Address == 0 || M.Size == 0)
- return error_code::success();
+ return std::error_code();
if (!VirtualFree(M.Address, 0, MEM_RELEASE))
- return error_code(::GetLastError(), system_category());
+ return mapWindowsError(::GetLastError());
M.Address = 0;
M.Size = 0;
- return error_code::success();
+ return std::error_code();
}
-error_code Memory::protectMappedMemory(const MemoryBlock &M,
+std::error_code Memory::protectMappedMemory(const MemoryBlock &M,
unsigned Flags) {
if (M.Address == 0 || M.Size == 0)
- return error_code::success();
+ return std::error_code();
DWORD Protect = getWindowsProtectionFlags(Flags);
DWORD OldFlags;
if (!VirtualProtect(M.Address, M.Size, Protect, &OldFlags))
- return error_code(::GetLastError(), system_category());
+ return mapWindowsError(::GetLastError());
if (Flags & MF_EXEC)
Memory::InvalidateInstructionCache(M.Address, M.Size);
- return error_code::success();
+ return std::error_code();
}
/// InvalidateInstructionCache - Before the JIT can run a block of code
@@ -156,18 +157,18 @@ MemoryBlock Memory::AllocateRWX(size_t NumBytes,
const MemoryBlock *NearBlock,
std::string *ErrMsg) {
MemoryBlock MB;
- error_code EC;
+ std::error_code EC;
MB = allocateMappedMemory(NumBytes, NearBlock,
MF_READ|MF_WRITE|MF_EXEC, EC);
- if (EC != error_code::success() && ErrMsg) {
+ if (EC != std::error_code() && ErrMsg) {
MakeErrMsg(ErrMsg, EC.message());
}
return MB;
}
bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
- error_code EC = releaseMappedMemory(M);
- if (EC == error_code::success())
+ std::error_code EC = releaseMappedMemory(M);
+ if (EC == std::error_code())
return false;
MakeErrMsg(ErrMsg, EC.message());
return true;
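
mapWindowsError(), from the newly included llvm/Support/WindowsError.h, converts a raw GetLastError() DWORD into a std::error_code once, instead of each call site stuffing the value into a category by hand. A standalone sketch of the idea; the two codes shown are real Win32 values, but the mapping table is illustrative only:

    #include <system_error>

    static std::error_code mapWin32Error(unsigned long Err) {
      switch (Err) {
      case 2 /*ERROR_FILE_NOT_FOUND*/:
        return std::make_error_code(std::errc::no_such_file_or_directory);
      case 5 /*ERROR_ACCESS_DENIED*/:
        return std::make_error_code(std::errc::permission_denied);
      default:
        return std::make_error_code(std::errc::io_error); // Fallback.
      }
    }
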
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index e59888e..7a1bc04 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/WindowsError.h"
#include <fcntl.h>
#include <io.h>
#include <sys/stat.h>
@@ -44,7 +45,11 @@ using namespace llvm;
using llvm::sys::windows::UTF8ToUTF16;
using llvm::sys::windows::UTF16ToUTF8;
-static error_code TempDir(SmallVectorImpl<char> &Result) {
+static std::error_code windows_error(DWORD E) {
+ return mapWindowsError(E);
+}
+
+static std::error_code TempDir(SmallVectorImpl<char> &Result) {
SmallVector<wchar_t, 64> Res;
retry_temp_dir:
DWORD Len = ::GetTempPathW(Res.capacity(), Res.begin());
@@ -119,7 +124,7 @@ TimeValue file_status::getLastModificationTime() const {
return Ret;
}
-error_code current_path(SmallVectorImpl<char> &result) {
+std::error_code current_path(SmallVectorImpl<char> &result) {
SmallVector<wchar_t, MAX_PATH> cur_path;
DWORD len = MAX_PATH;
@@ -141,30 +146,30 @@ error_code current_path(SmallVectorImpl<char> &result) {
return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
}
-error_code create_directory(const Twine &path, bool IgnoreExisting) {
+std::error_code create_directory(const Twine &path, bool IgnoreExisting) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
- if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
- path_utf16))
+ if (std::error_code ec =
+ UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
return ec;
if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
- error_code ec = windows_error(::GetLastError());
- if (ec != windows_error::already_exists || !IgnoreExisting)
- return ec;
+ DWORD LastError = ::GetLastError();
+ if (LastError != ERROR_ALREADY_EXISTS || !IgnoreExisting)
+ return windows_error(LastError);
}
- return error_code::success();
+ return std::error_code();
}
-error_code normalize_separators(SmallVectorImpl<char> &Path) {
+std::error_code normalize_separators(SmallVectorImpl<char> &Path) {
(void) Path;
- return error_code::success();
+ return std::error_code();
}
// We can't use symbolic links for windows.
-error_code create_link(const Twine &to, const Twine &from) {
+std::error_code create_link(const Twine &to, const Twine &from) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
@@ -174,47 +179,49 @@ error_code create_link(const Twine &to, const Twine &from) {
// Convert to utf-16.
SmallVector<wchar_t, 128> wide_from;
SmallVector<wchar_t, 128> wide_to;
- if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
- if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+ if (std::error_code ec = UTF8ToUTF16(f, wide_from))
+ return ec;
+ if (std::error_code ec = UTF8ToUTF16(t, wide_to))
+ return ec;
if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL))
return windows_error(::GetLastError());
- return error_code::success();
+ return std::error_code();
}
-error_code remove(const Twine &path, bool IgnoreNonExisting) {
+std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
file_status ST;
- if (error_code EC = status(path, ST)) {
+ if (std::error_code EC = status(path, ST)) {
if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting)
return EC;
- return error_code::success();
+ return std::error_code();
}
- if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
- path_utf16))
+ if (std::error_code ec =
+ UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
return ec;
if (ST.type() == file_type::directory_file) {
if (!::RemoveDirectoryW(c_str(path_utf16))) {
- error_code EC = windows_error(::GetLastError());
+ std::error_code EC = windows_error(::GetLastError());
if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting)
return EC;
}
- return error_code::success();
+ return std::error_code();
}
if (!::DeleteFileW(c_str(path_utf16))) {
- error_code EC = windows_error(::GetLastError());
+ std::error_code EC = windows_error(::GetLastError());
if (EC != errc::no_such_file_or_directory || !IgnoreNonExisting)
return EC;
}
- return error_code::success();
+ return std::error_code();
}
-error_code rename(const Twine &from, const Twine &to) {
+std::error_code rename(const Twine &from, const Twine &to) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
@@ -224,16 +231,18 @@ error_code rename(const Twine &from, const Twine &to) {
// Convert to utf-16.
SmallVector<wchar_t, 128> wide_from;
SmallVector<wchar_t, 128> wide_to;
- if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
- if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+ if (std::error_code ec = UTF8ToUTF16(f, wide_from))
+ return ec;
+ if (std::error_code ec = UTF8ToUTF16(t, wide_to))
+ return ec;
- error_code ec = error_code::success();
+ std::error_code ec = std::error_code();
for (int i = 0; i < 2000; i++) {
if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
- return error_code::success();
- ec = windows_error(::GetLastError());
- if (ec != windows_error::access_denied)
+ return std::error_code();
+ DWORD LastError = ::GetLastError();
+ if (LastError != ERROR_ACCESS_DENIED)
break;
// Retry MoveFile() at ACCESS_DENIED.
// System scanners (eg. indexer) might open the source file when
@@ -244,46 +253,46 @@ error_code rename(const Twine &from, const Twine &to) {
return ec;
}
-error_code resize_file(const Twine &path, uint64_t size) {
+std::error_code resize_file(const Twine &path, uint64_t size) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
- if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
- path_utf16))
+ if (std::error_code ec =
+ UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
return ec;
int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
if (fd == -1)
- return error_code(errno, generic_category());
+ return std::error_code(errno, std::generic_category());
#ifdef HAVE__CHSIZE_S
errno_t error = ::_chsize_s(fd, size);
#else
errno_t error = ::_chsize(fd, size);
#endif
::close(fd);
- return error_code(error, generic_category());
+ return std::error_code(error, std::generic_category());
}
-error_code exists(const Twine &path, bool &result) {
+std::error_code exists(const Twine &path, bool &result) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
- if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
- path_utf16))
+ if (std::error_code ec =
+ UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
return ec;
DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
if (attributes == INVALID_FILE_ATTRIBUTES) {
// See if the file didn't actually exist.
- error_code ec = make_error_code(windows_error(::GetLastError()));
- if (ec != windows_error::file_not_found &&
- ec != windows_error::path_not_found)
- return ec;
+ DWORD LastError = ::GetLastError();
+ if (LastError != ERROR_FILE_NOT_FOUND &&
+ LastError != ERROR_PATH_NOT_FOUND)
+ return windows_error(LastError);
result = false;
} else
result = true;
- return error_code::success();
+ return std::error_code();
}
bool can_write(const Twine &Path) {
@@ -320,12 +329,14 @@ bool equivalent(file_status A, file_status B) {
A.VolumeSerialNumber == B.VolumeSerialNumber;
}
-error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
file_status fsA, fsB;
- if (error_code ec = status(A, fsA)) return ec;
- if (error_code ec = status(B, fsB)) return ec;
+ if (std::error_code ec = status(A, fsA))
+ return ec;
+ if (std::error_code ec = status(B, fsB))
+ return ec;
result = equivalent(fsA, fsB);
- return error_code::success();
+ return std::error_code();
}
static bool isReservedName(StringRef path) {
@@ -351,7 +362,7 @@ static bool isReservedName(StringRef path) {
return false;
}
-static error_code getStatus(HANDLE FileHandle, file_status &Result) {
+static std::error_code getStatus(HANDLE FileHandle, file_status &Result) {
if (FileHandle == INVALID_HANDLE_VALUE)
goto handle_status_error;
@@ -363,16 +374,16 @@ static error_code getStatus(HANDLE FileHandle, file_status &Result) {
if (Err != NO_ERROR)
return windows_error(Err);
Result = file_status(file_type::type_unknown);
- return error_code::success();
+ return std::error_code();
}
case FILE_TYPE_DISK:
break;
case FILE_TYPE_CHAR:
Result = file_status(file_type::character_file);
- return error_code::success();
+ return std::error_code();
case FILE_TYPE_PIPE:
Result = file_status(file_type::fifo_file);
- return error_code::success();
+ return std::error_code();
}
BY_HANDLE_FILE_INFORMATION Info;
@@ -388,32 +399,32 @@ static error_code getStatus(HANDLE FileHandle, file_status &Result) {
Info.ftLastWriteTime.dwLowDateTime,
Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
- return error_code::success();
+ return std::error_code();
}
handle_status_error:
- error_code EC = windows_error(::GetLastError());
- if (EC == windows_error::file_not_found ||
- EC == windows_error::path_not_found)
+ DWORD LastError = ::GetLastError();
+ if (LastError == ERROR_FILE_NOT_FOUND ||
+ LastError == ERROR_PATH_NOT_FOUND)
Result = file_status(file_type::file_not_found);
- else if (EC == windows_error::sharing_violation)
+ else if (LastError == ERROR_SHARING_VIOLATION)
Result = file_status(file_type::type_unknown);
else
Result = file_status(file_type::status_error);
- return EC;
+ return windows_error(LastError);
}
-error_code status(const Twine &path, file_status &result) {
+std::error_code status(const Twine &path, file_status &result) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
StringRef path8 = path.toStringRef(path_storage);
if (isReservedName(path8)) {
result = file_status(file_type::character_file);
- return error_code::success();
+ return std::error_code();
}
- if (error_code ec = UTF8ToUTF16(path8, path_utf16))
+ if (std::error_code ec = UTF8ToUTF16(path8, path_utf16))
return ec;
DWORD attr = ::GetFileAttributesW(path_utf16.begin());
@@ -444,12 +455,12 @@ error_code status(const Twine &path, file_status &result) {
return getStatus(h, result);
}
-error_code status(int FD, file_status &Result) {
+std::error_code status(int FD, file_status &Result) {
HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
return getStatus(FileHandle, Result);
}
-error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
+std::error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
ULARGE_INTEGER UI;
UI.QuadPart = Time.toWin32Time();
FILETIME FT;
@@ -458,52 +469,10 @@ error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
if (!SetFileTime(FileHandle, NULL, &FT, &FT))
return windows_error(::GetLastError());
- return error_code::success();
-}
-
-error_code get_magic(const Twine &path, uint32_t len,
- SmallVectorImpl<char> &result) {
- SmallString<128> path_storage;
- SmallVector<wchar_t, 128> path_utf16;
- result.set_size(0);
-
- // Convert path to UTF-16.
- if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
- path_utf16))
- return ec;
-
- // Open file.
- HANDLE file = ::CreateFileW(c_str(path_utf16),
- GENERIC_READ,
- FILE_SHARE_READ,
- NULL,
- OPEN_EXISTING,
- FILE_ATTRIBUTE_READONLY,
- NULL);
- if (file == INVALID_HANDLE_VALUE)
- return windows_error(::GetLastError());
-
- // Allocate buffer.
- result.reserve(len);
-
- // Get magic!
- DWORD bytes_read = 0;
- BOOL read_success = ::ReadFile(file, result.data(), len, &bytes_read, NULL);
- error_code ec = windows_error(::GetLastError());
- ::CloseHandle(file);
- if (!read_success || (bytes_read != len)) {
- // Set result size to the number of bytes read if it's valid.
- if (bytes_read <= len)
- result.set_size(bytes_read);
- // ERROR_HANDLE_EOF is mapped to errc::value_too_large.
- return ec;
- }
-
- result.set_size(len);
- return error_code::success();
+ return std::error_code();
}
-error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+std::error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
FileDescriptor = FD;
// Make sure that the requested size fits within SIZE_T.
if (Size > std::numeric_limits<SIZE_T>::max()) {
@@ -528,7 +497,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
(Offset + Size) & 0xffffffff,
0);
if (FileMappingHandle == NULL) {
- error_code ec = windows_error(GetLastError());
+ std::error_code ec = windows_error(GetLastError());
if (FileDescriptor) {
if (CloseFD)
_close(FileDescriptor);
@@ -549,7 +518,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
Offset & 0xffffffff,
Size);
if (Mapping == NULL) {
- error_code ec = windows_error(GetLastError());
+ std::error_code ec = windows_error(GetLastError());
::CloseHandle(FileMappingHandle);
if (FileDescriptor) {
if (CloseFD)
@@ -563,7 +532,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
MEMORY_BASIC_INFORMATION mbi;
SIZE_T Result = VirtualQuery(Mapping, &mbi, sizeof(mbi));
if (Result == 0) {
- error_code ec = windows_error(GetLastError());
+ std::error_code ec = windows_error(GetLastError());
::UnmapViewOfFile(Mapping);
::CloseHandle(FileMappingHandle);
if (FileDescriptor) {
@@ -584,14 +553,14 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
_close(FileDescriptor); // Also closes FileHandle.
} else
::CloseHandle(FileHandle);
- return error_code::success();
+ return std::error_code();
}
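Every failure path in mapped_file_region::init() above follows the same capture-then-clean-up idiom: read GetLastError() and convert it before any cleanup call, because CloseHandle() and friends can clobber the thread's last-error value. A minimal sketch of the shape, assuming the mapWindowsError() helper from llvm/Support/WindowsError.h:

#include <windows.h>
#include <system_error>
#include "llvm/Support/WindowsError.h"
std::error_code mapOrFail(HANDLE File, uint64_t Len) {
  HANDLE Mapping = ::CreateFileMappingW(File, nullptr, PAGE_READONLY,
                                        Len >> 32, Len & 0xffffffff, nullptr);
  if (Mapping == NULL) {
    std::error_code EC = llvm::mapWindowsError(::GetLastError()); // capture first
    ::CloseHandle(File);                                          // clean up second
    return EC;
  }
  ::CloseHandle(Mapping);
  ::CloseHandle(File);
  return std::error_code(); // default-constructed code means success
}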
mapped_file_region::mapped_file_region(const Twine &path,
mapmode mode,
uint64_t length,
uint64_t offset,
- error_code &ec)
+ std::error_code &ec)
: Mode(mode)
, Size(length)
, Mapping()
@@ -636,7 +605,7 @@ mapped_file_region::mapped_file_region(int fd,
mapmode mode,
uint64_t length,
uint64_t offset,
- error_code &ec)
+ std::error_code &ec)
: Mode(mode)
, Size(length)
, Mapping()
@@ -704,12 +673,11 @@ int mapped_file_region::alignment() {
return SysInfo.dwAllocationGranularity;
}
-error_code detail::directory_iterator_construct(detail::DirIterState &it,
+std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
StringRef path){
SmallVector<wchar_t, 128> path_utf16;
- if (error_code ec = UTF8ToUTF16(path,
- path_utf16))
+ if (std::error_code ec = UTF8ToUTF16(path, path_utf16))
return ec;
// Convert path to the format that Windows is happy with.
@@ -733,19 +701,19 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it,
(FilenameLen == 2 && FirstFind.cFileName[0] == L'.' &&
FirstFind.cFileName[1] == L'.'))
if (!::FindNextFileW(FindHandle, &FirstFind)) {
- error_code ec = windows_error(::GetLastError());
+ DWORD LastError = ::GetLastError();
// Check for end.
- if (ec == windows_error::no_more_files)
+ if (LastError == ERROR_NO_MORE_FILES)
return detail::directory_iterator_destruct(it);
- return ec;
+ return windows_error(LastError);
} else
FilenameLen = ::wcslen(FirstFind.cFileName);
// Construct the current directory entry.
SmallString<128> directory_entry_name_utf8;
- if (error_code ec = UTF16ToUTF8(FirstFind.cFileName,
- ::wcslen(FirstFind.cFileName),
- directory_entry_name_utf8))
+ if (std::error_code ec =
+ UTF16ToUTF8(FirstFind.cFileName, ::wcslen(FirstFind.cFileName),
+ directory_entry_name_utf8))
return ec;
it.IterationHandle = intptr_t(FindHandle.take());
@@ -753,26 +721,26 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it,
path::append(directory_entry_path, directory_entry_name_utf8.str());
it.CurrentEntry = directory_entry(directory_entry_path.str());
- return error_code::success();
+ return std::error_code();
}
-error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
if (it.IterationHandle != 0)
// Closes the handle if it's valid.
ScopedFindHandle close(HANDLE(it.IterationHandle));
it.IterationHandle = 0;
it.CurrentEntry = directory_entry();
- return error_code::success();
+ return std::error_code();
}
-error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
WIN32_FIND_DATAW FindData;
if (!::FindNextFileW(HANDLE(it.IterationHandle), &FindData)) {
- error_code ec = windows_error(::GetLastError());
+ DWORD LastError = ::GetLastError();
// Check for end.
- if (ec == windows_error::no_more_files)
+ if (LastError == ERROR_NO_MORE_FILES)
return detail::directory_iterator_destruct(it);
- return ec;
+ return windows_error(LastError);
}
size_t FilenameLen = ::wcslen(FindData.cFileName);
@@ -782,60 +750,50 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return directory_iterator_increment(it);
SmallString<128> directory_entry_path_utf8;
- if (error_code ec = UTF16ToUTF8(FindData.cFileName,
- ::wcslen(FindData.cFileName),
- directory_entry_path_utf8))
+ if (std::error_code ec =
+ UTF16ToUTF8(FindData.cFileName, ::wcslen(FindData.cFileName),
+ directory_entry_path_utf8))
return ec;
it.CurrentEntry.replace_filename(Twine(directory_entry_path_utf8));
- return error_code::success();
+ return std::error_code();
}
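Both iterator functions above treat ERROR_NO_MORE_FILES as clean termination rather than failure. A hedged sketch of that contract against the raw Win32 calls, again assuming mapWindowsError():

#include <windows.h>
#include <system_error>
#include "llvm/Support/WindowsError.h"
std::error_code visitDirectory(const wchar_t *Pattern) {
  WIN32_FIND_DATAW FD;
  HANDLE H = ::FindFirstFileW(Pattern, &FD);
  if (H == INVALID_HANDLE_VALUE)
    return llvm::mapWindowsError(::GetLastError());
  do {
    // ... consume FD.cFileName here ...
  } while (::FindNextFileW(H, &FD));
  DWORD LastError = ::GetLastError(); // read before FindClose can clobber it
  ::FindClose(H);
  return LastError == ERROR_NO_MORE_FILES ? std::error_code()
                                          : llvm::mapWindowsError(LastError);
}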
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
- bool map_writable, void *&result) {
- assert(0 && "NOT IMPLEMENTED");
- return windows_error::invalid_function;
-}
-
-error_code unmap_file_pages(void *base, size_t size) {
- assert(0 && "NOT IMPLEMENTED");
- return windows_error::invalid_function;
-}
-
-error_code openFileForRead(const Twine &Name, int &ResultFD) {
+std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
SmallString<128> PathStorage;
SmallVector<wchar_t, 128> PathUTF16;
- if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage),
- PathUTF16))
+ if (std::error_code EC =
+ UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16))
return EC;
HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (H == INVALID_HANDLE_VALUE) {
- error_code EC = windows_error(::GetLastError());
+ DWORD LastError = ::GetLastError();
+ std::error_code EC = windows_error(LastError);
// Provide a better error message when trying to open directories.
// This only runs if we failed to open the file, so there are probably
// no performance issues.
- if (EC != windows_error::access_denied)
+ if (LastError != ERROR_ACCESS_DENIED)
return EC;
if (is_directory(Name))
- return error_code(errc::is_a_directory, posix_category());
+ return make_error_code(errc::is_a_directory);
return EC;
}
int FD = ::_open_osfhandle(intptr_t(H), 0);
if (FD == -1) {
::CloseHandle(H);
- return windows_error::invalid_handle;
+ return windows_error(ERROR_INVALID_HANDLE);
}
ResultFD = FD;
- return error_code::success();
+ return std::error_code();
}
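Returning make_error_code(errc::is_a_directory) instead of a posix_category value pays off on the caller side: std::error_code compares against std::errc portably through its error_condition machinery. A usage sketch (the path is illustrative only):

#include <system_error>
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
void tryOpen() {
  int FD;
  if (std::error_code EC = llvm::sys::fs::openFileForRead("C:\\Windows", FD)) {
    if (EC == std::errc::is_a_directory)
      llvm::errs() << "refusing to open a directory\n";
    else
      llvm::errs() << "open failed: " << EC.message() << "\n";
  }
}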
-error_code openFileForWrite(const Twine &Name, int &ResultFD,
+std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
sys::fs::OpenFlags Flags, unsigned Mode) {
// Verify that we don't have both "append" and "excl".
assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
@@ -844,8 +802,8 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
SmallString<128> PathStorage;
SmallVector<wchar_t, 128> PathUTF16;
- if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage),
- PathUTF16))
+ if (std::error_code EC =
+ UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16))
return EC;
DWORD CreationDisposition;
@@ -865,14 +823,15 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
CreationDisposition, FILE_ATTRIBUTE_NORMAL, NULL);
if (H == INVALID_HANDLE_VALUE) {
- error_code EC = windows_error(::GetLastError());
+ DWORD LastError = ::GetLastError();
+ std::error_code EC = windows_error(LastError);
// Provide a better error message when trying to open directories.
// This only runs if we failed to open the file, so there are probably
// no performance issues.
- if (EC != windows_error::access_denied)
+ if (LastError != ERROR_ACCESS_DENIED)
return EC;
if (is_directory(Name))
- return error_code(errc::is_a_directory, posix_category());
+ return make_error_code(errc::is_a_directory);
return EC;
}
@@ -886,11 +845,11 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
int FD = ::_open_osfhandle(intptr_t(H), OpenFlags);
if (FD == -1) {
::CloseHandle(H);
- return windows_error::invalid_handle;
+ return windows_error(ERROR_INVALID_HANDLE);
}
ResultFD = FD;
- return error_code::success();
+ return std::error_code();
}
} // end namespace fs
@@ -911,14 +870,14 @@ bool home_directory(SmallVectorImpl<char> &result) {
} // end namespace path
namespace windows {
-llvm::error_code UTF8ToUTF16(llvm::StringRef utf8,
- llvm::SmallVectorImpl<wchar_t> &utf16) {
+std::error_code UTF8ToUTF16(llvm::StringRef utf8,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
if (!utf8.empty()) {
int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(),
utf8.size(), utf16.begin(), 0);
if (len == 0)
- return llvm::windows_error(::GetLastError());
+ return windows_error(::GetLastError());
utf16.reserve(len + 1);
utf16.set_size(len);
@@ -927,25 +886,25 @@ llvm::error_code UTF8ToUTF16(llvm::StringRef utf8,
utf8.size(), utf16.begin(), utf16.size());
if (len == 0)
- return llvm::windows_error(::GetLastError());
+ return windows_error(::GetLastError());
}
// Make utf16 null terminated.
utf16.push_back(0);
utf16.pop_back();
- return llvm::error_code::success();
+ return std::error_code();
}
-llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
- llvm::SmallVectorImpl<char> &utf8) {
+std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+ llvm::SmallVectorImpl<char> &utf8) {
if (utf16_len) {
// Get length.
int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(),
0, NULL, NULL);
if (len == 0)
- return llvm::windows_error(::GetLastError());
+ return windows_error(::GetLastError());
utf8.reserve(len);
utf8.set_size(len);
@@ -955,14 +914,14 @@ llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
utf8.size(), NULL, NULL);
if (len == 0)
- return llvm::windows_error(::GetLastError());
+ return windows_error(::GetLastError());
}
// Make utf8 null terminated.
utf8.push_back(0);
utf8.pop_back();
- return llvm::error_code::success();
+ return std::error_code();
}
} // end namespace windows
} // end namespace sys
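Both converters rely on the Win32 two-pass sizing idiom: call the conversion function with an output size of zero to learn the required length, size the buffer, then convert for real. A standalone sketch of the same shape with a std::vector in place of SmallVector:

#include <windows.h>
#include <vector>
bool toWide(const char *Src, int SrcLen, std::vector<wchar_t> &Out) {
  int N = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, Src, SrcLen,
                                nullptr, 0);        // pass 1: measure only
  if (N == 0)
    return false;
  Out.resize(N);
  return ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, Src, SrcLen,
                               Out.data(), N) != 0; // pass 2: convert
}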
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index c3df801..81aee0e 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/WindowsError.h"
#include <malloc.h>
// The Windows.h header must be after LLVM and standard headers.
@@ -47,7 +49,6 @@
using namespace llvm;
using namespace sys;
-
process::id_type self_process::get_id() {
return GetCurrentProcessId();
}
@@ -178,12 +179,16 @@ Optional<std::string> Process::GetEnv(StringRef Name) {
return std::string(Res.data());
}
-error_code
+static std::error_code windows_error(DWORD E) {
+ return mapWindowsError(E);
+}
+
+std::error_code
Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
ArrayRef<const char *>,
SpecificBumpPtrAllocator<char> &ArgAllocator) {
int NewArgCount;
- error_code ec;
+ std::error_code ec;
wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(),
&NewArgCount);
@@ -208,7 +213,7 @@ Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
if (ec)
return ec;
- return error_code::success();
+ return std::error_code();
}
bool Process::StandardInIsUserInput() {
@@ -363,12 +368,12 @@ unsigned Process::GetRandomNumber() {
HCRYPTPROV HCPC;
if (!::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL,
CRYPT_VERIFYCONTEXT))
- assert(false && "Could not acquire a cryptographic context");
+ report_fatal_error("Could not acquire a cryptographic context");
ScopedCryptContext CryptoProvider(HCPC);
unsigned Ret;
if (!::CryptGenRandom(CryptoProvider, sizeof(Ret),
reinterpret_cast<BYTE *>(&Ret)))
- assert(false && "Could not generate a random number");
+ report_fatal_error("Could not generate a random number");
return Ret;
}
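The assert-to-report_fatal_error swap matters in release builds: assert() compiles away under NDEBUG, so execution would have continued with an unusable crypto context, while report_fatal_error() aborts unconditionally. A minimal sketch of the distinction:

#include <cassert>
#include "llvm/Support/ErrorHandling.h"
void requireCryptoContext(bool Acquired) {
  assert(Acquired && "only checked when NDEBUG is not defined");
  if (!Acquired) // still enforced in release builds
    llvm::report_fatal_error("Could not acquire a cryptographic context");
}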
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 5827c10..b2f71ae 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -226,7 +226,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
// an environment block by concatenating them.
for (unsigned i = 0; envp[i]; ++i) {
SmallVector<wchar_t, MAX_PATH> EnvString;
- if (error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16");
return false;
@@ -290,7 +290,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
fflush(stderr);
SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
- if (error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg,
std::string("Unable to convert application name to UTF-16"));
@@ -298,7 +298,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
}
SmallVector<wchar_t, MAX_PATH> CommandUtf16;
- if (error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
+ if (std::error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg,
std::string("Unable to convert command-line to UTF-16"));
@@ -422,18 +422,18 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
return WaitResult;
}
-error_code sys::ChangeStdinToBinary(){
+std::error_code sys::ChangeStdinToBinary(){
int result = _setmode( _fileno(stdin), _O_BINARY );
if (result == -1)
- return error_code(errno, generic_category());
- return make_error_code(errc::success);
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
}
-error_code sys::ChangeStdoutToBinary(){
+std::error_code sys::ChangeStdoutToBinary(){
int result = _setmode( _fileno(stdout), _O_BINARY );
if (result == -1)
- return error_code(errno, generic_category());
- return make_error_code(errc::success);
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
}
bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
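The ChangeStdinToBinary/ChangeStdoutToBinary bodies above show the standard errno idiom for std::error_code: wrap errno in std::generic_category() on failure and return a default-constructed code on success. The same shape with a plain C runtime call:

#include <cerrno>
#include <cstdio>
#include <system_error>
std::error_code removeFile(const char *Path) {
  if (std::remove(Path) != 0)
    return std::error_code(errno, std::generic_category());
  return std::error_code(); // success
}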
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index 6bef444..f68835b 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -32,7 +32,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h" // Get build system configuration settings
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
#include <windows.h>
#include <wincrypt.h>
#include <cassert>
@@ -163,10 +163,9 @@ c_str(SmallVectorImpl<T> &str) {
namespace sys {
namespace windows {
-error_code UTF8ToUTF16(StringRef utf8,
- SmallVectorImpl<wchar_t> &utf16);
-error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
- SmallVectorImpl<char> &utf8);
+std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
+std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+ SmallVectorImpl<char> &utf8);
} // end namespace windows
} // end namespace sys
} // end namespace llvm.
diff --git a/lib/Support/Windows/system_error.inc b/lib/Support/Windows/system_error.inc
deleted file mode 100644
index 37ec81d..0000000
--- a/lib/Support/Windows/system_error.inc
+++ /dev/null
@@ -1,142 +0,0 @@
-//===- llvm/Support/Win32/system_error.inc - Windows error_code --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides the Windows specific implementation of the error_code
-// and error_condition classes.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic Windows code that
-//=== is guaranteed to work on *all* Windows variants.
-//===----------------------------------------------------------------------===//
-
-#include <windows.h>
-#include <winerror.h>
-
-using namespace llvm;
-
-std::string
-_system_error_category::message(int ev) const {
- LPVOID lpMsgBuf = 0;
- DWORD retval = ::FormatMessageA(
- FORMAT_MESSAGE_ALLOCATE_BUFFER |
- FORMAT_MESSAGE_FROM_SYSTEM |
- FORMAT_MESSAGE_IGNORE_INSERTS,
- NULL,
- ev,
- MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language
- (LPSTR) &lpMsgBuf,
- 0,
- NULL);
- if (retval == 0) {
- ::LocalFree(lpMsgBuf);
- return std::string("Unknown error");
- }
-
- std::string str( static_cast<LPCSTR>(lpMsgBuf) );
- ::LocalFree(lpMsgBuf);
-
- while (str.size()
- && (str[str.size()-1] == '\n' || str[str.size()-1] == '\r'))
- str.erase( str.size()-1 );
- if (str.size() && str[str.size()-1] == '.')
- str.erase( str.size()-1 );
- return str;
-}
-
-// I'd rather not double the line count of the following.
-#define MAP_ERR_TO_COND(x, y) case x: return make_error_condition(errc::y)
-
-error_condition
-_system_error_category::default_error_condition(int ev) const {
- switch (ev) {
- MAP_ERR_TO_COND(0, success);
- // Windows system -> posix_errno decode table ---------------------------//
- // see WinError.h comments for descriptions of errors
- MAP_ERR_TO_COND(ERROR_ACCESS_DENIED, permission_denied);
- MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS, file_exists);
- MAP_ERR_TO_COND(ERROR_BAD_UNIT, no_such_device);
- MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW, filename_too_long);
- MAP_ERR_TO_COND(ERROR_BUSY, device_or_resource_busy);
- MAP_ERR_TO_COND(ERROR_BUSY_DRIVE, device_or_resource_busy);
- MAP_ERR_TO_COND(ERROR_CANNOT_MAKE, permission_denied);
- MAP_ERR_TO_COND(ERROR_CANTOPEN, io_error);
- MAP_ERR_TO_COND(ERROR_CANTREAD, io_error);
- MAP_ERR_TO_COND(ERROR_CANTWRITE, io_error);
- MAP_ERR_TO_COND(ERROR_CURRENT_DIRECTORY, permission_denied);
- MAP_ERR_TO_COND(ERROR_DEV_NOT_EXIST, no_such_device);
- MAP_ERR_TO_COND(ERROR_DEVICE_IN_USE, device_or_resource_busy);
- MAP_ERR_TO_COND(ERROR_DIR_NOT_EMPTY, directory_not_empty);
- MAP_ERR_TO_COND(ERROR_DIRECTORY, invalid_argument);
- MAP_ERR_TO_COND(ERROR_DISK_FULL, no_space_on_device);
- MAP_ERR_TO_COND(ERROR_FILE_EXISTS, file_exists);
- MAP_ERR_TO_COND(ERROR_FILE_NOT_FOUND, no_such_file_or_directory);
- MAP_ERR_TO_COND(ERROR_HANDLE_DISK_FULL, no_space_on_device);
- MAP_ERR_TO_COND(ERROR_HANDLE_EOF, value_too_large);
- MAP_ERR_TO_COND(ERROR_INVALID_ACCESS, permission_denied);
- MAP_ERR_TO_COND(ERROR_INVALID_DRIVE, no_such_device);
- MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION, function_not_supported);
- MAP_ERR_TO_COND(ERROR_INVALID_HANDLE, invalid_argument);
- MAP_ERR_TO_COND(ERROR_INVALID_NAME, invalid_argument);
- MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION, no_lock_available);
- MAP_ERR_TO_COND(ERROR_LOCKED, no_lock_available);
- MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK, invalid_argument);
- MAP_ERR_TO_COND(ERROR_NOACCESS, permission_denied);
- MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY, not_enough_memory);
- MAP_ERR_TO_COND(ERROR_NOT_READY, resource_unavailable_try_again);
- MAP_ERR_TO_COND(ERROR_NOT_SAME_DEVICE, cross_device_link);
- MAP_ERR_TO_COND(ERROR_OPEN_FAILED, io_error);
- MAP_ERR_TO_COND(ERROR_OPEN_FILES, device_or_resource_busy);
- MAP_ERR_TO_COND(ERROR_OPERATION_ABORTED, operation_canceled);
- MAP_ERR_TO_COND(ERROR_OUTOFMEMORY, not_enough_memory);
- MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND, no_such_file_or_directory);
- MAP_ERR_TO_COND(ERROR_BAD_NETPATH, no_such_file_or_directory);
- MAP_ERR_TO_COND(ERROR_READ_FAULT, io_error);
- MAP_ERR_TO_COND(ERROR_RETRY, resource_unavailable_try_again);
- MAP_ERR_TO_COND(ERROR_SEEK, io_error);
- MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION, permission_denied);
- MAP_ERR_TO_COND(ERROR_TOO_MANY_OPEN_FILES, too_many_files_open);
- MAP_ERR_TO_COND(ERROR_WRITE_FAULT, io_error);
- MAP_ERR_TO_COND(ERROR_WRITE_PROTECT, permission_denied);
- MAP_ERR_TO_COND(ERROR_SEM_TIMEOUT, timed_out);
- MAP_ERR_TO_COND(WSAEACCES, permission_denied);
- MAP_ERR_TO_COND(WSAEADDRINUSE, address_in_use);
- MAP_ERR_TO_COND(WSAEADDRNOTAVAIL, address_not_available);
- MAP_ERR_TO_COND(WSAEAFNOSUPPORT, address_family_not_supported);
- MAP_ERR_TO_COND(WSAEALREADY, connection_already_in_progress);
- MAP_ERR_TO_COND(WSAEBADF, bad_file_descriptor);
- MAP_ERR_TO_COND(WSAECONNABORTED, connection_aborted);
- MAP_ERR_TO_COND(WSAECONNREFUSED, connection_refused);
- MAP_ERR_TO_COND(WSAECONNRESET, connection_reset);
- MAP_ERR_TO_COND(WSAEDESTADDRREQ, destination_address_required);
- MAP_ERR_TO_COND(WSAEFAULT, bad_address);
- MAP_ERR_TO_COND(WSAEHOSTUNREACH, host_unreachable);
- MAP_ERR_TO_COND(WSAEINPROGRESS, operation_in_progress);
- MAP_ERR_TO_COND(WSAEINTR, interrupted);
- MAP_ERR_TO_COND(WSAEINVAL, invalid_argument);
- MAP_ERR_TO_COND(WSAEISCONN, already_connected);
- MAP_ERR_TO_COND(WSAEMFILE, too_many_files_open);
- MAP_ERR_TO_COND(WSAEMSGSIZE, message_size);
- MAP_ERR_TO_COND(WSAENAMETOOLONG, filename_too_long);
- MAP_ERR_TO_COND(WSAENETDOWN, network_down);
- MAP_ERR_TO_COND(WSAENETRESET, network_reset);
- MAP_ERR_TO_COND(WSAENETUNREACH, network_unreachable);
- MAP_ERR_TO_COND(WSAENOBUFS, no_buffer_space);
- MAP_ERR_TO_COND(WSAENOPROTOOPT, no_protocol_option);
- MAP_ERR_TO_COND(WSAENOTCONN, not_connected);
- MAP_ERR_TO_COND(WSAENOTSOCK, not_a_socket);
- MAP_ERR_TO_COND(WSAEOPNOTSUPP, operation_not_supported);
- MAP_ERR_TO_COND(WSAEPROTONOSUPPORT, protocol_not_supported);
- MAP_ERR_TO_COND(WSAEPROTOTYPE, wrong_protocol_type);
- MAP_ERR_TO_COND(WSAETIMEDOUT, timed_out);
- MAP_ERR_TO_COND(WSAEWOULDBLOCK, operation_would_block);
- default: return error_condition(ev, system_category());
- }
-}
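The deleted MAP_ERR_TO_COND table survives in spirit in mapWindowsError() (llvm/Support/WindowsError.h), which performs the same Win32-to-errc translation but produces a std::error_code directly. A cut-down sketch of the idea, with the raw numeric values of the two Win32 constants shown for clarity:

#include <system_error>
std::error_code mapWin32(unsigned EV) {
  switch (EV) {
  case 2: // ERROR_FILE_NOT_FOUND
    return std::make_error_code(std::errc::no_such_file_or_directory);
  case 5: // ERROR_ACCESS_DENIED
    return std::make_error_code(std::errc::permission_denied);
  default: // anything unmapped stays in the platform category
    return std::error_code(EV, std::system_category());
  }
}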
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index e5f9494..5212624 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Errc.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Casting.h"
@@ -56,9 +57,7 @@ Input::Input(StringRef InputContent,
Input::~Input() {
}
-error_code Input::error() {
- return EC;
-}
+std::error_code Input::error() { return EC; }
// Pin the vtables to this file.
void Input::HNode::anchor() {}
@@ -90,8 +89,8 @@ bool Input::setCurrentDocument() {
return false;
}
-void Input::nextDocument() {
- ++DocIterator;
+bool Input::nextDocument() {
+ return ++DocIterator != Strm->end();
}
bool Input::mapTag(StringRef Tag, bool Default) {
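Returning bool from nextDocument() lets callers drive a multi-document loop directly instead of advancing blindly and re-testing setCurrentDocument(). The `return ++It != End` shape in isolation:

template <typename Iter>
bool advanceDocument(Iter &It, Iter End) {
  return ++It != End; // true while another document remains
}
// Typical driver: do { process(*It); } while (advanceDocument(It, End));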
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index f55838e..f7c213a 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -22,10 +22,10 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/system_error.h"
#include <cctype>
#include <cerrno>
#include <sys/stat.h>
+#include <system_error>
// <fcntl.h> may provide O_BINARY.
#if defined(HAVE_FCNTL_H)
@@ -450,7 +450,7 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
return;
}
- error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
+ std::error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
if (EC) {
ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " +
diff --git a/lib/Support/regcclass.h b/lib/Support/regcclass.h
index 2cea3e4..7fd6604 100644
--- a/lib/Support/regcclass.h
+++ b/lib/Support/regcclass.h
@@ -37,6 +37,9 @@
* @(#)cclass.h 8.3 (Berkeley) 3/20/94
*/
+#ifndef LLVM_SUPPORT_REGCCLASS_H
+#define LLVM_SUPPORT_REGCCLASS_H
+
/* character-class table */
static struct cclass {
const char *name;
@@ -68,3 +71,5 @@ static struct cclass {
""} ,
{ NULL, 0, "" }
};
+
+#endif
diff --git a/lib/Support/regcname.h b/lib/Support/regcname.h
index 3c0bb24..891d255 100644
--- a/lib/Support/regcname.h
+++ b/lib/Support/regcname.h
@@ -35,6 +35,9 @@
* @(#)cname.h 8.3 (Berkeley) 3/20/94
*/
+#ifndef LLVM_SUPPORT_REGCNAME_H
+#define LLVM_SUPPORT_REGCNAME_H
+
/* character-name table */
static struct cname {
const char *name;
@@ -137,3 +140,5 @@ static struct cname {
{ "DEL", '\177' },
{ NULL, 0 }
};
+
+#endif
diff --git a/lib/Support/regex2.h b/lib/Support/regex2.h
index 21659c3..d81bfbc 100644
--- a/lib/Support/regex2.h
+++ b/lib/Support/regex2.h
@@ -35,6 +35,9 @@
* @(#)regex2.h 8.4 (Berkeley) 3/20/94
*/
+#ifndef LLVM_SUPPORT_REGEX2_H
+#define LLVM_SUPPORT_REGEX2_H
+
/*
* internals of regex_t
*/
@@ -155,3 +158,5 @@ struct re_guts {
/* misc utilities */
#define OUT (CHAR_MAX+1) /* a non-character value */
#define ISWORD(c) (isalnum(c&0xff) || (c) == '_')
+
+#endif
diff --git a/lib/Support/regutils.h b/lib/Support/regutils.h
index d0ee100..49a975c 100644
--- a/lib/Support/regutils.h
+++ b/lib/Support/regutils.h
@@ -35,6 +35,9 @@
* @(#)utils.h 8.3 (Berkeley) 3/20/94
*/
+#ifndef LLVM_SUPPORT_REGUTILS_H
+#define LLVM_SUPPORT_REGUTILS_H
+
/* utility definitions */
#define NC (CHAR_MAX - CHAR_MIN + 1)
typedef unsigned char uch;
@@ -51,3 +54,5 @@ typedef unsigned char uch;
#ifdef USEBCOPY
#define memmove(d, s, c) bcopy(s, d, c)
#endif
+
+#endif
diff --git a/lib/Support/system_error.cpp b/lib/Support/system_error.cpp
deleted file mode 100644
index 299f54a..0000000
--- a/lib/Support/system_error.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-//===---------------------- system_error.cpp ------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This was lifted from libc++ and modified for C++03.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/system_error.h"
-#include "llvm/Support/Errno.h"
-#include <cstring>
-#include <string>
-
-namespace llvm {
-
-// class error_category
-
-error_category::error_category() {
-}
-
-error_category::~error_category() {
-}
-
-error_condition
-error_category::default_error_condition(int ev) const {
- return error_condition(ev, *this);
-}
-
-bool
-error_category::equivalent(int code, const error_condition& condition) const {
- return default_error_condition(code) == condition;
-}
-
-bool
-error_category::equivalent(const error_code& code, int condition) const {
- return *this == code.category() && code.value() == condition;
-}
-
-std::string
-_do_message::message(int ev) const {
- return std::string(sys::StrError(ev));
-}
-
-class _generic_error_category : public _do_message {
-public:
- const char* name() const override;
- std::string message(int ev) const override;
-};
-
-const char*
-_generic_error_category::name() const {
- return "generic";
-}
-
-std::string
-_generic_error_category::message(int ev) const {
-#ifdef ELAST
- if (ev > ELAST)
- return std::string("unspecified generic_category error");
-#endif // ELAST
- return _do_message::message(ev);
-}
-
-const error_category&
-generic_category() {
- static _generic_error_category s;
- return s;
-}
-
-class _system_error_category : public _do_message {
-public:
- const char* name() const override;
- std::string message(int ev) const override;
- error_condition default_error_condition(int ev) const override;
-};
-
-const char*
-_system_error_category::name() const {
- return "system";
-}
-
-// std::string _system_error_category::message(int ev) const {
-// Is in Platform/system_error.inc
-
-// error_condition _system_error_category::default_error_condition(int ev) const
-// Is in Platform/system_error.inc
-
-const error_category&
-system_category() {
- static _system_error_category s;
- return s;
-}
-
-const error_category&
-posix_category() {
-#ifdef LLVM_ON_WIN32
- return generic_category();
-#else
- return system_category();
-#endif
-}
-
-// error_condition
-
-std::string
-error_condition::message() const {
- return _cat_->message(_val_);
-}
-
-// error_code
-
-std::string
-error_code::message() const {
- return _cat_->message(_val_);
-}
-
-} // end namespace llvm
-
-// Include the truly platform-specific parts of this class.
-#if defined(LLVM_ON_UNIX)
-#include "Unix/system_error.inc"
-#endif
-#if defined(LLVM_ON_WIN32)
-#include "Windows/system_error.inc"
-#endif
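Everything the deleted file implemented by hand now comes from <system_error>: the categories are library singletons, message() formats the value, and comparison against std::errc goes through default_error_condition(). A self-contained sketch:

#include <cerrno>
#include <iostream>
#include <system_error>
int main() {
  std::error_code EC(ENOENT, std::generic_category());
  std::cout << EC.category().name() << ": " << EC.message() << "\n";
  return EC == std::errc::no_such_file_or_directory ? 0 : 1;
}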
diff --git a/lib/TableGen/Android.mk b/lib/TableGen/Android.mk
index 1f01ef7..0fd94bb 100644
--- a/lib/TableGen/Android.mk
+++ b/lib/TableGen/Android.mk
@@ -4,6 +4,7 @@ libtablegen_SRC_FILES := \
Error.cpp \
Main.cpp \
Record.cpp \
+ SetTheory.cpp \
StringMatcher.cpp \
TableGenBackend.cpp \
TGLexer.cpp \
diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt
index 935d674..fb70218 100644
--- a/lib/TableGen/CMakeLists.txt
+++ b/lib/TableGen/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMTableGen
Error.cpp
Main.cpp
Record.cpp
+ SetTheory.cpp
StringMatcher.cpp
TableGenBackend.cpp
TGLexer.cpp
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index 476026d..e317fbf 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -20,12 +20,12 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/system_error.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Main.h"
#include "llvm/TableGen/Record.h"
#include <algorithm>
#include <cstdio>
+#include <system_error>
using namespace llvm;
namespace {
@@ -81,14 +81,14 @@ int TableGenMain(char *argv0, TableGenMainFn *MainFn) {
RecordKeeper Records;
// Parse the input file.
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec =
- MemoryBuffer::getFileOrSTDIN(InputFilename, File)) {
- errs() << "Could not open input file '" << InputFilename << "': "
- << ec.message() <<"\n";
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(InputFilename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ errs() << "Could not open input file '" << InputFilename
+ << "': " << EC.message() << "\n";
return 1;
}
- MemoryBuffer *F = File.release();
+ MemoryBuffer *F = FileOrErr.get().release();
// Tell SrcMgr about this buffer, which is what TGParser will pick up.
SrcMgr.AddNewSourceBuffer(F, SMLoc());
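The ErrorOr<> pattern adopted above carries the buffer and the failure code in one return value, so there is no out-parameter left half-initialized on error. A fragment mirroring the hunk; reportError() is a placeholder for whatever the caller does with the message:

ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
    MemoryBuffer::getFileOrSTDIN(InputFilename);
if (std::error_code EC = FileOrErr.getError())
  return reportError(EC.message());
std::unique_ptr<MemoryBuffer> F = std::move(FileOrErr.get());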
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index c553a21..f7843dc 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -811,20 +811,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
}
case HEAD: {
if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
- if (LHSl->getSize() == 0) {
- assert(0 && "Empty list in car");
- return nullptr;
- }
+ assert(LHSl->getSize() != 0 && "Empty list in car");
return LHSl->getElement(0);
}
break;
}
case TAIL: {
if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
- if (LHSl->getSize() == 0) {
- assert(0 && "Empty list in cdr");
- return nullptr;
- }
+ assert(LHSl->getSize() != 0 && "Empty list in cdr");
// Note the +1. We can't just pass the result of getValues()
// directly.
ArrayRef<Init *>::iterator begin = LHSl->getValues().begin()+1;
diff --git a/utils/TableGen/SetTheory.cpp b/lib/TableGen/SetTheory.cpp
index 5ead7ed..c99c2ba 100644
--- a/utils/TableGen/SetTheory.cpp
+++ b/lib/TableGen/SetTheory.cpp
@@ -12,10 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "SetTheory.h"
#include "llvm/Support/Format.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
using namespace llvm;
diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp
index 1ec2eea..fc1d3ca 100644
--- a/lib/TableGen/TGLexer.cpp
+++ b/lib/TableGen/TGLexer.cpp
@@ -27,9 +27,9 @@
using namespace llvm;
TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
- CurBuffer = 0;
- CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
- CurPtr = CurBuf->getBufferStart();
+ CurBuffer = SrcMgr.getMainFileID();
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
+ CurPtr = CurBuf.begin();
TokStart = nullptr;
}
@@ -52,7 +52,7 @@ int TGLexer::getNextChar() {
case 0: {
// A nul character in the stream is either the end of the current buffer or
// a random nul in the file. Disambiguate that here.
- if (CurPtr-1 != CurBuf->getBufferEnd())
+ if (CurPtr-1 != CurBuf.end())
return 0; // Just whitespace.
// If this is the end of an included file, pop the parent file off the
@@ -60,7 +60,7 @@ int TGLexer::getNextChar() {
SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
if (ParentIncludeLoc != SMLoc()) {
CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
- CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
CurPtr = ParentIncludeLoc.getPointer();
return getNextChar();
}
@@ -187,7 +187,7 @@ tgtok::TokKind TGLexer::LexString() {
while (*CurPtr != '"') {
// If we hit the end of the buffer, report an error.
- if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
+ if (*CurPtr == 0 && CurPtr == CurBuf.end())
return ReturnError(StrStart, "End of file in string literal");
if (*CurPtr == '\n' || *CurPtr == '\r')
@@ -220,7 +220,7 @@ tgtok::TokKind TGLexer::LexString() {
// If we hit the end of the buffer, report an error.
case '\0':
- if (CurPtr == CurBuf->getBufferEnd())
+ if (CurPtr == CurBuf.end())
return ReturnError(StrStart, "End of file in string literal");
// FALL THROUGH
default:
@@ -304,7 +304,7 @@ bool TGLexer::LexInclude() {
CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
IncludedFile);
- if (CurBuffer == -1) {
+ if (!CurBuffer) {
PrintError(getLoc(), "Could not find include file '" + Filename + "'");
return true;
}
@@ -319,8 +319,8 @@ bool TGLexer::LexInclude() {
}
Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
// Save the line number and lex buffer of the includer.
- CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
- CurPtr = CurBuf->getBufferStart();
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
+ CurPtr = CurBuf.begin();
return false;
}
@@ -333,7 +333,7 @@ void TGLexer::SkipBCPLComment() {
return; // Newline is end of comment.
case 0:
// If this is the end of the buffer, end the comment.
- if (CurPtr == CurBuf->getBufferEnd())
+ if (CurPtr == CurBuf.end())
return;
break;
}
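Switching CurBuf from const MemoryBuffer* to StringRef turns every bounds check into a begin()/end() comparison and drops the MemoryBuffer dependency from the header. The check in isolation:

#include "llvm/ADT/StringRef.h"
bool atBufferEnd(llvm::StringRef Buf, const char *Ptr) {
  return Ptr == Buf.end(); // end() is begin() + size()
}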
diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h
index 1e599f8..a2c95ca 100644
--- a/lib/TableGen/TGLexer.h
+++ b/lib/TableGen/TGLexer.h
@@ -14,6 +14,7 @@
#ifndef TGLEXER_H
#define TGLEXER_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
@@ -21,7 +22,6 @@
#include <string>
namespace llvm {
-class MemoryBuffer;
class SourceMgr;
class SMLoc;
class Twine;
@@ -63,7 +63,7 @@ class TGLexer {
SourceMgr &SrcMgr;
const char *CurPtr;
- const MemoryBuffer *CurBuf;
+ StringRef CurBuf;
// Information about the current token.
const char *TokStart;
@@ -73,7 +73,7 @@ class TGLexer {
/// CurBuffer - This is the current buffer index we're lexing from as managed
/// by the SourceMgr object.
- int CurBuffer;
+ unsigned CurBuffer;
public:
typedef std::map<std::string, SMLoc> DependenciesMapTy;
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 038e018..0550692 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -360,8 +360,13 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
}
if (Records.getDef(IterRec->getNameInitAsString())) {
- Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
- return true;
+ // If this record is anonymous, it's no problem, just generate a new name
+ if (IterRec->isAnonymous())
+ IterRec->setName(GetNewAnonymousName());
+ else {
+ Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
+ return true;
+ }
}
Records.addDef(IterRec);
@@ -782,7 +787,7 @@ Init *TGParser::ParseIDValue(Record *CurRec,
///
/// Operation ::= XOperator ['<' Type '>'] '(' Args ')'
///
-Init *TGParser::ParseOperation(Record *CurRec) {
+Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
switch (Lex.getCode()) {
default:
TokError("unknown operation");
@@ -845,7 +850,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
StringRecTy *SType = dyn_cast<StringRecTy>(LHSt->getType());
if (!LType && !SType) {
- TokError("expected list or string type argumnet in unary operator");
+ TokError("expected list or string type argument in unary operator");
return nullptr;
}
}
@@ -853,7 +858,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
if (Code == UnOpInit::HEAD
|| Code == UnOpInit::TAIL) {
if (!LHSl && !LHSt) {
- TokError("expected list type argumnet in unary operator");
+ TokError("expected list type argument in unary operator");
return nullptr;
}
@@ -877,7 +882,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
assert(LHSt && "expected list type argument in unary operator");
ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
if (!LType) {
- TokError("expected list type argumnet in unary operator");
+ TokError("expected list type argument in unary operator");
return nullptr;
}
if (Code == UnOpInit::HEAD) {
@@ -1021,8 +1026,9 @@ Init *TGParser::ParseOperation(Record *CurRec) {
}
Lex.Lex(); // eat the ','
- Init *MHS = ParseValue(CurRec);
- if (!MHS) return nullptr;
+ Init *MHS = ParseValue(CurRec, ItemType);
+ if (!MHS)
+ return nullptr;
if (Lex.getCode() != tgtok::comma) {
TokError("expected ',' in ternary operator");
@@ -1030,8 +1036,9 @@ Init *TGParser::ParseOperation(Record *CurRec) {
}
Lex.Lex(); // eat the ','
- Init *RHS = ParseValue(CurRec);
- if (!RHS) return nullptr;
+ Init *RHS = ParseValue(CurRec, ItemType);
+ if (!RHS)
+ return nullptr;
if (Lex.getCode() != tgtok::r_paren) {
TokError("expected ')' in binary operator");
@@ -1441,7 +1448,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
case tgtok::XIf:
case tgtok::XForEach:
case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')'
- return ParseOperation(CurRec);
+ return ParseOperation(CurRec, ItemType);
}
}
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 6fd442a..9f4b7e9 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -181,7 +181,7 @@ private: // Parser methods.
std::vector<unsigned> ParseRangeList();
bool ParseRangePiece(std::vector<unsigned> &Ranges);
RecTy *ParseType();
- Init *ParseOperation(Record *CurRec);
+ Init *ParseOperation(Record *CurRec, RecTy *ItemType);
RecTy *ParseOperatorType();
Init *ParseObjectName(MultiClass *CurMultiClass);
Record *ParseClassID();
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 1ad5ac8..e6a27c3 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -60,6 +60,7 @@ def AArch64InstrInfo : InstrInfo;
// AArch64 Processors supported.
//
include "AArch64SchedA53.td"
+include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
@@ -89,7 +90,7 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
FeatureCRC]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
-def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 04906f6..ab2c4b7 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -214,8 +214,8 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
if (SExt->getType() != ConsideredSExtType)
return false;
- for (const Use &U : SExt->uses()) {
- if (isa<GetElementPtrInst>(*U))
+ for (const User *U : SExt->users()) {
+ if (isa<GetElementPtrInst>(U))
return true;
}
@@ -267,8 +267,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
}
// Now try to get through the chain of definitions.
- while (isa<Instruction>(SExt->getOperand(0))) {
- Instruction *Inst = dyn_cast<Instruction>(SExt->getOperand(0));
+ while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
// We cannot get through something that is not an Instruction
@@ -285,10 +284,10 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
// assertion on the type as all involved sext operation may have not
// been moved yet.
while (!Inst->use_empty()) {
- Value::use_iterator UseIt = Inst->use_begin();
- Instruction *UseInst = dyn_cast<Instruction>(*UseIt);
- assert(UseInst && "Use of sext is not an Instruction!");
- UseInst->setOperand(UseIt->getOperandNo(), SExt);
+ Use &U = *Inst->use_begin();
+ Instruction *User = dyn_cast<Instruction>(U.getUser());
+ assert(User && "User of sext is not an Instruction!");
+ User->setOperand(U.getOperandNo(), SExt);
}
ToRemove.insert(Inst);
SExt->setOperand(0, Inst->getOperand(0));
@@ -385,11 +384,11 @@ void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
if (ToRemove.count(Inst))
continue;
bool inserted = false;
- for (auto Pt : CurPts) {
+ for (auto &Pt : CurPts) {
if (DT.dominates(Inst, Pt)) {
DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
<< *Inst << '\n');
- (Pt)->replaceAllUsesWith(Inst);
+ Pt->replaceAllUsesWith(Inst);
ToRemove.insert(Pt);
Pt = Inst;
inserted = true;
@@ -436,7 +435,7 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
bool insert = false;
// #1.
- for (const Use &U : SExt->uses()) {
+ for (const User *U : SExt->users()) {
const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
if (Inst && Inst->getNumOperands() > 2) {
DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c3ee9bb..cd94e24 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -211,7 +211,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
const MachineOperand &MO = MI->getOperand(OpNum);
switch (MO.getType()) {
default:
- assert(0 && "<unknown operand type>");
+ llvm_unreachable("<unknown operand type>");
case MachineOperand::MO_Register: {
unsigned Reg = MO.getReg();
assert(TargetRegisterInfo::isPhysicalRegister(Reg));
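llvm_unreachable() is the right spelling for a can't-happen default: assert(0 && ...) vanishes under NDEBUG and lets control fall out of the switch, whereas llvm_unreachable() documents the invariant and still traps (or enables optimization) in release builds. A minimal sketch:

#include "llvm/Support/ErrorHandling.h"
enum class OperandKind { Register, Immediate };
unsigned encodingOf(OperandKind K) {
  switch (K) {
  case OperandKind::Register:  return 0;
  case OperandKind::Immediate: return 1;
  }
  llvm_unreachable("covered switch did not return"); // never compiled out
}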
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index 5209452..484e7e8 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -291,7 +291,7 @@ static bool isConditionalBranch(unsigned Opc) {
static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
- assert(0 && "unexpected opcode!");
+ llvm_unreachable("unexpected opcode!");
case AArch64::TBZW:
case AArch64::TBNZW:
case AArch64::TBZX:
@@ -309,7 +309,7 @@ static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
static unsigned getOppositeConditionOpcode(unsigned Opc) {
switch (Opc) {
default:
- assert(0 && "unexpected opcode!");
+ llvm_unreachable("unexpected opcode!");
case AArch64::TBNZW: return AArch64::TBZW;
case AArch64::TBNZX: return AArch64::TBZX;
case AArch64::TBZW: return AArch64::TBNZW;
@@ -325,7 +325,7 @@ static unsigned getOppositeConditionOpcode(unsigned Opc) {
static unsigned getBranchDisplacementBits(unsigned Opc) {
switch (Opc) {
default:
- assert(0 && "unexpected opcode!");
+ llvm_unreachable("unexpected opcode!");
case AArch64::TBNZW:
case AArch64::TBZW:
case AArch64::TBNZX:
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index ded2e17..8e8bd3d 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -18,9 +18,6 @@ class CCIfAlign<string Align, CCAction A> :
class CCIfBigEndian<CCAction A> :
CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
-class CCIfUnallocated<string Reg, CCAction A> :
- CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>;
-
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
//===----------------------------------------------------------------------===//
@@ -45,7 +42,7 @@ def CC_AArch64_AAPCS : CallingConv<[
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
- CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
// i128 is split to two i64s, we can't fit half to register X7.
@@ -120,7 +117,7 @@ def CC_AArch64_DarwinPCS : CallingConv<[
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
- CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
[X0, X1, X2, X3, X4, X5, X6, X7]>>,
// i128 is split to two i64s, we can't fit half to register X7.
@@ -143,8 +140,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
- CCIfType<[i1, i8], CCAssignToStack<1, 1>>,
- CCIfType<[i16], CCAssignToStack<2, 2>>,
+ CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
+ CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
CCAssignToStack<8, 8>>,
@@ -172,12 +169,11 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// 32bit quantity as undef.
def CC_AArch64_WebKit_JS : CallingConv<[
// Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
- CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType<i32>>>,
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
// Pass the remaining arguments on the stack instead.
- CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64], CCAssignToStack<8, 8>>
]>;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index c3b5369..2164d77 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -240,21 +240,15 @@ unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
}
unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
- // We can't handle thread-local variables quickly yet. Unfortunately we have
- // to peer through any aliases to find out if that rule applies.
- const GlobalValue *TLSGV = GV;
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- TLSGV = GA->getAliasee();
+ // We can't handle thread-local variables quickly yet.
+ if (GV->isThreadLocal())
+ return 0;
// MachO still uses GOT for large code-model accesses, but ELF requires
// movz/movk sequences, which FastISel doesn't handle yet.
if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
return 0;
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV))
- if (GVar->isThreadLocal())
- return 0;
-
unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
EVT DestEVT = TLI.getValueType(GV->getType(), true);
@@ -469,11 +463,18 @@ bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
break;
}
- // FIXME: If this is a stack pointer and the offset needs to be simplified
- // then put the alloca address into a register, set the base type back to
- // register and continue. This should almost never happen.
+ // If this is a stack pointer and the offset needs to be simplified then put
+ // the alloca address into a register, set the base type back to register and
+ // continue. This should almost never happen.
if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
- return false;
+ unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+ ResultReg)
+ .addFrameIndex(Addr.getFI())
+ .addImm(0)
+ .addImm(0);
+ Addr.setKind(Address::RegBase);
+ Addr.setReg(ResultReg);
}
// Since the offset is too large for the load/store instruction get the
@@ -1224,7 +1225,6 @@ bool AArch64FastISel::ProcessCallArgs(
Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
if (Arg == 0)
return false;
- ArgVT = DestVT;
break;
}
case CCValAssign::AExt:
@@ -1235,7 +1235,6 @@ bool AArch64FastISel::ProcessCallArgs(
Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
if (Arg == 0)
return false;
- ArgVT = DestVT;
break;
}
default:
@@ -1254,7 +1253,7 @@ bool AArch64FastISel::ProcessCallArgs(
assert(VA.isMemLoc() && "Assuming store on stack.");
// Need to store on the stack.
- unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+ unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
unsigned BEAlign = 0;
if (ArgSize < 8 && !Subtarget->isLittleEndian())
@@ -1468,10 +1467,12 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
bool RV;
unsigned ResultReg;
RV = EmitLoad(VT, ResultReg, Src);
- assert(RV == true && "Should be able to handle this load.");
+ if (!RV)
+ return false;
+
RV = EmitStore(VT, ResultReg, Dest);
- assert(RV == true && "Should be able to handle this store.");
- (void)RV;
+ if (!RV)
+ return false;
int64_t Size = VT.getSizeInBits() / 8;
Len -= Size;
@@ -1749,6 +1750,17 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
bool isZExt) {
assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+
+ // FastISel does not have plumbing to deal with extensions where the SrcVT or
+ // DestVT are odd things, so test to make sure that they are both types we can
+ // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
+ // bail out to SelectionDAG.
+ if (((DestVT != MVT::i8) && (DestVT != MVT::i16) &&
+ (DestVT != MVT::i32) && (DestVT != MVT::i64)) ||
+ ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) &&
+ (SrcVT != MVT::i16) && (SrcVT != MVT::i32)))
+ return 0;
+
unsigned Opc;
unsigned Imm = 0;
@@ -1895,6 +1907,7 @@ bool AArch64FastISel::SelectMul(const Instruction *I) {
case MVT::i32:
ZReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
+ SrcVT = MVT::i32;
break;
case MVT::i64:
ZReg = AArch64::XZR;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index deb306a..9c33717 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -158,7 +158,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const AArch64InstrInfo *TII = TM.getInstrInfo();
+ const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
@@ -204,8 +204,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
- const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo();
- const AArch64InstrInfo *TII = TM.getInstrInfo();
+ const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 0e00d16..7686e6f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -18,18 +18,11 @@
namespace llvm {
-class AArch64Subtarget;
-class AArch64TargetMachine;
-
class AArch64FrameLowering : public TargetFrameLowering {
- const AArch64TargetMachine &TM;
-
public:
- explicit AArch64FrameLowering(const AArch64TargetMachine &TM,
- const AArch64Subtarget &STI)
+ explicit AArch64FrameLowering()
: TargetFrameLowering(StackGrowsDown, 16, 0, 16,
- false /*StackRealignable*/),
- TM(TM) {}
+ false /*StackRealignable*/) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7007ffc..3f49fab 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -153,9 +153,6 @@ public:
SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
- SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
-
SDNode *SelectBitfieldExtractOp(SDNode *N);
SDNode *SelectBitfieldInsertOp(SDNode *N);
@@ -596,8 +593,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
const GlobalValue *GV = GAN->getGlobal();
unsigned Alignment = GV->getAlignment();
const DataLayout *DL = TLI->getDataLayout();
- if (Alignment == 0 && !Subtarget->isTargetDarwin())
- Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
+ Type *Ty = GV->getType()->getElementType();
+ if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin())
+ Alignment = DL->getABITypeAlignment(Ty);
if (Alignment >= Size)
return true;
@@ -2111,7 +2109,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
.getVectorElementType()
.getSizeInBits()) {
default:
- assert(0 && "Unexpected vector element type!");
+ llvm_unreachable("Unexpected vector element type!");
case 64:
SubReg = AArch64::dsub;
break;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80d6669..28d0035 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -67,15 +67,15 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
//===----------------------------------------------------------------------===//
// AArch64 Lowering public interface.
//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
- if (TM.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO())
return new AArch64_MachoTargetObjectFile();
return new AArch64_ELFTargetObjectFile();
}
-AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)) {
+AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
@@ -627,7 +627,7 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
// FIXME: On AArch64, this depends on the type.
- // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes().
+ // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
// and the offset has to be a multiple of the related size in bytes.
return 4095;
}
@@ -823,8 +823,7 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
#ifndef NDEBUG
MI->dump();
#endif
- assert(0 && "Unexpected instruction for custom inserter!");
- break;
+ llvm_unreachable("Unexpected instruction for custom inserter!");
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
@@ -833,7 +832,6 @@ AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
}
- llvm_unreachable("Unexpected instruction for custom inserter!");
}
//===----------------------------------------------------------------------===//
@@ -1273,7 +1271,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
bool ExtraOp = false;
switch (Op.getOpcode()) {
default:
- assert(0 && "Invalid code");
+ llvm_unreachable("Invalid code");
case ISD::ADDC:
Opc = AArch64ISD::ADDS;
break;
@@ -1387,24 +1385,22 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
- // FP_TO_XINT conversion from the same type are legal.
- if (VT.getSizeInBits() == InVT.getSizeInBits())
- return Op;
-
- if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
+ if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
- } else if (InVT == MVT::v2f32) {
+ }
+
+ if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
// Type changing conversions are illegal.
- return SDValue();
+ return Op;
}
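
Taken together, the rewritten LowerVectorFP_TO_INT keys on relative vector widths instead of the previous hard-coded input types. The two non-trivial shapes it builds, shown with assumed concrete types:

//   Narrowing, e.g. v2f64 -> v2i32: convert in the wide integer type, then
//   truncate the lanes:
//     Cv = FP_TO_SINT v2f64 X  -> v2i64
//     R  = TRUNCATE   v2i64 Cv -> v2i32
//   Widening, e.g. v2f32 -> v2i64: extend the FP lanes first, then convert:
//     Ext = FP_EXTEND  v2f32 X   -> v2f64
//     R   = FP_TO_SINT v2f64 Ext -> v2i64
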
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
@@ -1440,32 +1436,23 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
- // v2i32 to v2f32 is legal.
- if (VT == MVT::v2f32 && InVT == MVT::v2i32)
- return Op;
-
- // This function only handles v2f64 outputs.
- if (VT == MVT::v2f64) {
- // Extend the input argument to a v2i64 that we can feed into the
- // floating point conversion. Zero or sign extend based on whether
- // we're doing a signed or unsigned float conversion.
- unsigned Opc =
- Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
- assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
- SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
- return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
+ if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+ MVT CastVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
+ InVT.getVectorNumElements());
+ In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+ return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
}
- // Scalarize v2i64 to v2f32 conversions.
- std::vector<SDValue> BuildVectorOps;
- for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
- SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
- DAG.getConstant(i, MVT::i64));
- Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
- BuildVectorOps.push_back(Sclr);
+ if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+ unsigned CastOpc =
+ Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ EVT CastVT = VT.changeVectorElementTypeToInteger();
+ In = DAG.getNode(CastOpc, dl, CastVT, In);
+ return DAG.getNode(Op.getOpcode(), dl, VT, In);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps);
+ return Op;
}
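
LowerVectorINT_TO_FP gets the symmetric treatment: a narrower result converts in a wide FP type and rounds down, a wider result extends the integer lanes first. With assumed concrete types:

//   Narrowing, e.g. v2i64 -> v2f32:
//     Cv = SINT_TO_FP v2i64 X  -> v2f64  (CastVT: FP lanes as wide as the input)
//     R  = FP_ROUND   v2f64 Cv -> v2f32
//   Widening, e.g. v2i32 -> v2f64 (ZERO_EXTEND instead for UINT_TO_FP):
//     Ext = SIGN_EXTEND v2i32 X   -> v2i64
//     R   = SINT_TO_FP  v2i64 Ext -> v2f64
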
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
@@ -1516,7 +1503,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0);
+ .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -1711,7 +1698,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
InVals.push_back(FrameIdxN);
continue;
- } if (VA.isRegLoc()) {
+ }
+
+ if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
@@ -1772,10 +1761,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
SDValue ArgValue;
+ // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ MVT MemVT = VA.getValVT();
+
switch (VA.getLocInfo()) {
default:
break;
+ case CCValAssign::BCvt:
+ MemVT = VA.getLocVT();
+ break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -1787,10 +1782,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
break;
}
- ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN,
+ ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
- VA.getLocVT(),
- false, false, false, 0);
+ MemVT, false, false, false, 0);
InVals.push_back(ArgValue);
}
@@ -2339,11 +2333,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
- if (Arg.getValueType().isSimple() &&
- Arg.getValueType().getSimpleVT() == MVT::i32 &&
- (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
- VA.getLocVT() == MVT::i16))
- Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+ if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
+ VA.getValVT() == MVT::i16)
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store =
DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
@@ -4116,6 +4108,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
@@ -4164,35 +4157,47 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
int VEXTOffsets[2] = { 0, 0 };
+ int OffsetMultipliers[2] = { 1, 1 };
// This loop extracts the usage patterns of the source vectors
// and prepares appropriate SDValues for a shuffle if possible.
for (unsigned i = 0; i < SourceVecs.size(); ++i) {
- if (SourceVecs[i].getValueType() == VT) {
+ unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
+ SDValue CurSource = SourceVecs[i];
+ if (SourceVecs[i].getValueType().getVectorElementType() !=
+ VT.getVectorElementType()) {
+ // We may hit this case if SourceVecs[i] is an AssertSext/AssertZext node.
+ // Bitcast it to a vector of the asserted element type, and record the
+ // element-width multiplier between SourceVecs and the BUILD_VECTOR, which
+ // is needed to extract the correct lanes later.
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ SourceVecs[i].getValueSizeInBits() /
+ VT.getVectorElementType().getSizeInBits());
+
+ CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
+ OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
+ NumSrcElts *= OffsetMultipliers[i];
+ MaxElts[i] *= OffsetMultipliers[i];
+ MinElts[i] *= OffsetMultipliers[i];
+ }
+
+ if (CurSource.getValueType() == VT) {
// No VEXT necessary
- ShuffleSrcs[i] = SourceVecs[i];
+ ShuffleSrcs[i] = CurSource;
VEXTOffsets[i] = 0;
continue;
- } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+ } else if (NumSrcElts < NumElts) {
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
- ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
- DAG.getUNDEF(SourceVecs[i].getValueType()));
+ ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
+ DAG.getUNDEF(CurSource.getValueType()));
continue;
}
- // Don't attempt to extract subvectors from BUILD_VECTOR sources
- // that expand or trunc the original value.
- // TODO: We can try to bitcast and ANY_EXTEND the result but
- // we need to consider the cost of vector ANY_EXTEND, and the
- // legality of all the types.
- if (SourceVecs[i].getValueType().getVectorElementType() !=
- VT.getVectorElementType())
- return SDValue();
-
// Since only 64-bit and 128-bit vectors are legal on ARM and
// we've eliminated the other cases...
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
+ assert(NumSrcElts == 2 * NumElts &&
"unexpected vector sizes in ReconstructShuffle");
if (MaxElts[i] - MinElts[i] >= NumElts) {
@@ -4203,22 +4208,20 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
if (MinElts[i] >= NumElts) {
// The extraction can just take the second half
VEXTOffsets[i] = NumElts;
- ShuffleSrcs[i] =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
- DAG.getIntPtrConstant(NumElts));
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+ DAG.getIntPtrConstant(NumElts));
} else if (MaxElts[i] < NumElts) {
// The extraction can just take the first half
VEXTOffsets[i] = 0;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i], DAG.getIntPtrConstant(0));
+ ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+ DAG.getIntPtrConstant(0));
} else {
// An actual VEXT is needed
VEXTOffsets[i] = MinElts[i];
- SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- SourceVecs[i], DAG.getIntPtrConstant(0));
- SDValue VEXTSrc2 =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
- DAG.getIntPtrConstant(NumElts));
+ SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+ DAG.getIntPtrConstant(0));
+ SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+ DAG.getIntPtrConstant(NumElts));
unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
DAG.getConstant(Imm, MVT::i32));
@@ -4238,9 +4241,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int ExtractElt =
cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
if (ExtractVec == SourceVecs[0]) {
- Mask.push_back(ExtractElt - VEXTOffsets[0]);
+ Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
} else {
- Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+ Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
+ VEXTOffsets[1]);
}
}
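
With OffsetMultipliers in play, the mask is expressed in lanes of the bitcast source. A concrete case, assuming a v16i8 BUILD_VECTOR fed by a 128-bit v8i16 source wrapped in AssertZext:

//   CastVT = v16i8, OffsetMultipliers[0] = 16 / 8 = 2, so a 16-bit lane k
//   extracted from the source contributes byte lane 2*k to the mask:
//     Mask.push_back(k * 2 - VEXTOffsets[0]);
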
@@ -5177,11 +5181,37 @@ FailedModImm:
return Op;
}
+// Normalize the operands of BUILD_VECTOR. The values of constant operands will
+// be truncated to fit the element width.
+static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT EltTy = VT.getVectorElementType();
+
+ if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
+ return Op;
+
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
+ SDValue Lane = Op.getOperand(I);
+ if (Lane.getOpcode() == ISD::Constant) {
+ APInt LowBits(EltTy.getSizeInBits(),
+ cast<ConstantSDNode>(Lane)->getZExtValue());
+ Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
+ }
+ Ops.push_back(Lane);
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
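
NormalizeBuildVector exists because i8/i16 BUILD_VECTOR lanes are carried as i32 operands and may hold constants wider than the element; the APInt round-trip above reduces each constant modulo 2^EltBits. The arithmetic, as a hypothetical helper (EltBits <= 16 here, per the early-out):

uint32_t truncToElt(uint32_t Val, unsigned EltBits) {
  // Same result as APInt(EltBits, Val).getZExtValue() for EltBits < 32:
  // keep only the low EltBits bits, e.g. 0x1FF in a v8i8 lane becomes 0xFF.
  return Val & ((1u << EltBits) - 1u);
}
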
+
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
SDLoc dl(Op);
EVT VT = Op.getValueType();
+ Op = NormalizeBuildVector(Op, DAG);
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
@@ -6047,18 +6077,14 @@ bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
- if (NumBits1 <= NumBits2)
- return false;
- return true;
+ return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
- if (!VT1.isInteger() || !VT2.isInteger())
+ if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
- if (NumBits1 <= NumBits2)
- return false;
- return true;
+ return NumBits1 > NumBits2;
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -6068,18 +6094,14 @@ bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
- if (NumBits1 == 32 && NumBits2 == 64)
- return true;
- return false;
+ return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
- if (!VT1.isInteger() || !VT2.isInteger())
+ if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
- if (NumBits1 == 32 && NumBits2 == 64)
- return true;
- return false;
+ return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
@@ -6092,8 +6114,9 @@ bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
// 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
- return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
- VT2.isInteger() && VT1.getSizeInBits() <= 32);
+ return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
+ VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
+ VT1.getSizeInBits() <= 32);
}
bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
@@ -6346,23 +6369,45 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
APInt Value = C->getAPIntValue();
EVT VT = N->getValueType(0);
- APInt VP1 = Value + 1;
- if (VP1.isPowerOf2()) {
- // Multiplying by one less than a power of two, replace with a shift
- // and a subtract.
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
- DAG.getConstant(VP1.logBase2(), MVT::i64));
- return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
- }
- APInt VM1 = Value - 1;
- if (VM1.isPowerOf2()) {
- // Multiplying by one more than a power of two, replace with a shift
- // and an add.
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
- DAG.getConstant(VM1.logBase2(), MVT::i64));
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ if (Value.isNonNegative()) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ APInt VM1 = Value - 1;
+ if (VM1.isPowerOf2()) {
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VM1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
+ N->getOperand(0));
+ }
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ APInt VP1 = Value + 1;
+ if (VP1.isPowerOf2()) {
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VP1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
+ N->getOperand(0));
+ }
+ } else {
+ // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+ APInt VNM1 = -Value - 1;
+ if (VNM1.isPowerOf2()) {
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VNM1.logBase2(), MVT::i64));
+ SDValue Add =
+ DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
+ }
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+ APInt VNP1 = -Value + 1;
+ if (VNP1.isPowerOf2()) {
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VNP1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
+ ShiftedVal);
+ }
}
}
return SDValue();
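
The extended combine now also strength-reduces negative constants of the form -(2^N +/- 1). The four identities it builds, spot-checked on plain integers (a standalone sketch, not part of the patch):

#include <cassert>
int main() {
  long x = 123;
  assert(x * 9  == ((x << 3) + x));  // 2^3 + 1  -> shl + add
  assert(x * 7  == ((x << 3) - x));  // 2^3 - 1  -> shl + sub
  assert(x * -9 == -((x << 3) + x)); // -(2^3+1) -> 0 - (shl + add)
  assert(x * -7 == x - (x << 3));    // -(2^3-1) -> sub from x
  return 0;
}
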
@@ -6687,7 +6732,7 @@ static SDValue tryCombineFixedPointConvert(SDNode *N,
else if (Vec.getValueType() == MVT::v2i64)
VecResTy = MVT::v2f64;
else
- assert(0 && "unexpected vector type!");
+ llvm_unreachable("unexpected vector type!");
SDValue Convert =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
@@ -7020,7 +7065,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
DAG.getConstant(-ShiftAmount, MVT::i32));
- else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
+ else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
DAG.getConstant(ShiftAmount, MVT::i32));
@@ -7867,6 +7912,18 @@ bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
return Inst->getType()->getPrimitiveSizeInBits() <= 128;
}
+TargetLoweringBase::LegalizeTypeAction
+AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
+ MVT SVT = VT.getSimpleVT();
+ // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
+ // to v8i8, v4i16, v2i32 and v2f32 instead of promoting them.
+ if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 ||
+ SVT == MVT::v1f32)
+ return TypeWidenVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index de16c4d..cb0b9ef 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -197,7 +197,7 @@ class AArch64TargetLowering : public TargetLowering {
bool RequireStrictAlign;
public:
- explicit AArch64TargetLowering(AArch64TargetMachine &TM);
+ explicit AArch64TargetLowering(TargetMachine &TM);
/// Selects the correct CCAssignFn for the given CallingConvention
/// value.
@@ -324,6 +324,9 @@ public:
bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
+
private:
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index d455d7e..5007172 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -448,13 +448,19 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(enc, MVT::i32);
}]>;
-def LogicalImm32Operand : AsmOperandClass {
- let Name = "LogicalImm32";
- let DiagnosticType = "LogicalSecondSource";
-}
-def LogicalImm64Operand : AsmOperandClass {
- let Name = "LogicalImm64";
- let DiagnosticType = "LogicalSecondSource";
+let DiagnosticType = "LogicalSecondSource" in {
+ def LogicalImm32Operand : AsmOperandClass {
+ let Name = "LogicalImm32";
+ }
+ def LogicalImm64Operand : AsmOperandClass {
+ let Name = "LogicalImm64";
+ }
+ def LogicalImm32NotOperand : AsmOperandClass {
+ let Name = "LogicalImm32Not";
+ }
+ def LogicalImm64NotOperand : AsmOperandClass {
+ let Name = "LogicalImm64Not";
+ }
}
def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
@@ -468,6 +474,12 @@ def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
let PrintMethod = "printLogicalImm64";
let ParserMatchClass = LogicalImm64Operand;
}
+def logical_imm32_not : Operand<i32> {
+ let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+ let ParserMatchClass = LogicalImm64NotOperand;
+}
// imm0_65535 predicate - True if the immediate is in the range [0,65535].
def Imm0_65535Operand : AsmImmRange<0, 65535>;
@@ -963,8 +975,14 @@ def ccode : Operand<i32> {
let ParserMatchClass = CondCode;
}
def inv_ccode : Operand<i32> {
+ // AL and NV are invalid in the aliases that use inv_ccode.
let PrintMethod = "printInverseCondCode";
let ParserMatchClass = CondCode;
+ let MCOperandPredicate = [{
+ return MCOp.isImm() &&
+ MCOp.getImm() != AArch64CC::AL &&
+ MCOp.getImm() != AArch64CC::NV;
+ }];
}
// Conditional branch target. 19-bit immediate. The low two bits of the target
@@ -1323,13 +1341,13 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
[(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
- Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+ Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
}
def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
[(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
- Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> {
+ Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
}
@@ -1339,7 +1357,7 @@ class WideMulAccum<bit isSub, bits<3> opc, string asm,
: BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
[(set GPR64:$Rd, (AccNode GPR64:$Ra,
(mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
- Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+ Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
@@ -1738,6 +1756,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp> {
WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+ def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+ WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+ def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+ XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
@@ -1925,22 +1947,32 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
: InstAlias<asm#" $dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
-let AddedComplexity = 6 in
-multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+ string Alias> {
+ let AddedComplexity = 6 in
def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
[(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
}
+ let AddedComplexity = 6 in
def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
[(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
logical_imm64:$imm))]> {
let Inst{31} = 1;
}
+
+ def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+ logical_imm32_not:$imm), 0>;
+ def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+ logical_imm64_not:$imm), 0>;
}
-multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+ string Alias> {
let isCompare = 1, Defs = [NZCV] in {
def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
@@ -1952,6 +1984,13 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
let Inst{31} = 1;
}
} // end Defs = [NZCV]
+
+ def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+ logical_imm32_not:$imm), 0>;
+ def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+ logical_imm64_not:$imm), 0>;
}
class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index ff115c0..ce85b2c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -35,8 +35,14 @@ AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
- const MCInstrDesc &Desc = MI->getDesc();
+ const MachineBasicBlock &MBB = *MI->getParent();
+ const MachineFunction *MF = MBB.getParent();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+ if (MI->getOpcode() == AArch64::INLINEASM)
+ return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+ const MCInstrDesc &Desc = MI->getDesc();
switch (Desc.getOpcode()) {
default:
// Anything not explicitly designated otherwise is a normal 4-byte insn.
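
For reference, getInlineAsmLength is a deliberately conservative upper bound: it counts the separator-delimited statements in the asm string and multiplies by the target's maximum instruction length (4 bytes on AArch64), e.g.:

//   GetInstSizeInBytes on INLINEASM "add x0, x0, #1\n\tret"
//     -> 2 statements * 4-byte MaxInstLength = 8 bytes
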
@@ -1224,7 +1230,7 @@ void AArch64InstrInfo::copyPhysRegTuple(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register copy without NEON");
const TargetRegisterInfo *TRI = &getRegisterInfo();
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
@@ -1385,7 +1391,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
- if(getSubTarget().hasNEON()) {
+ if (Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -1406,7 +1412,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
- if(getSubTarget().hasNEON()) {
+ if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
@@ -1423,7 +1429,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- if(getSubTarget().hasNEON()) {
+ if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
@@ -1440,7 +1446,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- if(getSubTarget().hasNEON()) {
+ if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
@@ -1461,7 +1467,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- if(getSubTarget().hasNEON()) {
+ if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR128RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
@@ -1577,39 +1583,39 @@ void AArch64InstrInfo::storeRegToStackSlot(
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::STRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d, Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Threev1d, Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Fourv1d, Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d, Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d, Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d, Offset = false;
}
@@ -1675,39 +1681,39 @@ void AArch64InstrInfo::loadRegFromStackSlot(
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d, Offset = false;
}
break;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Threev1d, Offset = false;
}
break;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Fourv1d, Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d, Offset = false;
}
break;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d, Offset = false;
}
break;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
- assert(getSubTarget().hasNEON() &&
+ assert(Subtarget.hasNEON() &&
"Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d, Offset = false;
}
@@ -1726,7 +1732,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg, int Offset,
- const AArch64InstrInfo *TII,
+ const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool SetNZCV) {
if (DestReg == SrcReg && Offset == 0)
return;
@@ -1835,7 +1841,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
*OutUnscaledOp = 0;
switch (MI.getOpcode()) {
default:
- assert(0 && "unhandled opcode in rewriteAArch64FrameIndex");
+ llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
// Vector spills/fills can't take an immediate offset.
case AArch64::LD1Twov2d:
case AArch64::LD1Threev2d:
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 90ce75f..f70b82b 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -44,8 +44,6 @@ public:
/// always be able to get register info as well (through this method).
const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
- const AArch64Subtarget &getSubTarget() const { return Subtarget; }
-
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
@@ -168,7 +166,7 @@ private:
/// if necessary, to be replaced by the scavenger at the end of PEI.
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
- const AArch64InstrInfo *TII,
+ const TargetInstrInfo *TII,
MachineInstr::MIFlag = MachineInstr::NoFlags,
bool SetNZCV = false);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 9ad36e8..1211fba 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -323,7 +323,7 @@ def : Pat<(AArch64LOADgot tconstpool:$addr),
// System instructions.
//===----------------------------------------------------------------------===//
-def HINT : HintI<"hint">;
+def HINT : HintI<"hint">;
def : InstAlias<"nop", (HINT 0b000)>;
def : InstAlias<"yield",(HINT 0b001)>;
def : InstAlias<"wfe", (HINT 0b010)>;
@@ -671,10 +671,10 @@ def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
//===----------------------------------------------------------------------===//
// (immediate)
-defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>;
-defm AND : LogicalImm<0b00, "and", and>;
-defm EOR : LogicalImm<0b10, "eor", xor>;
-defm ORR : LogicalImm<0b01, "orr", or>;
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
+defm AND : LogicalImm<0b00, "and", and, "bic">;
+defm EOR : LogicalImm<0b10, "eor", xor, "eon">;
+defm ORR : LogicalImm<0b01, "orr", or, "orn">;
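
The new Alias parameter wires up the inverted-immediate spellings: each *_not operand class accepts an immediate whose bitwise NOT is a valid logical immediate, so the assembler can accept, for example:

//   bic w0, w1, #0xf    // encoded as: and w0, w1, #0xfffffff0
//   orn x0, x1, #0xff   // encoded as: orr x0, x1, #0xffffffffffffff00
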
// FIXME: these aliases *are* canonical sometimes (when movz can't be
// used). Actually, it seems to be working right now, but putting logical_immXX
@@ -737,6 +737,10 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
defm CLS : OneOperandData<0b101, "cls">;
defm CLZ : OneOperandData<0b100, "clz", ctlz>;
defm RBIT : OneOperandData<0b000, "rbit">;
+
+def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
+def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+
def REV16Wr : OneWRegData<0b001, "rev16",
UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
@@ -2238,6 +2242,81 @@ def : Pat<(f32_to_f16 FPR32:$Rn),
def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
[(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
+// When converting from f16 coming directly from a load, make sure we
+// load into the FPR16 registers rather than going through the GPRs.
+// f16->f32
+def : Pat<(f32 (f16_to_f32 (i32
+ (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend))))),
+ (FCVTSHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>;
+def : Pat<(f32 (f16_to_f32 (i32
+ (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend))))),
+ (FCVTSHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>;
+def : Pat <(f32 (f16_to_f32 (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (FCVTSHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>;
+def : Pat <(f32 (f16_to_f32 (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (FCVTSHr (LDURHi GPR64sp:$Rn, simm9:$offset))>;
+
+// f16->f64
+def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32
+ (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend))))))),
+ (FCVTDHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>;
+def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32
+ (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend))))))),
+ (FCVTDHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>;
+def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32
+ (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))))),
+ (FCVTDHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>;
+def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32
+ (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))))),
+ (FCVTDHr (LDURHi GPR64sp:$Rn, simm9:$offset))>;
+
+// When converting to f16 going directly to a store, make sure we use the
+// appropriate direct conversion instructions and store via the FPR16
+// registers rather than going through the GPRs.
+let AddedComplexity = 10 in {
+// f32->f16
+def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
+ (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)),
+ (STRHroW (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)>;
+def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
+ (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)),
+ (STRHroX (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)>;
+def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (STURHi (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, simm9:$offset)>;
+// f64->f16
+def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
+ (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)),
+ (STRHroW (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend16:$extend)>;
+def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
+ (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)),
+ (STRHroX (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend16:$extend)>;
+def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
+ (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+ (STURHi (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, simm9:$offset)>;
+}
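
The net effect on half-precision access, sketched as the code these patterns select (illustrative, assuming a bare load-convert and a convert-store):

//   load f16, extend to f32:        truncate f32, store as f16:
//     ldr  h0, [x0]                   fcvt h1, s0
//     fcvt s0, h0                     str  h1, [x1]
// rather than round-tripping the half-precision value through a W register.
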
+
+
//===----------------------------------------------------------------------===//
// Floating point single operand instructions.
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index e7454be..3df9c4f 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -40,14 +40,13 @@ STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
-static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20),
- cl::Hidden);
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
+ cl::init(20), cl::Hidden);
// Placeholder while testing unscaled load/store combining
-static cl::opt<bool>
-EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden,
- cl::desc("Allow AArch64 unscaled load/store combining"),
- cl::init(true));
+static cl::opt<bool> EnableAArch64UnscaledMemOp(
+ "aarch64-unscaled-mem-op", cl::Hidden,
+ cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
namespace {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
@@ -60,19 +59,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
- // If a matching instruction is found, mergeForward is set to true if the
+ // If a matching instruction is found, MergeForward is set to true if the
// merge is to remove the first instruction and replace the second with
// a pair-wise insn, and false if the reverse is true.
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
- bool &mergeForward,
+ bool &MergeForward,
unsigned Limit);
// Merge the two instructions indicated into a single pair-wise instruction.
- // If mergeForward is true, erase the first instruction and fold its
+ // If MergeForward is true, erase the first instruction and fold its
// operation into the second. If false, the reverse. Return the instruction
// following the first instruction (which may change during processing).
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired, bool mergeForward);
+ MachineBasicBlock::iterator Paired, bool MergeForward);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
@@ -142,7 +141,7 @@ static bool isUnscaledLdst(unsigned Opc) {
int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
switch (MemMI->getOpcode()) {
default:
- llvm_unreachable("Opcode has has unknown size!");
+ llvm_unreachable("Opcode has unknown size!");
case AArch64::STRSui:
case AArch64::STURSi:
return 4;
@@ -217,16 +216,26 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no pre-indexed equivalent!");
- case AArch64::STRSui: return AArch64::STRSpre;
- case AArch64::STRDui: return AArch64::STRDpre;
- case AArch64::STRQui: return AArch64::STRQpre;
- case AArch64::STRWui: return AArch64::STRWpre;
- case AArch64::STRXui: return AArch64::STRXpre;
- case AArch64::LDRSui: return AArch64::LDRSpre;
- case AArch64::LDRDui: return AArch64::LDRDpre;
- case AArch64::LDRQui: return AArch64::LDRQpre;
- case AArch64::LDRWui: return AArch64::LDRWpre;
- case AArch64::LDRXui: return AArch64::LDRXpre;
+ case AArch64::STRSui:
+ return AArch64::STRSpre;
+ case AArch64::STRDui:
+ return AArch64::STRDpre;
+ case AArch64::STRQui:
+ return AArch64::STRQpre;
+ case AArch64::STRWui:
+ return AArch64::STRWpre;
+ case AArch64::STRXui:
+ return AArch64::STRXpre;
+ case AArch64::LDRSui:
+ return AArch64::LDRSpre;
+ case AArch64::LDRDui:
+ return AArch64::LDRDpre;
+ case AArch64::LDRQui:
+ return AArch64::LDRQpre;
+ case AArch64::LDRWui:
+ return AArch64::LDRWpre;
+ case AArch64::LDRXui:
+ return AArch64::LDRXpre;
}
}
@@ -260,7 +269,7 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- bool mergeForward) {
+ bool MergeForward) {
MachineBasicBlock::iterator NextI = I;
++NextI;
// If NextI is the second of the two instructions to be merged, we need
@@ -276,12 +285,12 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
// Insert our new paired instruction after whichever of the paired
- // instructions mergeForward indicates.
- MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I;
- // Also based on mergeForward is from where we copy the base register operand
+ // instructions MergeForward indicates.
+ MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+ // Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
MachineOperand &BaseRegOp =
- mergeForward ? Paired->getOperand(1) : I->getOperand(1);
+ MergeForward ? Paired->getOperand(1) : I->getOperand(1);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
@@ -355,8 +364,8 @@ static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
if (IsUnscaled) {
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
- int elemOffset = Offset / OffsetStride;
- if (elemOffset > 63 || elemOffset < -64)
+ int ElemOffset = Offset / OffsetStride;
+ if (ElemOffset > 63 || ElemOffset < -64)
return false;
}
return true;
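
The bounds follow from the paired instructions' signed 7-bit scaled immediate, i.e. element offsets in [-64, 63]. Worked through for an 8-byte unscaled store (illustrative numbers):

//   STURXi with byte Offset = -512, OffsetStride = 8:
//     ElemOffset = -512 / 8 = -64  -> in range, the pair can be formed
//   with byte Offset = 512: ElemOffset = 64 -> out of range, no STP
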
@@ -374,14 +383,14 @@ static int alignTo(int Num, int PowOf2) {
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
- bool &mergeForward, unsigned Limit) {
+ bool &MergeForward, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
MachineInstr *FirstMI = I;
++MBBI;
int Opc = FirstMI->getOpcode();
- bool mayLoad = FirstMI->mayLoad();
+ bool MayLoad = FirstMI->mayLoad();
bool IsUnscaled = isUnscaledLdst(Opc);
unsigned Reg = FirstMI->getOperand(0).getReg();
unsigned BaseReg = FirstMI->getOperand(1).getReg();
@@ -453,7 +462,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
- if (mayLoad && Reg == MI->getOperand(0).getReg()) {
+ if (MayLoad && Reg == MI->getOperand(0).getReg()) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
continue;
}
@@ -462,7 +471,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the two instructions, we can combine the second into the first.
if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
!UsedRegs[MI->getOperand(0).getReg()]) {
- mergeForward = false;
+ MergeForward = false;
return MBBI;
}
@@ -471,7 +480,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// second.
if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
!UsedRegs[FirstMI->getOperand(0).getReg()]) {
- mergeForward = true;
+ MergeForward = true;
return MBBI;
}
// Unable to combine these instructions due to interference in between.
@@ -798,14 +807,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
break;
}
// Look ahead up to ScanLimit instructions for a pairable instruction.
- bool mergeForward = false;
+ bool MergeForward = false;
MachineBasicBlock::iterator Paired =
- findMatchingInsn(MBBI, mergeForward, ScanLimit);
+ findMatchingInsn(MBBI, MergeForward, ScanLimit);
if (Paired != E) {
// Merge the loads into a pair. Keeping the iterator straight is a
// pain, so we let the merge routine tell us what the next instruction
// is after it's done mucking about.
- MBBI = mergePairedInsns(MBBI, Paired, mergeForward);
+ MBBI = mergePairedInsns(MBBI, Paired, MergeForward);
Modified = true;
++NumPairCreated;
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index ab6d375..75a17b9 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -51,7 +51,7 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
AArch64II::MO_PAGEOFF)
RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
else
- assert(0 && "Unexpected target flags with MO_GOT on GV operand");
+ llvm_unreachable("Unexpected target flags with MO_GOT on GV operand");
} else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) {
if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
@@ -154,7 +154,7 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
MCOperand &MCOp) const {
switch (MO.getType()) {
default:
- assert(0 && "unknown operand type");
+ llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
if (MO.isImplicit())
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 21c927f..a30e4ad 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -175,7 +175,7 @@ def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
// This is for indirect tail calls to store the address of the destination.
def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
X22, X23, X24, X25, X26,
- X27, X28)>;
+ X27, X28, FP, LR)>;
// GPR register classes for post increment amount of vector load/store that
// has alternate printing when Rm=31 and prints a constant immediate value
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 0c3949e..d709bee 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -148,9 +148,9 @@ def : ReadAdvance<ReadVLD, 0>;
// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
// operands are needed one cycle later if and only if they are to be
-// shifted. Otherwise, they too are needed two cycle later. This same
+// shifted. Otherwise, they too are needed two cycles later. This same
// ReadAdvance applies to Extended registers as well, even though there is
-// a seperate SchedPredicate for them.
+// a separate SchedPredicate for them.
def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
WriteISReg, WriteIEReg,WriteIS,
WriteID32,WriteID64,
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
new file mode 100644
index 0000000..8209f96
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -0,0 +1,304 @@
+//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexA57Model : SchedMachineModel {
+ let IssueWidth = 8; // 3-way decode and 8-way issue
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1> { let BufferSize = 8; } // Type B micro-ops
+def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops
+def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops
+def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops
+def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops
+def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops
+def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops
+let SchedModel = CortexA57Model in {
+ def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
+}
+
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "AArch64SchedA57WriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Cortex-A57. The Cortex-A57 types are directly associated with resources, so
+// defining the aliases precludes the need for mapping them using WriteRes. The
+// aliases are sufficient for creating a coarse, working model. As the model
+// evolves, InstRWs will be used to override these SchedAliases.
+
+def : SchedAlias<WriteImm, A57Write_1cyc_1I>;
+def : SchedAlias<WriteI, A57Write_1cyc_1I>;
+def : SchedAlias<WriteISReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteExtr, A57Write_1cyc_1I>;
+def : SchedAlias<WriteIS, A57Write_1cyc_1I>;
+def : SchedAlias<WriteID32, A57Write_19cyc_1M>;
+def : SchedAlias<WriteID64, A57Write_35cyc_1M>;
+def : SchedAlias<WriteIM32, A57Write_3cyc_1M>;
+def : SchedAlias<WriteIM64, A57Write_5cyc_1M>;
+def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>;
+def : SchedAlias<WriteLD, A57Write_4cyc_1L>;
+def : SchedAlias<WriteST, A57Write_1cyc_1S>;
+def : SchedAlias<WriteSTP, A57Write_1cyc_1S>;
+def : SchedAlias<WriteAdr, A57Write_1cyc_1I>;
+def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>;
+def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
+def : SchedAlias<WriteF, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>;
+def : SchedAlias<WriteV, A57Write_3cyc_1V>;
+def : SchedAlias<WriteVLD, A57Write_5cyc_1L>;
+def : SchedAlias<WriteVST, A57Write_1cyc_1S>;
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+// Forwarding logic is not [yet] explicitly modeled beyond what is captured
+// in the latencies of the A57 Generic SchedWriteRes's.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the above SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>;
+
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+// Multiply high
+def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>;
+
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs EXTRWrri)>;
+def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>;
+def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
+
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>;
+
+
+// Vector Load
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1i(64)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_7cyc_3L], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_7cyc_3L, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD2i(32)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2i(32)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2i(64)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L], (instregex "LD2Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD2Twov(2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3i(32)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD3i(64)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD3Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4i(32)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(64)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L], (instregex "LD4Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+// Vector Store
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1S], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST1i(64)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[A57Write_1cyc_1S], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S], (instregex "ST2i(64)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST2Twov(2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST3i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S], (instregex "ST3i(32)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST3i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_3S_2V], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST4i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S], (instregex "ST4i(32)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST4i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+} // SchedModel = CortexA57Model
diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
new file mode 100644
index 0000000..a8f421b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -0,0 +1,512 @@
+//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
+//
+//===----------------------------------------------------------------------===//
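+//
+// As a usage sketch (mirroring the A57 scheduling model earlier in this
+// patch), a WriteRes defined here is attached to instructions via InstRW,
+// e.g.:
+//   def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>;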
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; }
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 64;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 36;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1M_2S : SchedWriteRes<[A57UnitM,
+ A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_3S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_2S_1V : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_5cyc_1I_2L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+def A57Write_6cyc_1I_2L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def A57Write_6cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+def A57Write_7cyc_3L : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1I_1L_1V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_2V : SchedWriteRes<[A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_9cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 4 micro-op types
+
+def A57Write_2cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_3S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def A57Write_4cyc_4S : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def A57Write_7cyc_1I_3L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL, A57UnitL]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+}
+def A57Write_5cyc_2I_2L : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitL, A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def A57Write_8cyc_4L : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+def A57Write_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 5 micro-op types
+
+def A57Write_3cyc_3S_2V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 5;
+}
+def A57Write_8cyc_1I_4L : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def A57Write_4cyc_1I_4S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 6 micro-op types
+
+def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 6;
+}
+def A57Write_4cyc_2I_4S : SchedWriteRes<[A57UnitI, A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+def A57Write_4cyc_4S_2V : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+def A57Write_6cyc_6S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+def A57Write_9cyc_2L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 7 micro-op types
+
+def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 7;
+}
+def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 7;
+}
+def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS]> {
+ let Latency = 6;
+ let NumMicroOps = 7;
+}
+def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 7;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 8 micro-op types
+
+def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+}
+def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 8;
+}
+def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 9 micro-op types
+
+def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
+ let Latency = 8;
+ let NumMicroOps = 9;
+}
+def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
+ A57UnitL, A57UnitL,
+ A57UnitL, A57UnitL,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 10 micro-op types
+
+def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 10;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 11 micro-op types
+
+def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 12 micro-op types
+
+def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 13 micro-op types
+
+def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 13;
+}
+
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 5c65b75..1bf64fc 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -16,9 +16,8 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-selectiondag-info"
-AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM)
- : TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {}
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const DataLayout *DL)
+ : TargetSelectionDAGInfo(DL) {}
AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {}
@@ -30,7 +29,9 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
const char *bzeroEntry =
- (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr;
+ (V && V->isNullValue())
+ ? DAG.getTarget().getSubtarget<AArch64Subtarget>().getBZeroEntry()
+ : nullptr;
// For small size (< 256), it is not beneficial to use bzero
// instead of memset.
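// (For example, a 1024-byte zero memset may be lowered to a bzero call
// here, while a 64-byte one stays as an inline memset.)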
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
@@ -50,7 +51,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 8381f99..1180eea 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -19,12 +19,8 @@
namespace llvm {
class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
- /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
- /// make the right decision when generating code for different targets.
- const AArch64Subtarget *Subtarget;
-
public:
- explicit AArch64SelectionDAGInfo(const TargetMachine &TM);
+ explicit AArch64SelectionDAGInfo(const DataLayout *DL);
~AArch64SelectionDAGInfo();
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index cd69994..bb0b72c 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -30,21 +30,35 @@ static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
"converter pass"), cl::init(true), cl::Hidden);
-AArch64Subtarget::AArch64Subtarget(const std::string &TT,
- const std::string &CPU,
- const std::string &FS, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
- HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
- TargetTriple(TT), IsLittleEndian(LittleEndian) {
+AArch64Subtarget &
+AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
// Determine default and user-specified characteristics
if (CPUString.empty())
CPUString = "generic";
ParseSubtargetFeatures(CPUString, FS);
+ return *this;
}
+AArch64Subtarget::AArch64Subtarget(const std::string &TT,
+ const std::string &CPU,
+ const std::string &FS, TargetMachine &TM,
+ bool LittleEndian)
+ : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
+ HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
+ HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
+ TargetTriple(TT),
+        // This nested ternary is horrible, but DL needs to be properly
+        // initialized before TLInfo is constructed.
+ DL(isTargetMachO()
+ ? "e-m:o-i64:64-i128:128-n32:64-S128"
+ : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
+ : "E-m:e-i64:64-i128:128-n32:64-S128")),
+ FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)),
+ TSInfo(&DL), TLInfo(TM) {}
+
/// ClassifyGlobalReference - Find the target operand flags that describe
/// how a global value should be referenced for the current subtarget.
unsigned char
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 590ea05..52124f6 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -14,8 +14,13 @@
#ifndef AArch64SUBTARGET_H
#define AArch64SUBTARGET_H
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64ISelLowering.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
#define GET_SUBTARGETINFO_HEADER
@@ -49,15 +54,32 @@ protected:
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
- /// IsLittleEndian - Is the target little endian?
- bool IsLittleEndian;
+ const DataLayout DL;
+ AArch64FrameLowering FrameLowering;
+ AArch64InstrInfo InstrInfo;
+ AArch64SelectionDAGInfo TSInfo;
+ AArch64TargetLowering TLInfo;
+private:
+ /// initializeSubtargetDependencies - Initializes using CPUString and the
+ /// passed in feature string so that we can use initializer lists for
+ /// subtarget initialization.
+ AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
AArch64Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool LittleEndian);
-
+ const std::string &FS, TargetMachine &TM, bool LittleEndian);
+
+ const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ const AArch64FrameLowering *getFrameLowering() const {
+ return &FrameLowering;
+ }
+ const AArch64TargetLowering *getTargetLowering() const {
+ return &TLInfo;
+ }
+ const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
bool enableMachineScheduler() const override { return true; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -69,7 +91,7 @@ public:
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
- bool isLittleEndian() const { return IsLittleEndian; }
+ bool isLittleEndian() const { return DL.isLittleEndian(); }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 0b5dd2f..f99b90b 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -53,6 +53,12 @@ static cl::opt<bool>
EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
" optimization pass"), cl::init(true), cl::Hidden);
+static cl::opt<bool>
+EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
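+// (Usage sketch: as with any cl::opt flag, this can be toggled on the llc
+// command line, e.g. "llc -aarch64-atomic-cfg-tidy=false".)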
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -71,16 +77,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool LittleEndian)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, LittleEndian),
- // This nested ternary is horrible, but DL needs to be properly
- // initialized
- // before TLInfo is constructed.
- DL(Subtarget.isTargetMachO()
- ? "e-m:o-i64:64-i128:128-n32:64-S128"
- : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
- : "E-m:e-i64:64-i128:128-n32:64-S128")),
- InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
- TSInfo(*this) {
+ Subtarget(TT, CPU, FS, *this, LittleEndian) {
initAsmInfo();
}
@@ -113,6 +110,7 @@ public:
return getTM<AArch64TargetMachine>();
}
+ void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addILPOpts() override;
@@ -135,6 +133,20 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
return new AArch64PassConfig(this, PM);
}
+void AArch64PassConfig::addIRPasses() {
+  // Always expand atomic operations; we don't deal with atomicrmw or cmpxchg
+  // ourselves.
+ addPass(createAtomicExpandLoadLinkedPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether they succeeded. We can exploit the existing control
+  // flow in ldrex/strex loops to simplify this, but it needs tidying up.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+ addPass(createCFGSimplificationPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
// Pass Pipeline Configuration
bool AArch64PassConfig::addPreISel() {
// Run promote constant before global merge, so that the promoted constants
@@ -146,10 +158,6 @@ bool AArch64PassConfig::addPreISel() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createAArch64AddressTypePromotionPass());
- // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
- // ourselves.
- addPass(createAtomicExpandLoadLinkedPass(TM));
-
return false;
}
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 079b19b..852cb3f 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -15,13 +15,9 @@
#define AArch64TARGETMACHINE_H
#include "AArch64InstrInfo.h"
-#include "AArch64ISelLowering.h"
#include "AArch64Subtarget.h"
-#include "AArch64FrameLowering.h"
-#include "AArch64SelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/MC/MCStreamer.h"
namespace llvm {
@@ -29,13 +25,6 @@ class AArch64TargetMachine : public LLVMTargetMachine {
protected:
AArch64Subtarget Subtarget;
-private:
- const DataLayout DL;
- AArch64InstrInfo InstrInfo;
- AArch64TargetLowering TLInfo;
- AArch64FrameLowering FrameLowering;
- AArch64SelectionDAGInfo TSInfo;
-
public:
AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -46,18 +35,22 @@ public:
return &Subtarget;
}
const AArch64TargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
- const DataLayout *getDataLayout() const override { return &DL; }
const AArch64FrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const AArch64InstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
}
- const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
const AArch64RegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return &getInstrInfo()->getRegisterInfo();
}
const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
// Pass Pipeline Configuration
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 33e482a..1dac14b 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -306,28 +306,64 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
// LowerVectorINT_TO_FP:
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+
+ // Complex: to v2f32
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+
+ // Complex: to v4f32
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+
+ // Complex: to v2f64
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+
// LowerVectorFP_TO_INT
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
- { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+
+ // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
+
+ // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
+
+ // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
};
int Idx = ConvertCostTableLookup<MVT>(
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 65b77c5..c42d11e 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -38,14 +38,19 @@ namespace {
class AArch64Operand;
class AArch64AsmParser : public MCTargetAsmParser {
-public:
- typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector;
-
private:
StringRef Mnemonic; ///< Instruction mnemonic.
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+  // Map of register aliases registered via the .req directive.
+ StringMap<std::pair<bool, unsigned> > RegisterReqs;
+
+ AArch64TargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<AArch64TargetStreamer &>(TS);
+ }
+
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
@@ -54,6 +59,7 @@ private:
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+ unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
int tryParseRegister();
int tryMatchVectorRegister(StringRef &Kind, bool expected);
bool parseRegister(OperandVector &Operands);
@@ -70,6 +76,10 @@ private:
bool parseDirectiveTLSDescCall(SMLoc L);
bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+ bool parseDirectiveLtorg(SMLoc L);
+
+ bool parseDirectiveReq(StringRef Name, SMLoc L);
+ bool parseDirectiveUnreq(SMLoc L);
bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -108,6 +118,8 @@ public:
const MCTargetOptions &Options)
: MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
MCAsmParserExtension::Initialize(_Parser);
+ if (Parser.getStreamer().getTargetStreamer() == nullptr)
+ new AArch64TargetStreamer(Parser.getStreamer());
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -117,7 +129,7 @@ public:
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool ParseDirective(AsmToken DirectiveID) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
static bool classifySymbolRef(const MCExpr *Expr,
@@ -240,10 +252,10 @@ private:
// the add<>Operands() calls.
MCContext &Ctx;
+public:
AArch64Operand(KindTy K, MCContext &_Ctx)
: MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
-public:
AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
Kind = o.Kind;
StartLoc = o.StartLoc;
@@ -607,7 +619,11 @@ public:
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
- return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32);
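+    // Accept only values whose high 32 bits are all zeros or all ones (a
+    // valid zero- or sign-extension of a 32-bit value), then test the low
+    // half as a 32-bit logical immediate.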
+ int64_t Val = MCE->getValue();
+ if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+ return false;
+ Val &= 0xFFFFFFFF;
+ return AArch64_AM::isLogicalImmediate(Val, 32);
}
bool isLogicalImm64() const {
if (!isImm())
@@ -617,6 +633,23 @@ public:
return false;
return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
}
+ bool isLogicalImm32Not() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+ return AArch64_AM::isLogicalImmediate(Val, 32);
+ }
+ bool isLogicalImm64Not() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
+ }
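+  // (The *Not predicates above presumably back aliases that encode the
+  // bitwise complement of the immediate, e.g. a MOV implemented via ORN;
+  // that use is an assumption, not something this hunk spells out.)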
bool isShiftedImm() const { return Kind == k_ShiftedImm; }
bool isAddSubImm() const {
if (!isShiftedImm() && !isImm())
@@ -1348,7 +1381,8 @@ public:
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
assert(MCE && "Invalid logical immediate operand!");
- uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
+ uint64_t encoding =
+ AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
Inst.addOperand(MCOperand::CreateImm(encoding));
}
@@ -1360,6 +1394,22 @@ public:
Inst.addOperand(MCOperand::CreateImm(encoding));
}
+ void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+ int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+ uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
+ void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+ uint64_t encoding =
+ AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64);
+ Inst.addOperand(MCOperand::CreateImm(encoding));
+ }
+
void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
@@ -1523,9 +1573,9 @@ public:
void print(raw_ostream &OS) const override;
- static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S,
- MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_Token, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Token, Ctx);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->Tok.IsSuffix = IsSuffix;
@@ -1534,9 +1584,9 @@ public:
return Op;
}
- static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S,
- SMLoc E, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_Register, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
Op->Reg.RegNum = RegNum;
Op->Reg.isVector = isVector;
Op->StartLoc = S;
@@ -1544,10 +1594,10 @@ public:
return Op;
}
- static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
- unsigned NumElements, char ElementKind,
- SMLoc S, SMLoc E, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
+ char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.NumElements = NumElements;
@@ -1557,28 +1607,29 @@ public:
return Op;
}
- static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
- MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
- MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx);
+ static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static AArch64Operand *CreateShiftedImm(const MCExpr *Val,
- unsigned ShiftAmount, SMLoc S,
- SMLoc E, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx);
+ static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
+ unsigned ShiftAmount,
+ SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+    Op->ShiftedImm.Val = Val;
Op->ShiftedImm.ShiftAmount = ShiftAmount;
Op->StartLoc = S;
@@ -1586,34 +1637,36 @@ public:
return Op;
}
- static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S,
- SMLoc E, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx);
Op->CondCode.Code = Code;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx);
+ static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
Op->FPImm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx);
+ static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, SMLoc S,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx);
Op->Barrier.Val = Val;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S,
- uint64_t FeatureBits, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateSysReg(StringRef Str, SMLoc S, uint64_t FeatureBits, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
Op->SysReg.Data = Str.data();
Op->SysReg.Length = Str.size();
Op->SysReg.FeatureBits = FeatureBits;
@@ -1622,27 +1675,28 @@ public:
return Op;
}
- static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
- MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx);
+ static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
+ SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx);
Op->SysCRImm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx);
+ static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, SMLoc S,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
Op->Prefetch.Val = Val;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp,
- unsigned Val, bool HasExplicitAmount,
- SMLoc S, SMLoc E, MCContext &Ctx) {
- AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx);
+ static std::unique_ptr<AArch64Operand>
+ CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
+ bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
Op->ShiftExtend.Type = ShOp;
Op->ShiftExtend.Amount = Val;
Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
@@ -1816,6 +1870,26 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return (RegNo == (unsigned)-1);
}
+// Matches a register name or register alias previously defined by '.req'
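+// For example (sketch): after "foo .req x4", writing "add foo, foo, #1"
+// assembles as "add x4, x4, #1", and ".unreq foo" removes the alias again.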
+unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
+ bool isVector) {
+ unsigned RegNum = isVector ? matchVectorRegName(Name)
+ : MatchRegisterName(Name);
+
+ if (RegNum == 0) {
+ // Check for aliases registered via .req. Canonicalize to lower case.
+ // That's more consistent since register names are case insensitive, and
+ // it's how the original entry was passed in from MC/MCParser/AsmParser.
+ auto Entry = RegisterReqs.find(Name.lower());
+ if (Entry == RegisterReqs.end())
+ return 0;
+    // Set RegNum if the match is the right kind of register.
+ if (isVector == Entry->getValue().first)
+ RegNum = Entry->getValue().second;
+ }
+ return RegNum;
+}
+
/// tryParseRegister - Try to parse a register name. The token must be an
/// Identifier when called, and if it is a register name the token is eaten and
/// the register is added to the operand list.
@@ -1824,7 +1898,7 @@ int AArch64AsmParser::tryParseRegister() {
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
std::string lowerCase = Tok.getString().lower();
- unsigned RegNum = MatchRegisterName(lowerCase);
+ unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
// Also handle a few aliases of registers.
if (RegNum == 0)
RegNum = StringSwitch<unsigned>(lowerCase)
@@ -1854,7 +1928,8 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
// a '.'.
size_t Start = 0, Next = Name.find('.');
StringRef Head = Name.slice(Start, Next);
- unsigned RegNum = matchVectorRegName(Head);
+ unsigned RegNum = matchRegisterNameAlias(Head, true);
+
if (RegNum) {
if (Next != StringRef::npos) {
Kind = Name.slice(Next, StringRef::npos);
@@ -2183,8 +2258,11 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
return TokError("invalid condition code");
Parser.Lex(); // Eat identifier token.
- if (invertCondCode)
+ if (invertCondCode) {
+ if (CC == AArch64CC::AL || CC == AArch64CC::NV)
+ return TokError("condition codes AL and NV are invalid for this instruction");
CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+ }
Operands.push_back(
AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
@@ -2849,7 +2927,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
- unsigned RegNum = MatchRegisterName(Tok.getString().lower());
+ unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
MCContext &Ctx = getContext();
const MCRegisterInfo *RI = Ctx.getRegisterInfo();
@@ -3000,6 +3078,43 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
return false;
}
+ case AsmToken::Equal: {
+ SMLoc Loc = Parser.getTok().getLoc();
+ if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+ return Error(Loc, "unexpected token in operand");
+ Parser.Lex(); // Eat '='
+ const MCExpr *SubExprVal;
+ if (getParser().parseExpression(SubExprVal))
+ return true;
+
+ MCContext& Ctx = getContext();
+ E = SMLoc::getFromPointer(Loc.getPointer() - 1);
+    // If the op is an imm and can fit in a mov, then replace the ldr with a mov.
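+    // For example (sketch): "ldr x0, =0x12340000" is emitted as
+    // "movz x0, #0x1234, lsl #16", since the constant fits in 16 bits once
+    // shifted; constants that don't fit go to the constant pool below.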
+ if (isa<MCConstantExpr>(SubExprVal) && Operands.size() >= 2 &&
+ static_cast<AArch64Operand &>(*Operands[1]).isReg()) {
+ bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+ Operands[1]->getReg());
+ uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
+ uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
+      while (Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
+ ShiftAmt += 16;
+ Imm >>= 16;
+ }
+ if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
+ Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
+ Operands.push_back(AArch64Operand::CreateImm(
+ MCConstantExpr::Create(Imm, Ctx), S, E, Ctx));
+ if (ShiftAmt)
+ Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
+ ShiftAmt, true, S, E, Ctx));
+ return false;
+ }
+ }
+ // If it is a label or an imm that cannot fit in a movz, put it into CP.
+ const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal);
+ Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
+ return false;
+ }
}
}
@@ -3029,6 +3144,15 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
.Case("bnv", "b.nv")
.Default(Name);
+ // First check for the AArch64-specific .req directive.
+ if (Parser.getTok().is(AsmToken::Identifier) &&
+ Parser.getTok().getIdentifier() == ".req") {
+ parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the instruction.
+ return true;
+ }
+
// Create the leading tokens for the mnemonic, split by '.' characters.
size_t Start = 0, Next = Name.find('.');
StringRef Head = Name.slice(Start, Next);
@@ -3443,8 +3567,7 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
case Match_MnemonicFail:
return Error(Loc, "unrecognized instruction mnemonic");
default:
- assert(0 && "unexpected error code!");
- return Error(Loc, "invalid instruction format");
+ llvm_unreachable("unexpected error code!");
}
}
@@ -3456,23 +3579,23 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpect empty operand list!");
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[0]);
- assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
- StringRef Tok = Op->getToken();
+ StringRef Tok = Op.getToken();
unsigned NumOperands = Operands.size();
if (NumOperands == 4 && Tok == "lsl") {
- AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]);
- AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
- if (Op2->isReg() && Op3->isImm()) {
- const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+ AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ if (Op2.isReg() && Op3.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
if (Op3CE) {
uint64_t Op3Val = Op3CE->getValue();
uint64_t NewOp3Val = 0;
uint64_t NewOp4Val = 0;
if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
- Op2->getReg())) {
+ Op2.getReg())) {
NewOp3Val = (32 - Op3Val) & 0x1f;
NewOp4Val = 31 - Op3Val;
} else {
@@ -3484,26 +3607,24 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op->getStartLoc(), getContext());
- Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
- Op3->getEndLoc(), getContext());
+ "ubfm", false, Op.getStartLoc(), getContext());
Operands.push_back(AArch64Operand::CreateImm(
- NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
- delete Op3;
- delete Op;
+ NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
+ Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
+ Op3.getEndLoc(), getContext());
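+          // Note: the push_back above must read Op3 before Operands[3] is
+          // reassigned, as the assignment destroys the operand Op3 refers to.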
}
}
} else if (NumOperands == 5) {
// FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
// UBFIZ -> UBFM aliases.
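// (For example, following the arithmetic below: "bfi w0, w1, #3, #4"
// becomes "bfm w0, w1, #29, #3".)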
if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
- AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
- AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
- AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+ AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
- if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
- const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
- const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+ if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
if (Op3CE && Op4CE) {
uint64_t Op3Val = Op3CE->getValue();
@@ -3511,21 +3632,21 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t RegWidth = 0;
if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
- Op1->getReg()))
+ Op1.getReg()))
RegWidth = 64;
else
RegWidth = 32;
if (Op3Val >= RegWidth)
- return Error(Op3->getStartLoc(),
+ return Error(Op3.getStartLoc(),
"expected integer in range [0, 31]");
if (Op4Val < 1 || Op4Val > RegWidth)
- return Error(Op4->getStartLoc(),
+ return Error(Op4.getStartLoc(),
"expected integer in range [1, 32]");
uint64_t NewOp3Val = 0;
if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
- Op1->getReg()))
+ Op1.getReg()))
NewOp3Val = (32 - Op3Val) & 0x1f;
else
NewOp3Val = (64 - Op3Val) & 0x3f;
@@ -3533,7 +3654,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t NewOp4Val = Op4Val - 1;
if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
- return Error(Op4->getStartLoc(),
+ return Error(Op4.getStartLoc(),
"requested insert overflows register");
const MCExpr *NewOp3 =
@@ -3541,24 +3662,20 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
const MCExpr *NewOp4 =
MCConstantExpr::Create(NewOp4Val, getContext());
Operands[3] = AArch64Operand::CreateImm(
- NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext());
+ NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
Operands[4] = AArch64Operand::CreateImm(
- NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+ NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfi")
Operands[0] = AArch64Operand::CreateToken(
- "bfm", false, Op->getStartLoc(), getContext());
+ "bfm", false, Op.getStartLoc(), getContext());
else if (Tok == "sbfiz")
Operands[0] = AArch64Operand::CreateToken(
- "sbfm", false, Op->getStartLoc(), getContext());
+ "sbfm", false, Op.getStartLoc(), getContext());
else if (Tok == "ubfiz")
Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op->getStartLoc(), getContext());
+ "ubfm", false, Op.getStartLoc(), getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
-
- delete Op;
- delete Op3;
- delete Op4;
}
}
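The two alias rewrites above reduce to small immediate computations. A worked
sketch under illustrative operand values that are not taken from this patch:
for 32-bit registers, "lsl w0, w1, #4" becomes "ubfm w0, w1, #28, #27", and
"bfi w0, w1, #8, #4" becomes "bfm w0, w1, #24, #3".

    // Compile-time restatement of the immediate arithmetic (C++11).
    constexpr unsigned LslShift = 4;
    static_assert(((32 - LslShift) & 0x1f) == 28 && (31 - LslShift) == 27,
                  "lsl w0, w1, #4  ==>  ubfm w0, w1, #28, #27");
    constexpr unsigned BfiLsb = 8, BfiWidth = 4;
    static_assert(((32 - BfiLsb) & 0x1f) == 24 && (BfiWidth - 1) == 3,
                  "bfi w0, w1, #8, #4  ==>  bfm w0, w1, #24, #3");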
@@ -3566,13 +3683,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// UBFX -> UBFM aliases.
} else if (NumOperands == 5 &&
(Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
- AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
- AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
- AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+ AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+ AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
- if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
- const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
- const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+ if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+ const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+ const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
if (Op3CE && Op4CE) {
uint64_t Op3Val = Op3CE->getValue();
@@ -3580,42 +3697,39 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
uint64_t RegWidth = 0;
if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
- Op1->getReg()))
+ Op1.getReg()))
RegWidth = 64;
else
RegWidth = 32;
if (Op3Val >= RegWidth)
- return Error(Op3->getStartLoc(),
+ return Error(Op3.getStartLoc(),
"expected integer in range [0, 31]");
if (Op4Val < 1 || Op4Val > RegWidth)
- return Error(Op4->getStartLoc(),
+ return Error(Op4.getStartLoc(),
"expected integer in range [1, 32]");
uint64_t NewOp4Val = Op3Val + Op4Val - 1;
if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
- return Error(Op4->getStartLoc(),
+ return Error(Op4.getStartLoc(),
"requested extract overflows register");
const MCExpr *NewOp4 =
MCConstantExpr::Create(NewOp4Val, getContext());
Operands[4] = AArch64Operand::CreateImm(
- NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+ NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfxil")
Operands[0] = AArch64Operand::CreateToken(
- "bfm", false, Op->getStartLoc(), getContext());
+ "bfm", false, Op.getStartLoc(), getContext());
else if (Tok == "sbfx")
Operands[0] = AArch64Operand::CreateToken(
- "sbfm", false, Op->getStartLoc(), getContext());
+ "sbfm", false, Op.getStartLoc(), getContext());
else if (Tok == "ubfx")
Operands[0] = AArch64Operand::CreateToken(
- "ubfm", false, Op->getStartLoc(), getContext());
+ "ubfm", false, Op.getStartLoc(), getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
-
- delete Op;
- delete Op4;
}
}
}
@@ -3626,63 +3740,58 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
- if (Op->isReg()) {
- unsigned Reg = getXRegFromWReg(Op->getReg());
- Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
- Op->getEndLoc(), getContext());
- delete Op;
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+ if (Op.isReg()) {
+ unsigned Reg = getXRegFromWReg(Op.getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
}
}
// FIXME: Likewise for sxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
- if (Op->isReg() &&
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
- Op->getReg())) {
+ Op.getReg())) {
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
- if (Op->isReg()) {
- unsigned Reg = getXRegFromWReg(Op->getReg());
- Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
- Op->getEndLoc(), getContext());
- delete Op;
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+ if (Op.isReg()) {
+ unsigned Reg = getXRegFromWReg(Op.getReg());
+ Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
}
}
}
// FIXME: Likewise for uxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
- if (Op->isReg() &&
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
- Op->getReg())) {
+ Op.getReg())) {
// The source register can be Wn here, but the matcher expects a
// GPR32. Twiddle it here if necessary.
- AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
- if (Op->isReg()) {
- unsigned Reg = getWRegFromXReg(Op->getReg());
- Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
- Op->getEndLoc(), getContext());
- delete Op;
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+ if (Op.isReg()) {
+ unsigned Reg = getWRegFromXReg(Op.getReg());
+ Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
}
}
}
// Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
if (NumOperands == 3 && Tok == "fmov") {
- AArch64Operand *RegOp = static_cast<AArch64Operand *>(Operands[1]);
- AArch64Operand *ImmOp = static_cast<AArch64Operand *>(Operands[2]);
- if (RegOp->isReg() && ImmOp->isFPImm() &&
- ImmOp->getFPImm() == (unsigned)-1) {
+ AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
+ AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
+ if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
unsigned zreg =
AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
- RegOp->getReg())
+ RegOp.getReg())
? AArch64::WZR
: AArch64::XZR;
- Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(),
- Op->getEndLoc(), getContext());
- delete ImmOp;
+ Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
+ Op.getEndLoc(), getContext());
}
}
@@ -3735,14 +3844,14 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
// If the match failed on a suffix token operand, tweak the diagnostic
// accordingly.
- if (((AArch64Operand *)Operands[ErrorInfo])->isToken() &&
- ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix())
+ if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() &&
+ ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
return showMatchError(ErrorLoc, MatchResult);
@@ -3794,9 +3903,11 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidLabel:
case Match_MSR:
case Match_MRS: {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
// Any time we get here, there's nothing fancy to do. Just get the
// operand SMLoc and display the diagnostic.
- SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+ SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
return showMatchError(ErrorLoc, MatchResult);
@@ -3819,6 +3930,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveWord(8, Loc);
if (IDVal == ".tlsdesccall")
return parseDirectiveTLSDescCall(Loc);
+ if (IDVal == ".ltorg" || IDVal == ".pool")
+ return parseDirectiveLtorg(Loc);
+ if (IDVal == ".unreq")
+ return parseDirectiveUnreq(DirectiveID.getLoc());
return parseDirectiveLOH(IDVal, Loc);
}
@@ -3920,6 +4035,66 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
return false;
}
+/// parseDirectiveLtorg
+/// ::= .ltorg | .pool
+bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
+ getTargetStreamer().emitCurrentConstantPool();
+ return false;
+}
+
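The directive only flushes state that earlier parsing has accumulated. A
minimal sketch of the intended flow, assuming an ldr-pseudo path ("ldr x0,
=imm") that supplies ImmExpr; that path is an assumption here and is not part
of this hunk:

    // Sketch, inside AArch64AsmParser. ImmExpr is the =imm payload (assumed).
    void sketchLdrPseudo(const MCExpr *ImmExpr) {
      // Register the constant; the returned label is what the LDR references.
      const MCExpr *PoolLabel =
          getTargetStreamer().addConstantPoolEntry(ImmExpr);
      (void)PoolLabel; // ... emit the LDR with PoolLabel as its operand ...
      // ".ltorg" / ".pool" then runs the line below, emitting the pooled
      // constants at that point in the current section:
      getTargetStreamer().emitCurrentConstantPool();
    }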
+/// parseDirectiveReq
+/// ::= name .req registername
+bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+ Parser.Lex(); // Eat the '.req' token.
+ SMLoc SRegLoc = getLoc();
+ unsigned RegNum = tryParseRegister();
+ bool IsVector = false;
+
+ if (RegNum == static_cast<unsigned>(-1)) {
+ StringRef Kind;
+ RegNum = tryMatchVectorRegister(Kind, false);
+ if (!Kind.empty()) {
+ Error(SRegLoc, "vector register without type specifier expected");
+ return false;
+ }
+ IsVector = true;
+ }
+
+ if (RegNum == static_cast<unsigned>(-1)) {
+ Parser.eatToEndOfStatement();
+ Error(SRegLoc, "register name or alias expected");
+ return false;
+ }
+
+ // Shouldn't be anything else.
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+ Error(Parser.getTok().getLoc(), "unexpected input in .req directive");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+
+ auto pair = std::make_pair(IsVector, RegNum);
+ if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+ Warning(L, "ignoring redefinition of register alias '" + Name + "'");
+
+ return true;
+}
+
+/// parseDirectiveUnreq
+/// ::= .unreq registername
+bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+ Parser.Lex(); // Eat the identifier.
+ return false;
+}
+
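Together the two directives maintain a simple alias registry. A hedged sketch
of its behavior, assuming RegisterReqs is the
StringMap<std::pair<bool, unsigned>> member this patch adds to the parser (the
register number 4 merely stands in for a real AArch64 register enum value):

    #include "llvm/ADT/StringMap.h"
    #include <utility>

    llvm::StringMap<std::pair<bool, unsigned>> RegisterReqs;

    void sketchReqUnreq() {
      // "foo .req x4": record the alias (IsVector = false).
      auto Alias = std::make_pair(false, 4u);
      if (RegisterReqs.GetOrCreateValue("foo", Alias).getValue() != Alias)
        ; // warn: ignoring redefinition of register alias 'foo'
      // ".unreq foo": drop it again (lookups fall back to real registers).
      RegisterReqs.erase("foo");
    }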
bool
AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
AArch64MCExpr::VariantKind &ELFRefKind,
@@ -3986,9 +4161,9 @@ extern "C" void LLVMInitializeAArch64AsmParser() {
// Define this matcher function after the auto-generated include so we
// have the match class enum definitions.
-unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
unsigned Kind) {
- AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp);
+ AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp);
// If the kind is a token for a literal immediate, check if our asm
// operand matches. This is for InstAliases which have a fixed-value
// immediate in the syntax.
@@ -4036,9 +4211,9 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
ExpectedVal = 8;
break;
}
- if (!Op->isImm())
+ if (!Op.isImm())
return Match_InvalidOperand;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
if (!CE)
return Match_InvalidOperand;
if (CE->getValue() == ExpectedVal)
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 2466368..2057c51 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -37,8 +37,7 @@ getVariant(uint64_t LLVMDisassembler_VariantKind) {
case LLVMDisassembler_VariantKind_ARM64_TLVP:
case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
default:
- assert(0 && "bad LLVMDisassembler_VariantKind");
- return MCSymbolRefExpr::VK_None;
+ llvm_unreachable("bad LLVMDisassembler_VariantKind");
}
}
diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt
index be4ccad..d64c05b 100644
--- a/lib/Target/AArch64/Disassembler/CMakeLists.txt
+++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt
@@ -4,11 +4,5 @@ add_llvm_library(LLVMAArch64Disassembler
AArch64Disassembler.cpp
AArch64ExternalSymbolizer.cpp
)
-# workaround for hanging compilation on MSVC8, 9 and 10
-#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-#set_property(
-# SOURCE ARMDisassembler.cpp
-# PROPERTY COMPILE_FLAGS "/Od"
-# )
-#endif()
+
add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index f484a5b..8a21f06 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -918,7 +918,7 @@ void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
else
O << getRegisterName(Reg);
} else
- assert(0 && "unknown operand kind in printPostIncOperand64");
+ llvm_unreachable("unknown operand kind in printPostIncOperand64");
}
void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
@@ -1109,7 +1109,7 @@ static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
while (Stride--) {
switch (Reg) {
default:
- assert(0 && "Vector register expected!");
+ llvm_unreachable("Vector register expected!");
case AArch64::Q0: Reg = AArch64::Q1; break;
case AArch64::Q1: Reg = AArch64::Q2; break;
case AArch64::Q2: Reg = AArch64::Q3; break;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index d8900d4..a917616 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -86,7 +86,7 @@ public:
static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
default:
- assert(0 && "Unknown fixup kind!");
+ llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_tlsdesc_call:
return 0;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index dc4a8bf..1763b40 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -96,4 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
ExceptionsType = ExceptionHandling::DwarfCFI;
UseIntegratedAssembler = true;
+
+ HasIdentDirective = true;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 464a18c..f051357 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -218,13 +218,9 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
const MCSubtargetInfo &STI) const {
if (MO.isReg())
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
- else {
- assert(MO.isImm() && "did not expect relocated expression");
- return static_cast<unsigned>(MO.getImm());
- }
- assert(0 && "Unable to encode MCOperand!");
- return 0;
+ assert(MO.isImm() && "did not expect relocated expression");
+ return static_cast<unsigned>(MO.getImm());
}
template<unsigned FixupKind> uint32_t
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 85c3ec7..42a6787 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -81,37 +81,8 @@ void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
OS << *Expr;
}
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-// FIXME: really do above: now that two backends are using it.
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expr!");
- break;
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbolsImpl(BE->getLHS(), Asm);
- AddValueSymbolsImpl(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef:
- Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
- break;
- }
-}
-
-void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
- AddValueSymbolsImpl(getSubExpr(), Asm);
+void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
}
const MCSection *AArch64MCExpr::FindAssociatedSection() const {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index e869ed0..5422f9d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,7 +147,7 @@ public:
void PrintImpl(raw_ostream &OS) const override;
- void AddValueSymbols(MCAssembler *) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
const MCSection *FindAssociatedSection() const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 5c86189..ba95366 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -75,7 +75,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
Log2Size = llvm::Log2_32(4);
switch (Sym->getKind()) {
default:
- assert(0 && "Unexpected symbol reference variant kind!");
+ llvm_unreachable("Unexpected symbol reference variant kind!");
case MCSymbolRefExpr::VK_PAGEOFF:
RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
return true;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
new file mode 100644
index 0000000..f9aeb35
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -0,0 +1,40 @@
+//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+//
+// AArch64TargetStreamer Implementation
+//
+AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+AArch64TargetStreamer::~AArch64TargetStreamer() {}
+
+// The constant pool handling is shared by all AArch64TargetStreamer
+// implementations.
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
+ return ConstantPools->addEntry(Streamer, Expr);
+}
+
+void AArch64TargetStreamer::emitCurrentConstantPool() {
+ ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk
index e9d2323..a23c0e5 100644
--- a/lib/Target/AArch64/MCTargetDesc/Android.mk
+++ b/lib/Target/AArch64/MCTargetDesc/Android.mk
@@ -14,7 +14,8 @@ aarch64_mc_desc_SRC_FILES := \
AArch64MCAsmInfo.cpp \
AArch64MCCodeEmitter.cpp \
AArch64MCExpr.cpp \
- AArch64MCTargetDesc.cpp
+ AArch64MCTargetDesc.cpp \
+ AArch64TargetStreamer.cpp
# For the host
# =====================================================
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 7d5bced..6d8be5e 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMAArch64Desc
AArch64MCExpr.cpp
AArch64MCTargetDesc.cpp
AArch64MachObjectWriter.cpp
+ AArch64TargetStreamer.cpp
)
add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9e4c389..9d2ce21 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -233,23 +233,9 @@ inline static const char *getCondCodeName(CondCode Code) {
}
inline static CondCode getInvertedCondCode(CondCode Code) {
- switch (Code) {
- default: llvm_unreachable("Unknown condition code");
- case EQ: return NE;
- case NE: return EQ;
- case HS: return LO;
- case LO: return HS;
- case MI: return PL;
- case PL: return MI;
- case VS: return VC;
- case VC: return VS;
- case HI: return LS;
- case LS: return HI;
- case GE: return LT;
- case LT: return GE;
- case GT: return LE;
- case LE: return GT;
- }
+  // To invert a condition, only the low bit needs to be flipped:
+ return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1);
}
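This works because the architectural NZCV encoding pairs each condition with
its inverse in adjacent codes that differ only in bit 0 (EQ=0b0000/NE=0b0001,
HS/LO, MI/PL, VS/VC, HI/LS, GE/LT, GT/LE); AL (0b1110) has no meaningful
inverse, which the deleted switch treated as unreachable. A small compile-time
check under that encoding assumption:

    constexpr unsigned invertCC(unsigned Code) { return Code ^ 0x1; }
    static_assert(invertCC(0x0) == 0x1, "EQ (0b0000) <-> NE (0b0001)");
    static_assert(invertCC(0xA) == 0xB, "GE (0b1010) <-> LT (0b1011)");
    static_assert(invertCC(0xC) == 0xD, "GT (0b1100) <-> LE (0b1101)");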
/// Given a condition code, return NZCV flags that would satisfy that condition.
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 94faf6f..92eaf9e 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -321,8 +321,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
}
- assert(0 && "Unhandled update pattern!");
- return 0;
+ llvm_unreachable("Unhandled update pattern!");
}
// Return true if this MachineInstr inserts a scalar (SPR) value into
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 55e9fe5..28d2610 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -82,7 +82,8 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
assert(GV && "C++ constructor pointer was not a GlobalValue!");
- const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
+ const MCExpr *E = MCSymbolRefExpr::Create(GetARMGVSymbol(GV,
+ ARMII::MO_NO_FLAG),
(Subtarget->isTargetELF()
? MCSymbolRefExpr::VK_ARM_TARGET1
: MCSymbolRefExpr::VK_None),
@@ -164,7 +165,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
(TF & ARMII::MO_HI16))
O << ":upper16:";
- O << *getSymbol(GV);
+ O << *GetARMGVSymbol(GV, TF);
printOffset(MO.getOffset(), O);
if (TF == ARMII::MO_PLT)
@@ -730,6 +731,32 @@ void ARMAsmPrinter::emitAttributes() {
if (Subtarget->hasDivideInARMMode() && !Subtarget->hasV8Ops())
ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+ if (MMI) {
+ if (const Module *SourceModule = MMI->getModule()) {
+ // ABI_PCS_wchar_t to indicate wchar_t width
+ // FIXME: There is no way to emit value 0 (wchar_t prohibited).
+ if (auto WCharWidthValue = cast_or_null<ConstantInt>(
+ SourceModule->getModuleFlag("wchar_size"))) {
+ int WCharWidth = WCharWidthValue->getZExtValue();
+ assert((WCharWidth == 2 || WCharWidth == 4) &&
+ "wchar_t width must be 2 or 4 bytes");
+ ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth);
+ }
+
+ // ABI_enum_size to indicate enum width
+ // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
+ // (all enums contain a value needing 32 bits to encode).
+ if (auto EnumWidthValue = cast_or_null<ConstantInt>(
+ SourceModule->getModuleFlag("min_enum_size"))) {
+ int EnumWidth = EnumWidthValue->getZExtValue();
+ assert((EnumWidth == 1 || EnumWidth == 4) &&
+ "Minimum enum width must be 1 or 4 bytes");
+ int EnumBuildAttr = EnumWidth == 1 ? 1 : 2;
+ ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr);
+ }
+ }
+ }
+
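For reference, a hedged sketch of the producer side: how a frontend might set
the two module flags this block consumes (wchar_size of 2 or 4, min_enum_size
of 1 or 4; an EnumWidth of 4 maps to build-attribute value 2):

    #include "llvm/IR/Module.h"

    void setARMABIModuleFlags(llvm::Module &M) {
      M.addModuleFlag(llvm::Module::Error, "wchar_size", 4);    // ABI_PCS_wchar_t = 4
      M.addModuleFlag(llvm::Module::Error, "min_enum_size", 4); // ABI_enum_size = 2
    }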
if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
ARMBuildAttrs::AllowTZVirtualization);
@@ -768,23 +795,41 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
unsigned char TargetFlags) {
- bool isIndirect = Subtarget->isTargetMachO() &&
- (TargetFlags & ARMII::MO_NONLAZY) &&
- Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
- if (!isIndirect)
- return getSymbol(GV);
+ if (Subtarget->isTargetMachO()) {
+ bool IsIndirect = (TargetFlags & ARMII::MO_NONLAZY) &&
+ Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+
+ if (!IsIndirect)
+ return getSymbol(GV);
- // FIXME: Remove this when Darwin transition to @GOT like syntax.
- MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoMachO &MMIMachO =
- MMI->getObjFileInfo<MachineModuleInfoMachO>();
- MachineModuleInfoImpl::StubValueTy &StubSym =
- GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) :
- MMIMachO.getGVStubEntry(MCSym);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::
- StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
- return MCSym;
+    // FIXME: Remove this when Darwin transitions to @GOT-like syntax.
+ MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoMachO &MMIMachO =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym)
+ : MMIMachO.getGVStubEntry(MCSym);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+ !GV->hasInternalLinkage());
+ return MCSym;
+ } else if (Subtarget->isTargetCOFF()) {
+ assert(Subtarget->isTargetWindows() &&
+ "Windows is the only supported COFF target");
+
+ bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT);
+ if (!IsIndirect)
+ return getSymbol(GV);
+
+ SmallString<128> Name;
+ Name = "__imp_";
+ getNameWithPrefix(Name, GV);
+
+ return OutContext.GetOrCreateSymbol(Name);
+ } else if (Subtarget->isTargetELF()) {
+ return getSymbol(GV);
+ }
+ llvm_unreachable("unexpected target");
}
void ARMAsmPrinter::
@@ -928,7 +973,7 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
MachineBasicBlock *MBB = JTBBs[i];
const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
- OutContext);
+ OutContext);
// If this isn't a TBB or TBH, the entries are direct branch instructions.
if (OffsetWidth == 4) {
EmitToStreamer(OutStreamer, MCInstBuilder(ARM::t2B)
@@ -1225,8 +1270,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Add 's' bit operand (always reg0 for this)
.addReg(0));
- const GlobalValue *GV = MI->getOperand(0).getGlobal();
- MCSymbol *GVSym = getSymbol(GV);
+ const MachineOperand &Op = MI->getOperand(0);
+ const GlobalValue *GV = Op.getGlobal();
+ const unsigned TF = Op.getTargetFlags();
+ MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
EmitToStreamer(OutStreamer, MCInstBuilder(ARM::Bcc)
.addExpr(GVSymExpr)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index bc266e8..0288db9 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -32,6 +32,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -102,14 +103,15 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl
// currently defaults to no prepass hazard recognizer.
-ScheduleHazardRecognizer *ARMBaseInstrInfo::
-CreateTargetHazardRecognizer(const TargetMachine *TM,
- const ScheduleDAG *DAG) const {
+ScheduleHazardRecognizer *
+ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
if (usePreRAHazardRecognizer()) {
- const InstrItineraryData *II = TM->getInstrItineraryData();
+ const InstrItineraryData *II =
+ &static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData();
return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
}
- return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG);
+ return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
}
ScheduleHazardRecognizer *ARMBaseInstrInfo::
@@ -1885,7 +1887,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
unsigned NumBytes) {
// This optimisation potentially adds lots of load and store
// micro-operations, it's only really a great benefit to code-size.
- if (!Subtarget.isMinSize())
+ if (!MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize))
return false;
// If only one register is pushed/popped, LLVM can use an LDR/STR
@@ -4358,6 +4361,29 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
MI->addRegisterKilled(DReg, TRI, true);
}
+void ARMBaseInstrInfo::getUnconditionalBranch(
+ MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+ if (Subtarget.isThumb())
+ Branch.setOpcode(ARM::tB);
+ else if (Subtarget.isThumb2())
+ Branch.setOpcode(ARM::t2B);
+ else
+ Branch.setOpcode(ARM::Bcc);
+
+ Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+ Branch.addOperand(MCOperand::CreateImm(ARMCC::AL));
+ Branch.addOperand(MCOperand::CreateReg(0));
+}
+
+void ARMBaseInstrInfo::getTrap(MCInst &MI) const {
+ if (Subtarget.isThumb())
+ MI.setOpcode(ARM::tTRAP);
+ else if (Subtarget.useNaClTrap())
+ MI.setOpcode(ARM::TRAPNaCl);
+ else
+ MI.setOpcode(ARM::TRAP);
+}
+
bool ARMBaseInstrInfo::hasNOP() const {
return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0;
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 4b3e740..b8d6758 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -50,7 +50,7 @@ public:
const ARMSubtarget &getSubtarget() const { return Subtarget; }
ScheduleHazardRecognizer *
- CreateTargetHazardRecognizer(const TargetMachine *TM,
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
@@ -229,6 +229,13 @@ public:
const TargetRegisterInfo*) const override;
void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
const TargetRegisterInfo *TRI) const override;
+
+ void
+ getUnconditionalBranch(MCInst &Branch,
+ const MCSymbolRefExpr *BranchTarget) const override;
+
+ void getTrap(MCInst &MI) const override;
+
/// Get the number of addresses by LDM or VLDM or zero for unknown.
unsigned getNumLDMAddresses(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index a2eee9f..cdd91c7 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -45,9 +45,12 @@ using namespace llvm;
ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
: ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) {
- if (STI.isTargetMachO())
- FramePtr = ARM::R7;
- else if (STI.isTargetWindows())
+ if (STI.isTargetMachO()) {
+ if (STI.isTargetDarwin() || STI.isThumb1Only())
+ FramePtr = ARM::R7;
+ else
+ FramePtr = ARM::R11;
+ } else if (STI.isTargetWindows())
FramePtr = ARM::R11;
else // ARM EABI
FramePtr = STI.isThumb() ? ARM::R7 : ARM::R11;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 2fd7edd..5fb6ebf 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -15,6 +15,7 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
#include "ARMRelocations.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 6045738..51d3dbb 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -927,10 +927,16 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
case ARM::tTPsoft:
case ARM::TPsoft: {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Opcode == ARM::tTPsoft ? ARM::tBL : ARM::BL))
- .addExternalSymbol("__aeabi_read_tp", 0);
+ MachineInstrBuilder MIB;
+ if (Opcode == ARM::tTPsoft)
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                    TII->get(ARM::tBL))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addExternalSymbol("__aeabi_read_tp", 0);
+ else
+ MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                    TII->get(ARM::BL))
+ .addExternalSymbol("__aeabi_read_tp", 0);
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
TransferImpOps(MI, MIB, MIB);
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 6f8fb1a..e2d90cd 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -590,7 +590,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
// Use movw+movt when possible, it avoids constant pool entries.
// Non-darwin targets only support static movt relocations in FastISel.
- if (Subtarget->useMovt() &&
+ if (Subtarget->useMovt(*FuncInfo.MF) &&
(Subtarget->isTargetMachO() || RelocM == Reloc::Static)) {
unsigned Opc;
unsigned char TF = 0;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 0caf4bf..a67b360 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -39,6 +39,10 @@ static MachineBasicBlock::iterator
skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs);
+ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
+ : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+ STI(sti) {}
+
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -220,7 +224,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetMachO()) {
+ if (STI.isTargetDarwin()) {
GPRCS2Size += 4;
break;
}
@@ -380,7 +384,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetMachO())
+ if (STI.isTargetDarwin())
break;
// fallthrough
case ARM::R0:
@@ -445,7 +449,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::R10:
case ARM::R11:
case ARM::R12:
- if (STI.isTargetMachO()) {
+ if (STI.isTargetDarwin()) {
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned Offset = MFI->getObjectOffset(FI);
unsigned CFIIndex = MMI.addFrameInst(
@@ -810,7 +814,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
unsigned LastReg = 0;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetMachO())) continue;
+ if (!(Func)(Reg, STI.isTargetDarwin())) continue;
// D-registers in the aligned area DPRCS2 are NOT spilled here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -888,7 +892,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool DeleteRet = false;
for (; i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
- if (!(Func)(Reg, STI.isTargetMachO())) continue;
+ if (!(Func)(Reg, STI.isTargetDarwin())) continue;
// The aligned reloads from area DPRCS2 are not inserted here.
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
@@ -1438,7 +1442,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
if (Spilled) {
NumGPRSpills++;
- if (!STI.isTargetMachO()) {
+ if (!STI.isTargetDarwin()) {
if (Reg == ARM::LR)
LRSpilled = true;
CS1Spilled = true;
@@ -1460,7 +1464,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
break;
}
} else {
- if (!STI.isTargetMachO()) {
+ if (!STI.isTargetDarwin()) {
UnspilledCS1GPRs.push_back(Reg);
continue;
}
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 981d320..709afbc 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -14,7 +14,6 @@
#ifndef ARM_FRAMEINFO_H
#define ARM_FRAMEINFO_H
-#include "ARMSubtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -25,10 +24,7 @@ protected:
const ARMSubtarget &STI;
public:
- explicit ARMFrameLowering(const ARMSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
- STI(sti) {
- }
+ explicit ARMFrameLowering(const ARMSubtarget &sti);
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 08d598d..38547cf 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -60,22 +60,17 @@ enum AddrMode2Type {
};
class ARMDAGToDAGISel : public SelectionDAGISel {
- ARMBaseTargetMachine &TM;
-
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
public:
- explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
- CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm),
- Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
- }
+ explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
- Subtarget = &TM.getSubtarget<ARMSubtarget>();
+ Subtarget = &MF.getTarget().getSubtarget<ARMSubtarget>();
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
@@ -429,8 +424,8 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (Use->getOpcode() == ISD::CopyToReg)
return true;
if (Use->isMachineOpcode()) {
- const ARMBaseInstrInfo *TII =
- static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+ const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+ CurDAG->getTarget().getInstrInfo());
const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
if (MCID.mayStore())
@@ -2444,7 +2439,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
- if (Subtarget->useMovt())
+ if (Subtarget->useMovt(*MF))
// Thumb2-aware targets have the MOVT instruction, so all immediates can
// be done with MOV + MOVT, at worst.
UseCP = false;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 00d07e8..4bfa5a8 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -155,16 +155,16 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
-static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
- if (TM.getSubtarget<ARMSubtarget>().isTargetMachO())
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO())
return new TargetLoweringObjectFileMachO();
- if (TM.getSubtarget<ARMSubtarget>().isTargetWindows())
+ if (TT.isOSWindows())
return new TargetLoweringObjectFileCOFF();
return new ARMElfTargetObjectFile();
}
ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)) {
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
Subtarget = &TM.getSubtarget<ARMSubtarget>();
RegInfo = TM.getRegisterInfo();
Itins = TM.getInstrItineraryData();
@@ -710,7 +710,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setExceptionSelectorRegister(ARM::R1);
}
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
@@ -983,6 +987,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
+
case ARMISD::VCEQ: return "ARMISD::VCEQ";
case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
case ARMISD::VCGE: return "ARMISD::VCGE";
@@ -1199,7 +1205,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
case CallingConv::C:
if (!Subtarget->isAAPCS_ABI())
return CallingConv::ARM_APCS;
- else if (Subtarget->hasVFP2() &&
+ else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
!isVarArg)
return CallingConv::ARM_AAPCS_VFP;
@@ -1207,10 +1213,10 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
return CallingConv::ARM_AAPCS;
case CallingConv::Fast:
if (!Subtarget->isAAPCS_ABI()) {
- if (Subtarget->hasVFP2() && !isVarArg)
+ if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::Fast;
return CallingConv::ARM_APCS;
- } else if (Subtarget->hasVFP2() && !isVarArg)
+ } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
return CallingConv::ARM_AAPCS_VFP;
else
return CallingConv::ARM_AAPCS;
@@ -1598,8 +1604,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
if (EnableARMLongCalls) {
- assert (getTargetMachine().getRelocationModel() == Reloc::Static
- && "long-calls with non-static relocation model!");
+ assert((Subtarget->isTargetWindows() ||
+ getTargetMachine().getRelocationModel() == Reloc::Static) &&
+ "long-calls with non-static relocation model!");
// Handle a global address or an external symbol. If it's not one of
// those, the target's already in a register, so we don't need to do
// anything extra.
@@ -1647,6 +1654,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+ } else if (Subtarget->isTargetCOFF()) {
+ assert(Subtarget->isTargetWindows() &&
+ "Windows is the only supported COFF target");
+ unsigned TargetFlags = GV->hasDLLImportStorageClass()
+ ? ARMII::MO_DLLIMPORT
+ : ARMII::MO_NO_FLAG;
+ Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
+ TargetFlags);
+ if (GV->hasDLLImportStorageClass())
+ Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
+ Callee), MachinePointerInfo::getGOT(),
+ false, false, false, 0);
} else {
// On ELF targets for PIC code, direct calls should go through the PLT
unsigned OpFlags = 0;
@@ -1688,7 +1708,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// FIXME: handle tail calls differently.
unsigned CallOpc;
- bool HasMinSizeAttr = Subtarget->isMinSize();
+ bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize);
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
@@ -2326,7 +2347,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
- DAG.getExternalSymbol("__tls_get_addr", PtrVT), &Args, 0);
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args),
+ 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2434,7 +2456,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
// If we have T2 ops, we can materialize the address directly via movt/movw
// pair. This is always cheaper.
- if (Subtarget->useMovt()) {
+ if (Subtarget->useMovt(DAG.getMachineFunction())) {
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
@@ -2456,7 +2478,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
Reloc::Model RelocM = getTargetMachine().getRelocationModel();
- if (Subtarget->useMovt())
+ if (Subtarget->useMovt(DAG.getMachineFunction()))
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
@@ -2476,18 +2498,27 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
- assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt");
+ assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+ "Windows on ARM expects to use movw/movt");
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const ARMII::TOF TargetFlags =
+ (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
EVT PtrVT = getPointerTy();
+ SDValue Result;
SDLoc DL(Op);
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
- return DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT));
+ Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+ TargetFlags));
+ if (GV->hasDLLImportStorageClass())
+ Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, false, 0);
+ return Result;
}
SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
@@ -2535,6 +2566,11 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::arm_rbit: {
+ assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+ "RBIT intrinsic must have i32 type!");
+ return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(0));
+ }
case Intrinsic::arm_thread_pointer: {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
@@ -4492,6 +4528,11 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
BitMask <<= 8;
ImmMask <<= 1;
}
+
+ if (DAG.getTargetLoweringInfo().isBigEndian())
+    // Swap the higher and lower 32-bit words.
+ Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
// Op=1, Cmode=1110.
OpCmode = 0x1e;
VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
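A worked instance of the swap with an illustrative mask: Imm is an 8-bit
byte-enable mask over the i64, so exchanging its two nibbles exchanges the two
32-bit words of the modified immediate:

    constexpr unsigned swapImmWords(unsigned Imm) {
      return ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
    }
    static_assert(swapImmWords(0x0f) == 0xf0,
                  "bytes 0-3 selected -> bytes 4-7 selected");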
@@ -6078,7 +6119,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
- &Args, 0)
+ std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
@@ -6213,6 +6254,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ llvm_unreachable("Don't know how to custom lower this!");
}
}
@@ -7112,6 +7157,73 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
}
MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ const TargetMachine &TM = getTargetMachine();
+ const TargetInstrInfo &TII = *TM.getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ assert(Subtarget->isTargetWindows() &&
+ "__chkstk is only supported on Windows");
+ assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+ // __chkstk takes the number of words to allocate on the stack in R4, and
+ // returns the stack adjustment in number of bytes in R4. This will not
+  // clobber any other registers (other than the obvious lr).
+ //
+ // Although, technically, IP should be considered a register which may be
+ // clobbered, the call itself will not touch it. Windows on ARM is a pure
+ // thumb-2 environment, so there is no interworking required. As a result, we
+ // do not expect a veneer to be emitted by the linker, clobbering IP.
+ //
+ // Each module receives its own copy of __chkstk, so no import thunk is
+ // required, again, ensuring that IP is not clobbered.
+ //
+ // Finally, although some linkers may theoretically provide a trampoline for
+ // out of range calls (which is quite common due to a 32M range limitation of
+ // branches for Thumb), we can generate the long-call version via
+ // -mcmodel=large, alleviating the need for the trampoline which may clobber
+ // IP.
+
+ switch (TM.getCodeModel()) {
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Default:
+ case CodeModel::Kernel:
+ BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ break;
+ case CodeModel::Large:
+ case CodeModel::JITDefault: {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+
+ BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
+ .addExternalSymbol("__chkstk");
+ BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addReg(Reg, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ break;
+ }
+ }
+
+ AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
+ ARM::SP)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::R4, RegState::Kill)));
+
+ MI->eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -7360,6 +7472,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case ARM::COPY_STRUCT_BYVAL_I32:
++NumLoopByVals;
return EmitStructByval(MI, BB);
+ case ARM::WIN__CHKSTK:
+ return EmitLowered__chkstk(MI, BB);
}
}
@@ -8315,6 +8429,8 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
std::min(4U, LD->getAlignment() / 2));
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+ if (DCI.DAG.getTargetLoweringInfo().isBigEndian())
+      std::swap(NewLD1, NewLD2);
SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
DCI.RemoveFromWorklist(LD);
DAG.DeleteNode(LD);
@@ -8382,7 +8498,8 @@ static SDValue PerformSTORECombine(SDNode *N,
SDLoc DL(St);
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = 0; i < NumElems; ++i)
+ ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
// Can't shuffle using an illegal type.
if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
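Concretely, with NumElems = 4 and SizeRatio = 2 (say, a v4i32 value stored as
v4i16 and viewed through a v8i16 wide vector), little-endian selects the even
sub-elements while big-endian selects the odd ones, since the truncated value
lives in the second half of each element there. A hedged sketch of just the
index computation:

    void buildTruncStoreShuffle(bool BigEndian, int ShuffleVec[4]) {
      const unsigned SizeRatio = 2; // i32 narrowed to i16
      for (unsigned i = 0; i < 4; ++i)
        ShuffleVec[i] = BigEndian ? (i + 1) * SizeRatio - 1 // {1, 3, 5, 7}
                                  : i * SizeRatio;          // {0, 2, 4, 6}
    }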
@@ -10471,13 +10588,39 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return CallInfo.first;
}
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetWindows() && "unsupported target platform");
+ SDLoc DL(Op);
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
+ DAG.getConstant(2, MVT::i32));
+
+ SDValue Flag;
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
+ Flag = Chain.getValue(1);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue);
+ Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
+
+ SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+ Chain = NewSP.getValue(1);
+
+ SDValue Ops[2] = { NewSP, Chain };
+ return DAG.getMergeValues(Ops, DL);
+}
+
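Putting the two halves of the patch together, a hedged walk-through for an
illustrative 1 KiB dynamic allocation: the lowering above passes a word count
to __chkstk, and the custom inserter earlier emits the call plus the final SP
adjustment.

    constexpr unsigned SizeInBytes = 1024;           // illustrative alloca size
    constexpr unsigned WordsInR4 = SizeInBytes >> 2; // 256, copied into r4
    static_assert(WordsInR4 == 256,
                  "__chkstk takes a word count and returns bytes, both in r4");
    // After the call, the expansion emits: sub.w sp, sp, r4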
bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The ARM target isn't yet aware of offsets.
@@ -10635,14 +10778,20 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
- // things go wrong:
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64;
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
- return LI->getType()->getPrimitiveSizeInBits() == 64;
-
- // For the real atomic operations, we have ldrex/strex up to 64 bits.
- return Inst->getType()->getPrimitiveSizeInBits() <= 64;
+  // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
+ // anything for those.
+ bool IsMClass = Subtarget->isMClass();
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+ return Size == 64 && !IsMClass;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass;
+ }
+
+ // For the real atomic operations, we have ldrex/strex up to 32 bits,
+  // and up to 64 bits on the non-M profiles.
+ unsigned AtomicLimit = IsMClass ? 32 : 64;
+ return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit;
}
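Restated as a standalone predicate (a hedged paraphrase, not an LLVM API):
ldrex/strex cover up to 32 bits on every profile, and ldrexd/strexd extend
that to 64 bits only off the M profile.

    unsigned maxInlineAtomicBits(bool IsMClass) {
      return IsMClass ? 32 : 64; // anything wider defers to the libcall path
    }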
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index c15305c..1ace0f3 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -95,6 +95,8 @@ namespace llvm {
PRELOAD, // Preload
+ WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
+
VCEQ, // Vector compare equal.
VCEQZ, // Vector compare equal to zero.
VCGE, // Vector compare greater than or equal.
@@ -470,6 +472,7 @@ namespace llvm {
const ARMSubtarget *ST) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
unsigned getRegisterByName(const char* RegName, EVT VT) const override;
@@ -578,6 +581,9 @@ namespace llvm {
MachineBasicBlock *EmitStructByval(MachineInstr *MI,
MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
};
enum NEONModImmType {
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 718d5da..2bb8976 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -270,8 +270,8 @@ def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
// FIXME: Eventually this will be just "hasV6T2Ops".
-def UseMovt : Predicate<"Subtarget->useMovt()">;
-def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
+def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
+def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
@@ -493,7 +493,7 @@ def neon_vcvt_imm32 : Operand<i32> {
// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
def rot_imm_XFORM: SDNodeXForm<imm, [{
switch (N->getZExtValue()){
- default: assert(0);
+ default: llvm_unreachable("Unexpected rotation amount");
case 0: return CurDAG->getTargetConstant(0, MVT::i32);
case 8: return CurDAG->getTargetConstant(1, MVT::i32);
case 16: return CurDAG->getTargetConstant(2, MVT::i32);
@@ -594,7 +594,7 @@ def so_imm2part : PatLeaf<(imm), [{
/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
///
def arm_i32imm : PatLeaf<(imm), [{
- if (Subtarget->useMovt())
+ if (Subtarget->useMovt(*MF))
return true;
return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
}]>;
@@ -3334,8 +3334,8 @@ def SBFX : I<(outs GPRnopc:$Rd),
let Inst{3-0} = Rn;
}
-def UBFX : I<(outs GPR:$Rd),
- (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width),
+def UBFX : I<(outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
Requires<[IsARM, HasV6T2]> {
@@ -4443,7 +4443,7 @@ def instsyncb_opt : Operand<i32> {
let DecoderMethod = "DecodeInstSyncBarrierOption";
}
-// memory barriers protect the atomic sequences
+// Memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
"dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
@@ -4452,7 +4452,6 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
let Inst{31-4} = 0xf57ff05;
let Inst{3-0} = opt;
}
-}
def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
"dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
@@ -4464,12 +4463,13 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
// ISB has only full system option
def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
- "isb", "\t$opt", []>,
+ "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff06;
let Inst{3-0} = opt;
}
+}
let usesCustomInserter = 1, Defs = [CPSR] in {
@@ -5093,6 +5093,19 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
let Inst{11-0} = a;
}
+// Dynamic stack allocation yields a __chkstk call for Windows targets. These
+// calls are needed to probe the stack when allocating more than 4K bytes in
+// one go. Touching the stack at 4K increments is necessary to ensure that the
+// guard pages used by the OS virtual memory manager are allocated in the
+// correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects it has (compared to ordinary calls), such as the stack pointer
+// change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+ def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+
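For intuition, the probing __chkstk performs is morally equivalent to the C++ sketch below; treat it as an assumption about the routine's behaviour, since the real implementation ships with the Windows runtime and consumes the word count passed in r4.

// Touch one byte in each 4K page of the new allocation, in order, so the OS
// virtual memory manager can commit its guard pages one page at a time.
void probeStack(volatile char *SP, unsigned Bytes) {
  const unsigned PageSize = 4096;
  for (unsigned Off = PageSize; Off <= Bytes; Off += PageSize)
    SP[-static_cast<long>(Off)] = 0;
}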
//===----------------------------------------------------------------------===//
// TLS Instructions
//
@@ -5100,9 +5113,11 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
// __aeabi_read_tp preserves the registers r1-r3.
// This is a pseudo inst so that we can get the encoding right,
// complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only; for Thumb mode, a tTPsoft pattern is
+// defined in "ARMInstrThumb.td".
let isCall = 1,
Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
- def TPsoft : PseudoInst<(outs), (ins), IIC_Br,
+ def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
[(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
}
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b32b5d2..c02bb3b 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -6372,6 +6372,32 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
dsub_0)>;
}
+// The following class definition is basically a copy of the
+// Lengthen_HalfSingle definition above, but with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This converts
+// data loaded by VLD1LN into the proper vector format in big-endian mode.
+multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string InsnLanes, string InsnTy, string RevLanes> {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)>;
+}
+
// extload, zextload and sextload for a lengthening load followed by another
// lengthening load, to quadruple the initial length.
//
@@ -6406,6 +6432,36 @@ multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
dsub_0))>;
}
+// The following class definition is basically a copy of the
+// Lengthen_Double definition above, but with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This converts
+// data loaded by VLD1LN into the proper vector format in big-endian mode.
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty, string RevLanes> {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+ (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV32d" # RevLanes)
+ (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0))>;
+}
+
// extload, zextload and sextload for a lengthening load followed by another
// lengthening load, to quadruple the initial length, but which ends up only
// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
@@ -6443,33 +6499,102 @@ multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
dsub_0)>;
}
+// The following class definition is basically a copy of the
+// Lengthen_HalfDouble definition above, but with an additional VREV16d8
+// instruction to convert data loaded by VLD1LN into the proper vector
+// format in big-endian mode.
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty> {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+ dsub_0)),
+ dsub_0)>;
+}
+
defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
-defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
-defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
+let Predicates = [IsLE] in {
+ defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
+ defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
-// Double lengthening - v4i8 -> v4i16 -> v4i32
-defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
-// v2i8 -> v2i16 -> v2i32
-defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
-// v2i16 -> v2i32 -> v2i64
-defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+ // Double lengthening - v4i8 -> v4i16 -> v4i32
+ defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i8 -> v2i16 -> v2i32
+ defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i16 -> v2i32 -> v2i64
+ defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+}
+
+let Predicates = [IsBE] in {
+ defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16
+ defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32
+
+ // Double lengthening - v4i8 -> v4i16 -> v4i32
+ defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">;
+ // v2i8 -> v2i16 -> v2i32
+ defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">;
+ // v2i16 -> v2i32 -> v2i64
+ defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">;
+}
// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
-def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
- (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
- (VLD1LNd16 addrmode6:$addr,
- (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
- (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
- (VLD1LNd16 addrmode6:$addr,
- (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
-def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
- (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
- (VLD1LNd16 addrmode6:$addr,
- (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+let Predicates = [IsLE] in {
+ def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+ def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+ def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+ (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+}
+// The following patterns are basically a copy of the patterns above,
+// but with an additional VREV16d8 instruction to convert data loaded
+// by VLD1LN into the proper vector format in big-endian mode.
+let Predicates = [IsBE] in {
+ def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+ def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+ (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+ def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+ (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+ (!cast<Instruction>("VREV16d8")
+ (VLD1LNd16 addrmode6:$addr,
+ (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+}
//===----------------------------------------------------------------------===//
// Assembler aliases
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index c30d6ab..85e9351 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3209,27 +3209,28 @@ def t2MOVCCi32imm
let hasSideEffects = 1 in {
def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
"dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
- Requires<[HasDB]> {
+ Requires<[IsThumb, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f5;
let Inst{3-0} = opt;
}
-}
def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
"dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
- Requires<[HasDB]> {
+ Requires<[IsThumb, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f4;
let Inst{3-0} = opt;
}
def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
- "isb", "\t$opt", []>, Requires<[HasDB]> {
+ "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+ Requires<[IsThumb, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f6;
let Inst{3-0} = opt;
}
+}
class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
InstrItinClass itin, string opc, string asm, string cstr,
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index 8821c2d..6d1114d 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -13,6 +13,7 @@
#include "ARMJITInfo.h"
#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
#include "ARMRelocations.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
@@ -334,3 +335,10 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
}
}
}
+
+void ARMJITInfo::Initialize(const MachineFunction &MF, bool isPIC) {
+ const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
+ JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
+ IsPIC = isPIC;
+}
diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
index ee4c863..27e2a20 100644
--- a/lib/Target/ARM/ARMJITInfo.h
+++ b/lib/Target/ARM/ARMJITInfo.h
@@ -14,7 +14,6 @@
#ifndef ARMJITINFO_H
#define ARMJITINFO_H
-#include "ARMMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -103,12 +102,7 @@ namespace llvm {
/// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize
/// jump table ids to jump table bases map; remember if codegen relocation
/// model is PIC.
- void Initialize(const MachineFunction &MF, bool isPIC) {
- const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
- ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
- JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
- IsPIC = isPIC;
- }
+ void Initialize(const MachineFunction &MF, bool isPIC);
/// getConstantPoolEntryAddr - The ARM target puts all constant
/// pool entries into constant islands. This returns the address of the
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index ee7df54..a03bcdb 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -505,7 +505,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// Exception: If the base register is in the input reglist, Thumb1 LDM is
// non-writeback. Check for this.
- if (Opcode == ARM::tLDRi && isThumb1)
+ if (Opcode == ARM::tLDMIA && isThumb1)
for (unsigned I = 0; I < NumRegs; ++I)
if (Base == Regs[I].first) {
Writeback = false;
@@ -519,17 +519,17 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
// Update tLDMIA with writeback if necessary.
Opcode = ARM::tLDMIA_UPD;
- // The base isn't dead after a merged instruction with writeback. Update
- // future uses of the base with the added offset (if possible), or reset
- // the base register as necessary.
- if (!BaseKill)
- UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
-
MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
// Thumb1: we might need to set base writeback when building the MI.
MIB.addReg(Base, getDefRegState(true))
.addReg(Base, getKillRegState(BaseKill));
+
+ // The base isn't dead after a merged instruction with writeback. Update
+ // future uses of the base with the added offset (if possible), or reset
+ // the base register as necessary.
+ if (!BaseKill)
+ UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
} else {
// No writeback, simply build the MachineInstr.
MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
@@ -1734,6 +1734,12 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
isThumb2 = AFI->isThumb2Function();
isThumb1 = AFI->isThumbFunction() && !isThumb2;
+ // FIXME: Temporarily disabling for Thumb-1 due to miscompiles
+ if (isThumb1) {
+ delete RS;
+ return false;
+ }
+
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 48141b1..023f5f8 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -34,7 +34,7 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
OutContext);
switch (Option) {
default: llvm_unreachable("Unknown target flag on symbol operand");
- case 0:
+ case ARMII::MO_NO_FLAG:
break;
case ARMII::MO_LO16:
Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index af445e2..892b269 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -12,3 +12,13 @@
using namespace llvm;
void ARMFunctionInfo::anchor() { }
+
+ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
+ : isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+ hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+ StByValParamsPadding(0), ArgRegsSaveSize(0), HasStackFrame(false),
+ RestoreSPFromFP(false), LRSpilledForFarJump(false),
+ FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+ GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0),
+ PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false),
+ GlobalBaseReg(0) {}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index d7ec6eb..44a9e34 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -130,16 +130,7 @@ public:
JumpTableUId(0), PICLabelUId(0),
VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
- explicit ARMFunctionInfo(MachineFunction &MF) :
- isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
- hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
- StByValParamsPadding(0),
- ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
- LRSpilledForFarJump(false),
- FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
- GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
- JumpTableUId(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
+ explicit ARMFunctionInfo(MachineFunction &MF);
bool isThumbFunction() const { return isThumb; }
bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
@@ -220,7 +211,7 @@ public:
void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
- assert(0 && "Duplicate entries!");
+ llvm_unreachable("Duplicate entries!");
}
unsigned getOriginalCPIdx(unsigned CloneIdx) const {
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 008ad64..3dcc0df 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -18,10 +18,8 @@ using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
-ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM)
- : TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
-}
+ARMSelectionDAGInfo::ARMSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
}
@@ -34,6 +32,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const {
+ const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
if ((Align & 3) != 0)
@@ -44,7 +43,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
if (!ConstantSize)
return SDValue();
uint64_t SizeVal = ConstantSize->getZExtValue();
- if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+ if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
unsigned BytesLeft = SizeVal & 3;
@@ -54,7 +53,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
unsigned VTSize = 4;
unsigned i = 0;
// Emit a maximum of 4 loads in Thumb1 since we have fewer registers
- const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 4 : 6;
+ const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6;
SDValue TFOps[6];
SDValue Loads[6];
uint64_t SrcOff = 0, DstOff = 0;
@@ -151,9 +150,10 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
SDValue Src, SDValue Size,
unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
+ const ARMSubtarget &Subtarget = DAG.getTarget().getSubtarget<ARMSubtarget>();
// Use default for non-AAPCS (or MachO) subtargets
- if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO() ||
- Subtarget->isTargetWindows())
+ if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() ||
+ Subtarget.isTargetWindows())
return SDValue();
const ARMTargetLowering &TLI =
@@ -191,7 +191,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
.setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET),
Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
- TLI.getPointerTy()), &Args, 0)
+ TLI.getPointerTy()), std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 8c2397b..13769dc 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -36,12 +36,8 @@ namespace ARM_AM {
} // end namespace ARM_AM
class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
- /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
- /// make the right decision when generating code for different targets.
- const ARMSubtarget *Subtarget;
-
public:
- explicit ARMSelectionDAGInfo(const TargetMachine &TM);
+ explicit ARMSelectionDAGInfo(const DataLayout &DL);
~ARMSelectionDAGInfo();
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 5b204f6..0eb24ef 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -12,8 +12,15 @@
//===----------------------------------------------------------------------===//
#include "ARMSubtarget.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMBaseRegisterInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -76,22 +83,89 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
"Allow IT blocks based on ARMv7"),
clEnumValEnd));
-ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool IsLittle,
- const TargetOptions &Options)
- : ARMGenSubtargetInfo(TT, CPU, FS)
- , ARMProcFamily(Others)
- , ARMProcClass(None)
- , stackAlignment(4)
- , CPUString(CPU)
- , IsLittle(IsLittle)
- , TargetTriple(TT)
- , Options(Options)
- , TargetABI(ARM_ABI_UNKNOWN) {
+static std::string computeDataLayout(ARMSubtarget &ST) {
+ std::string Ret = "";
+
+ if (ST.isLittle())
+ // Little endian.
+ Ret += "e";
+ else
+ // Big endian.
+ Ret += "E";
+
+ Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+
+ // Pointers are 32 bits and aligned to 32 bits.
+ Ret += "-p:32:32";
+
+ // On Thumb, i1, i8 and i16 have natural alignment requirements, but we try
+ // to align them to 32 bits.
+ if (ST.isThumb())
+ Ret += "-i1:8:32-i8:8:32-i16:16:32";
+
+ // ABIs other than APCS have 64 bit integers with natural alignment.
+ if (!ST.isAPCS_ABI())
+ Ret += "-i64:64";
+
+ // We have 64-bit floats. The APCS ABI requires them to be aligned to 32
+ // bits, other ABIs to 64 bits. We always try to align to 64 bits.
+ if (ST.isAPCS_ABI())
+ Ret += "-f64:32:64";
+
+ // We have 128- and 64-bit vectors. The APCS ABI aligns them to 32 bits,
+ // other ABIs to 64. We always try to give them natural alignment.
+ if (ST.isAPCS_ABI())
+ Ret += "-v64:32:64-v128:32:128";
+ else
+ Ret += "-v128:64:128";
+
+ // On Thumb and APCS, only try to align aggregates to 32 bits (the default is
+ // 64 bits).
+ if (ST.isThumb() || ST.isAPCS_ABI())
+ Ret += "-a:0:32";
+
+ // Integer registers are 32 bits.
+ Ret += "-n32";
+
+ // The stack is 128-bit aligned on NaCl, 64-bit aligned on AAPCS, and 32-bit
+ // aligned everywhere else.
+ if (ST.isTargetNaCl())
+ Ret += "-S128";
+ else if (ST.isAAPCS_ABI())
+ Ret += "-S64";
+ else
+ Ret += "-S32";
+
+ return Ret;
+}
+
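Reading the branches off for one concrete configuration, a little-endian Thumb subtarget using the AAPCS ABI with ELF name mangling would produce the string below. This is derived by hand from the code above, so treat it as a sketch rather than authoritative output.

// "e"                            little endian
// "-m:e"                         ELF name mangling
// "-p:32:32"                     32-bit pointers, 32-bit aligned
// "-i1:8:32-i8:8:32-i16:16:32"   Thumb small-integer alignment
// "-i64:64"                      natural i64 alignment (non-APCS)
// "-v128:64:128"                 vector alignment (non-APCS)
// "-a:0:32"                      aggregates to 32 bits (Thumb)
// "-n32"                         32-bit integer registers
// "-S64"                         64-bit stack alignment (AAPCS)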
+/// initializeSubtargetDependencies - Initializes using a CPU and feature string
+/// so that we can use initializer lists for subtarget initialization.
+ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
+ return *this;
}
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, TargetMachine &TM,
+ bool IsLittle, const TargetOptions &Options)
+ : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
+ ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
+ TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN),
+ DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
+ TSInfo(DL), JITInfo(),
+ InstrInfo(isThumb1Only()
+ ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
+ : !isThumb()
+ ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
+ : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
+ TLInfo(TM),
+ FrameLowering(!isThumb1Only()
+ ? new ARMFrameLowering(*this)
+ : (ARMFrameLowering *)new Thumb1FrameLowering(*this)) {}
+
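The constructor relies on the usual "initialize dependencies first" idiom: a member initializer calls a helper that finishes setting up *this before the dependent members (DL, TSInfo, InstrInfo, ...) are constructed. A minimal standalone sketch of the pattern, with hypothetical names:

#include <string>

struct Widget {
  // parseSpec() runs inside Layout's initializer, so Layout is computed
  // from an already-parsed object. Members initialize in declaration order.
  Widget(const std::string &Spec) : Layout(computeLayout(parseSpec(Spec))) {}

private:
  Widget &parseSpec(const std::string &Spec) {
    BigEndian = !Spec.empty() && Spec[0] == 'E';
    return *this; // hand back *this so the next initializer can use it
  }
  static std::string computeLayout(Widget &W) {
    return W.BigEndian ? "E-p:32:32" : "e-p:32:32";
  }

  bool BigEndian = false; // declared before Layout, so initialized first
  std::string Layout;
};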
void ARMSubtarget::initializeEnvironment() {
HasV4TOps = false;
HasV5TOps = false;
@@ -106,7 +180,6 @@ void ARMSubtarget::initializeEnvironment() {
HasVFPv4 = false;
HasFPARMv8 = false;
HasNEON = false;
- MinSize = false;
UseNEONForSinglePrecisionFP = false;
UseMulOps = UseFusedMulOps;
SlowFPVMLx = false;
@@ -158,9 +231,6 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
}
-
- MinSize =
- FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
}
void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
@@ -353,6 +423,17 @@ bool ARMSubtarget::hasSinCos() const {
!getTargetTriple().isOSVersionLT(7, 0);
}
+// Enable the PostMachineScheduler if the target selects it instead of
+// PostRAScheduler. Currently only available on the command line via
+// -misched-postra.
+bool ARMSubtarget::enablePostMachineScheduler() const {
+ return PostRAScheduler;
+}
+
+bool ARMSubtarget::enableAtomicExpandLoadLinked() const {
+ return hasAnyDataBarrier() && !isThumb1Only();
+}
+
bool ARMSubtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
@@ -360,3 +441,12 @@ bool ARMSubtarget::enablePostRAScheduler(
Mode = TargetSubtargetInfo::ANTIDEP_NONE;
return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
}
+
+bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+ // NOTE: Windows on ARM needs to use movw/movt pairs to materialise 32-bit
+ // immediates, as the code is inherently position independent and may
+ // otherwise be out of range.
+ return UseMovt && (isTargetWindows() ||
+ !MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::MinSize));
+}
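The resulting policy, as a sketch (the Windows row restates the NOTE above; the minsize row assumes the usual trade-off that a movw/movt pair costs more code size than a literal-pool load):

// Configuration                    useMovt(MF)
// -------------                    -----------
// UseMovt feature disabled         false everywhere
// Windows on ARM                   true, even for minsize functions
// elsewhere, minsize function      false (favour smaller materialisation)
// elsewhere, any other function    true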
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 38536b2..8f6c165 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -14,8 +14,20 @@
#ifndef ARMSUBTARGET_H
#define ARMSUBTARGET_H
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "ARMJITInfo.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -64,10 +76,6 @@ protected:
bool HasFPARMv8;
bool HasNEON;
- /// MinSize - True if the function being compiled has the "minsize" attribute
- /// and should be optimised for size at the expense of speed.
- bool MinSize;
-
/// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
/// specified. Use the method useNEONForSinglePrecisionFP() to
/// determine if NEON should actually be used.
@@ -236,7 +244,7 @@ protected:
/// of the specified triple.
///
ARMSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool IsLittle,
+ const std::string &FS, TargetMachine &TM, bool IsLittle,
const TargetOptions &Options);
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
@@ -250,7 +258,31 @@ protected:
/// \brief Reset the features for the ARM target.
void resetSubtargetFeatures(const MachineFunction *MF) override;
+
+ /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+ /// so that we can use initializer lists for subtarget initialization.
+ ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ const DataLayout *getDataLayout() const { return &DL; }
+ const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ ARMJITInfo *getJITInfo() { return &JITInfo; }
+ const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
+ const ARMTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const ARMFrameLowering *getFrameLowering() const { return FrameLowering.get(); }
+ const ARMBaseRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo->getRegisterInfo();
+ }
+
private:
+ const DataLayout DL;
+ ARMSelectionDAGInfo TSInfo;
+ ARMJITInfo JITInfo;
+ // Either ARMInstrInfo, Thumb1InstrInfo or Thumb2InstrInfo.
+ std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
+ ARMTargetLowering TLInfo;
+ // Either Thumb1FrameLowering or ARMFrameLowering.
+ std::unique_ptr<ARMFrameLowering> FrameLowering;
+
void initializeEnvironment();
void resetSubtargetFeatures(StringRef CPU, StringRef FS);
public:
@@ -286,7 +318,6 @@ public:
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
bool hasVirtualization() const { return HasVirtualization; }
- bool isMinSize() const { return MinSize; }
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP; }
@@ -382,7 +413,8 @@ public:
bool isR9Reserved() const { return IsR9Reserved; }
- bool useMovt() const { return UseMovt && !isMinSize(); }
+ bool useMovt(const MachineFunction &MF) const;
+
bool supportsTailCall() const { return SupportsTailCall; }
bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
@@ -399,11 +431,17 @@ public:
/// compiler runtime or math libraries.
bool hasSinCos() const;
+ /// True for some subtargets at > -O0.
+ bool enablePostMachineScheduler() const;
+
/// enablePostRAScheduler - True at 'More' optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const override;
+ // enableAtomicExpandLoadLinked - True if we need to expand our atomics.
+ bool enableAtomicExpandLoadLinked() const override;
+
/// getInstrItins - Return the instruction itineraies based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 8876227..d85194b 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -28,6 +28,12 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
cl::desc("Inhibit optimization of S->D register accesses on A15"),
cl::init(false));
+static cl::opt<bool>
+EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
+
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget);
@@ -43,12 +49,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, isLittle, Options),
- JITInfo(),
- InstrItins(Subtarget.getInstrItineraryData()) {
+ CodeGenOpt::Level OL, bool isLittle)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this, isLittle, Options) {
// Default to triple-appropriate float ABI
if (Options.FloatABIType == FloatABI::Default)
@@ -67,74 +70,11 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
void ARMTargetMachine::anchor() { }
-static std::string computeDataLayout(ARMSubtarget &ST) {
- std::string Ret = "";
-
- if (ST.isLittle())
- // Little endian.
- Ret += "e";
- else
- // Big endian.
- Ret += "E";
-
- Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
-
- // Pointers are 32 bits and aligned to 32 bits.
- Ret += "-p:32:32";
-
- // On thumb, i16,i18 and i1 have natural aligment requirements, but we try to
- // align to 32.
- if (ST.isThumb())
- Ret += "-i1:8:32-i8:8:32-i16:16:32";
-
- // ABIs other than APCS have 64 bit integers with natural alignment.
- if (!ST.isAPCS_ABI())
- Ret += "-i64:64";
-
- // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
- // bits, others to 64 bits. We always try to align to 64 bits.
- if (ST.isAPCS_ABI())
- Ret += "-f64:32:64";
-
- // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
- // to 64. We always ty to give them natural alignment.
- if (ST.isAPCS_ABI())
- Ret += "-v64:32:64-v128:32:128";
- else
- Ret += "-v128:64:128";
-
- // On thumb and APCS, only try to align aggregates to 32 bits (the default is
- // 64 bits).
- if (ST.isThumb() || ST.isAPCS_ABI())
- Ret += "-a:0:32";
-
- // Integer registers are 32 bits.
- Ret += "-n32";
-
- // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
- // aligned everywhere else.
- if (ST.isTargetNaCl())
- Ret += "-S128";
- else if (ST.isAAPCS_ABI())
- Ret += "-S64";
- else
- Ret += "-S32";
-
- return Ret;
-}
-
-ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
+ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle),
- InstrInfo(Subtarget),
- DL(computeDataLayout(Subtarget)),
- TLInfo(*this),
- TSInfo(*this),
- FrameLowering(Subtarget) {
+ CodeGenOpt::Level OL, bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
initAsmInfo();
if (!Subtarget.hasARMOps())
report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
@@ -143,21 +83,21 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
void ARMLETargetMachine::anchor() { }
-ARMLETargetMachine::
-ARMLETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+ARMLETargetMachine::ARMLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
void ARMBETargetMachine::anchor() { }
-ARMBETargetMachine::
-ARMBETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+ARMBETargetMachine::ARMBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
void ThumbTargetMachine::anchor() { }
@@ -165,38 +105,29 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle),
- InstrInfo(Subtarget.hasThumb2()
- ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
- : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
- DL(computeDataLayout(Subtarget)),
- TLInfo(*this),
- TSInfo(*this),
- FrameLowering(Subtarget.hasThumb2()
- ? new ARMFrameLowering(Subtarget)
- : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
+ CodeGenOpt::Level OL, bool isLittle)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL,
+ isLittle) {
initAsmInfo();
}
void ThumbLETargetMachine::anchor() { }
-ThumbLETargetMachine::
-ThumbLETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
void ThumbBETargetMachine::anchor() { }
-ThumbBETargetMachine::
-ThumbBETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
/// ARM Code Generator Pass Configuration Options.
@@ -213,6 +144,7 @@ public:
return *getARMTargetMachine().getSubtargetImpl();
}
+ void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addPreRegAlloc() override;
@@ -225,11 +157,21 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(this, PM);
}
-bool ARMPassConfig::addPreISel() {
+void ARMPassConfig::addIRPasses() {
+ addPass(createAtomicExpandLoadLinkedPass(TM));
+
+ // Cmpxchg instructions are often used with a subsequent comparison to
+ // determine whether it succeeded. We can exploit existing control-flow in
+ // ldrex/strex loops to simplify this, but it needs tidying up.
const ARMSubtarget *Subtarget = &getARMSubtarget();
if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
- addPass(createAtomicExpandLoadLinkedPass(TM));
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+ addPass(createCFGSimplificationPass());
+ TargetPassConfig::addIRPasses();
+}
+
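The source pattern this ordering targets looks roughly like the sketch below; __atomic_compare_exchange_n is the Clang/GCC builtin and -arm-atomic-cfg-tidy is the flag defined earlier in this file, so only the example function itself is hypothetical.

// After the ldrex/strex expansion, the loop already branches on whether the
// store-exclusive succeeded; running SimplifyCFG lets the 'if' below fold
// into that existing control flow instead of re-testing a returned flag.
bool tryLock(int *Lock) {
  int Expected = 0;
  bool Ok = __atomic_compare_exchange_n(Lock, &Expected, /*Desired=*/1,
                                        /*Weak=*/false, __ATOMIC_ACQUIRE,
                                        __ATOMIC_RELAXED);
  if (Ok)   // this test is what the extra SimplifyCFG run cleans up
    return true;
  return false;
}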
+bool ARMPassConfig::addPreISel() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createGlobalMergePass(TM));
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 664c992..b72b1df 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -14,17 +14,9 @@
#ifndef ARMTARGETMACHINE_H
#define ARMTARGETMACHINE_H
-#include "ARMFrameLowering.h"
-#include "ARMISelLowering.h"
#include "ARMInstrInfo.h"
-#include "ARMJITInfo.h"
-#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
-#include "Thumb1FrameLowering.h"
-#include "Thumb1InstrInfo.h"
-#include "Thumb2InstrInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -32,10 +24,6 @@ namespace llvm {
class ARMBaseTargetMachine : public LLVMTargetMachine {
protected:
ARMSubtarget Subtarget;
-private:
- ARMJITInfo JITInfo;
- InstrItineraryData InstrItins;
-
public:
ARMBaseTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
@@ -44,15 +32,29 @@ public:
CodeGenOpt::Level OL,
bool isLittle);
- ARMJITInfo *getJITInfo() override { return &JITInfo; }
const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+ const ARMBaseRegisterInfo *getRegisterInfo() const override {
+ return getSubtargetImpl()->getRegisterInfo();
+ }
const ARMTargetLowering *getTargetLowering() const override {
- // Implemented by derived classes
- llvm_unreachable("getTargetLowering not implemented");
+ return getSubtargetImpl()->getTargetLowering();
+ }
+ const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return getSubtargetImpl()->getSelectionDAGInfo();
+ }
+ const ARMBaseInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const ARMFrameLowering *getFrameLowering() const override {
+ return getSubtargetImpl()->getFrameLowering();
}
const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ return &getSubtargetImpl()->getInstrItineraryData();
}
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
+ ARMJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
/// \brief Register ARM analysis passes with a pass manager.
void addAnalysisPasses(PassManagerBase &PM) override;
@@ -67,35 +69,10 @@ public:
///
class ARMTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
- ARMInstrInfo InstrInfo;
- const DataLayout DL; // Calculates type size & alignment
- ARMTargetLowering TLInfo;
- ARMSelectionDAGInfo TSInfo;
- ARMFrameLowering FrameLowering;
public:
- ARMTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle);
-
- const ARMRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
- }
-
- const ARMTargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
- }
- const ARMFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
- const ARMInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const DataLayout *getDataLayout() const override { return &DL; }
+ ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
};
/// ARMLETargetMachine - ARM little endian target machine.
@@ -114,10 +91,9 @@ public:
class ARMBETargetMachine : public ARMTargetMachine {
void anchor() override;
public:
- ARMBETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
};
/// ThumbTargetMachine - Thumb target machine.
@@ -126,43 +102,10 @@ public:
///
class ThumbTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
- // Either Thumb1InstrInfo or Thumb2InstrInfo.
- std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
- const DataLayout DL; // Calculates type size & alignment
- ARMTargetLowering TLInfo;
- ARMSelectionDAGInfo TSInfo;
- // Either Thumb1FrameLowering or ARMFrameLowering.
- std::unique_ptr<ARMFrameLowering> FrameLowering;
public:
- ThumbTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle);
-
- /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
- const ARMBaseRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
- }
-
- const ARMTargetLowering *getTargetLowering() const override {
- return &TLInfo;
- }
-
- const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
- }
-
- /// returns either Thumb1InstrInfo or Thumb2InstrInfo
- const ARMBaseInstrInfo *getInstrInfo() const override {
- return InstrInfo.get();
- }
- /// returns either Thumb1FrameLowering or ARMFrameLowering
- const ARMFrameLowering *getFrameLowering() const override {
- return FrameLowering.get();
- }
- const DataLayout *getDataLayout() const override { return &DL; }
+ ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
};
/// ThumbLETargetMachine - Thumb little endian target machine.
@@ -170,10 +113,10 @@ public:
class ThumbLETargetMachine : public ThumbTargetMachine {
void anchor() override;
public:
- ThumbLETargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
};
/// ThumbBETargetMachine - Thumb big endian target machine.
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 57df7da..a2ace62 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -443,31 +443,58 @@ unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only handle costs of reverse shuffles for now.
- if (Kind != SK_Reverse)
+ // We only handle costs of reverse and alternate shuffles for now.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
- // Reverse shuffle cost one instruction if we are shuffling within a double
- // word (vrev) or two if we shuffle a quad word (vrev, vext).
- { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
- { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
- };
+ if (Kind == SK_Reverse) {
+ static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+ // Reverse shuffle cost one instruction if we are shuffling within a
+ // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
- int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx == -1)
- return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx == -1)
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- return LT.first * NEONShuffleTbl[Idx].Cost;
+ return LT.first * NEONShuffleTbl[Idx].Cost;
+ }
+ if (Kind == SK_Alternate) {
+ static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+ // Alt shuffle cost table for ARM. Cost is the number of instructions
+ // required to create the shuffled vector.
+
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ int Idx =
+ CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx == -1)
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ return LT.first * NEONAltShuffleTbl[Idx].Cost;
+ }
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
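For reference, an "alternate" shuffle interleaves lanes from its two sources, e.g. mask <0, 5, 2, 7> for 4-element vectors. A scalar sketch of the selection rule (an assumption about the mask shape, since only the costs appear above):

// Out[i] takes lane i from A for even i and from B for odd i
// (the mirrored form swaps the roles of A and B).
template <typename T, unsigned N>
void alternateShuffle(const T (&A)[N], const T (&B)[N], T (&Out)[N]) {
  for (unsigned I = 0; I != N; ++I)
    Out[I] = (I % 2 == 0) ? A[I] : B[I];
}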
unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 5cdf394..b62706c 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -190,11 +190,11 @@ class ARMAsmParser : public MCTargetAsmParser {
}
int tryParseRegister();
- bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
- int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
- bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
- bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &);
- bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
+ bool tryParseRegisterWithWriteBack(OperandVector &);
+ int tryParseShiftRegister(OperandVector &);
+ bool parseRegisterList(OperandVector &);
+ bool parseMemory(OperandVector &);
+ bool parseOperand(OperandVector &, StringRef Mnemonic);
bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
unsigned &ShiftAmount);
@@ -282,54 +282,42 @@ class ARMAsmParser : public MCTargetAsmParser {
/// }
- OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseCoprocNumOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseCoprocRegOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseCoprocOptionOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseMemBarrierOptOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseInstSyncBarrierOptOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseProcIFlagsOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseMSRMaskOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O,
- StringRef Op, int Low, int High);
- OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+ OperandMatchResultTy parseITCondCode(OperandVector &);
+ OperandMatchResultTy parseCoprocNumOperand(OperandVector &);
+ OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
+ OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
+ OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
+ OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
+ OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
+ OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low,
+ int High);
+ OperandMatchResultTy parsePKHLSLImm(OperandVector &O) {
return parsePKHImm(O, "lsl", 0, 31);
}
- OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+ OperandMatchResultTy parsePKHASRImm(OperandVector &O) {
return parsePKHImm(O, "asr", 1, 32);
}
- OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy parseVectorList(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseSetEndImm(OperandVector &);
+ OperandMatchResultTy parseShifterImm(OperandVector &);
+ OperandMatchResultTy parseRotImm(OperandVector &);
+ OperandMatchResultTy parseBitfield(OperandVector &);
+ OperandMatchResultTy parsePostIdxReg(OperandVector &);
+ OperandMatchResultTy parseAM3Offset(OperandVector &);
+ OperandMatchResultTy parseFPImm(OperandVector &);
+ OperandMatchResultTy parseVectorList(OperandVector &);
OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index,
SMLoc &EndLoc);
// Asm Match Converter Methods
- void cvtThumbMultiply(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- void cvtThumbBranches(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
-
- bool validateInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
- bool processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
- bool shouldOmitCCOutOperand(StringRef Mnemonic,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- bool shouldOmitPredicateOperand(StringRef Mnemonic,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ void cvtThumbMultiply(MCInst &Inst, const OperandVector &);
+ void cvtThumbBranches(MCInst &Inst, const OperandVector &);
+
+ bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+ bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
+ bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+
public:
enum ARMMatchResultTy {
Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
@@ -361,19 +349,17 @@ public:
// Implementation of the MCTargetAsmParser interface:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- bool
- ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
void onLabelParsed(MCSymbol *Symbol) override;
};
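// For reference: OperandVector, used throughout the signatures above, is the
// owning alias this patch migrates to. In MCTargetAsmParser.h it is defined
// (approximately) as:
//
//   typedef SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> OperandVector;
//
// so callers no longer juggle raw MCParsedAsmOperand pointers; the vector
// owns and destroys its operands.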
@@ -545,8 +531,8 @@ class ARMOperand : public MCParsedAsmOperand {
struct BitfieldOp Bitfield;
};
- ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
public:
+ ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() {
Kind = o.Kind;
StartLoc = o.StartLoc;
@@ -2481,56 +2467,58 @@ public:
void print(raw_ostream &OS) const override;
- static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_ITCondMask);
+ static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_ITCondMask);
Op->ITMask.Mask = Mask;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
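// All Create* factories below share this shape: allocate with make_unique
// (llvm::make_unique from ADT/STLExtras.h, LLVM's pre-C++14 stand-in for
// std::make_unique), fill in the variant's fields, and return ownership to
// the caller. A minimal before/after sketch, using a hypothetical k_Foo kind:
//
//   ARMOperand *Op = new ARMOperand(k_Foo);    // before: raw pointer, every
//                                              // caller must remember delete
//   auto Op = make_unique<ARMOperand>(k_Foo);  // after: unique_ptr, destroyed
//                                              // automatically on all paths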
- static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_CondCode);
+ static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
+ SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_CondCode);
Op->CC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_CoprocNum);
+ static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_CoprocNum);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_CoprocReg);
+ static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_CoprocReg);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_CoprocOption);
+ static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_CoprocOption);
Op->Cop.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_CCOut);
+ static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_CCOut);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateToken(StringRef Str, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_Token);
+ static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -2538,20 +2526,20 @@ public:
return Op;
}
- static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_Register);
+ static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_Register);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateShiftedRegister(ARM_AM::ShiftOpc ShTy,
- unsigned SrcReg,
- unsigned ShiftReg,
- unsigned ShiftImm,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_ShiftedRegister);
+ static std::unique_ptr<ARMOperand>
+ CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+ unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_ShiftedRegister);
Op->RegShiftedReg.ShiftTy = ShTy;
Op->RegShiftedReg.SrcReg = SrcReg;
Op->RegShiftedReg.ShiftReg = ShiftReg;
@@ -2561,11 +2549,10 @@ public:
return Op;
}
- static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy,
- unsigned SrcReg,
- unsigned ShiftImm,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_ShiftedImmediate);
+ static std::unique_ptr<ARMOperand>
+ CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+ unsigned ShiftImm, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_ShiftedImmediate);
Op->RegShiftedImm.ShiftTy = ShTy;
Op->RegShiftedImm.SrcReg = SrcReg;
Op->RegShiftedImm.ShiftImm = ShiftImm;
@@ -2574,9 +2561,9 @@ public:
return Op;
}
- static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_ShifterImmediate);
+ static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
+ SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_ShifterImmediate);
Op->ShifterImm.isASR = isASR;
Op->ShifterImm.Imm = Imm;
Op->StartLoc = S;
@@ -2584,17 +2571,18 @@ public:
return Op;
}
- static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_RotateImmediate);
+ static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_RotateImmediate);
Op->RotImm.Imm = Imm;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor);
+ static std::unique_ptr<ARMOperand>
+ CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
Op->Bitfield.LSB = LSB;
Op->Bitfield.Width = Width;
Op->StartLoc = S;
@@ -2602,8 +2590,8 @@ public:
return Op;
}
- static ARMOperand *
- CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned> > &Regs,
+ static std::unique_ptr<ARMOperand>
+ CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
SMLoc StartLoc, SMLoc EndLoc) {
assert(Regs.size() > 0 && "RegList contains no registers?");
KindTy Kind = k_RegisterList;
@@ -2617,7 +2605,7 @@ public:
// Sort based on the register encoding values.
array_pod_sort(Regs.begin(), Regs.end());
- ARMOperand *Op = new ARMOperand(Kind);
+ auto Op = make_unique<ARMOperand>(Kind);
for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator
I = Regs.begin(), E = Regs.end(); I != E; ++I)
Op->Registers.push_back(I->second);
@@ -2626,9 +2614,11 @@ public:
return Op;
}
- static ARMOperand *CreateVectorList(unsigned RegNum, unsigned Count,
- bool isDoubleSpaced, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_VectorList);
+ static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum,
+ unsigned Count,
+ bool isDoubleSpaced,
+ SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_VectorList);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -2637,10 +2627,10 @@ public:
return Op;
}
- static ARMOperand *CreateVectorListAllLanes(unsigned RegNum, unsigned Count,
- bool isDoubleSpaced,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_VectorListAllLanes);
+ static std::unique_ptr<ARMOperand>
+ CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
+ SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_VectorListAllLanes);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.isDoubleSpaced = isDoubleSpaced;
@@ -2649,11 +2639,10 @@ public:
return Op;
}
- static ARMOperand *CreateVectorListIndexed(unsigned RegNum, unsigned Count,
- unsigned Index,
- bool isDoubleSpaced,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_VectorListIndexed);
+ static std::unique_ptr<ARMOperand>
+ CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
+ bool isDoubleSpaced, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_VectorListIndexed);
Op->VectorList.RegNum = RegNum;
Op->VectorList.Count = Count;
Op->VectorList.LaneIndex = Index;
@@ -2663,33 +2652,30 @@ public:
return Op;
}
- static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
- MCContext &Ctx) {
- ARMOperand *Op = new ARMOperand(k_VectorIndex);
+ static std::unique_ptr<ARMOperand>
+ CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+ auto Op = make_unique<ARMOperand>(k_VectorIndex);
Op->VectorIndex.Val = Idx;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_Immediate);
+ static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateMem(unsigned BaseRegNum,
- const MCConstantExpr *OffsetImm,
- unsigned OffsetRegNum,
- ARM_AM::ShiftOpc ShiftType,
- unsigned ShiftImm,
- unsigned Alignment,
- bool isNegative,
- SMLoc S, SMLoc E,
- SMLoc AlignmentLoc = SMLoc()) {
- ARMOperand *Op = new ARMOperand(k_Memory);
+ static std::unique_ptr<ARMOperand>
+ CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm,
+ unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
+ unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
+ SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
+ auto Op = make_unique<ARMOperand>(k_Memory);
Op->Memory.BaseRegNum = BaseRegNum;
Op->Memory.OffsetImm = OffsetImm;
Op->Memory.OffsetRegNum = OffsetRegNum;
@@ -2703,11 +2689,10 @@ public:
return Op;
}
- static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd,
- ARM_AM::ShiftOpc ShiftTy,
- unsigned ShiftImm,
- SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(k_PostIndexRegister);
+ static std::unique_ptr<ARMOperand>
+ CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
+ unsigned ShiftImm, SMLoc S, SMLoc E) {
+ auto Op = make_unique<ARMOperand>(k_PostIndexRegister);
Op->PostIdxReg.RegNum = RegNum;
Op->PostIdxReg.isAdd = isAdd;
Op->PostIdxReg.ShiftTy = ShiftTy;
@@ -2717,33 +2702,35 @@ public:
return Op;
}
- static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_MemBarrierOpt);
+ static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
+ SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_MemBarrierOpt);
Op->MBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt,
- SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_InstSyncBarrierOpt);
+ static std::unique_ptr<ARMOperand>
+ CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt);
Op->ISBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_ProcIFlags);
+ static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
+ SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_ProcIFlags);
Op->IFlags.Val = IFlags;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
- static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) {
- ARMOperand *Op = new ARMOperand(k_MSRMask);
+ static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
+ auto Op = make_unique<ARMOperand>(k_MSRMask);
Op->MMask.Val = MMask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -2947,8 +2934,7 @@ int ARMAsmParser::tryParseRegister() {
// occurs, return -1. An irrecoverable error is one where tokens have been
// consumed in the process of trying to parse the shifter (i.e., when it is
// indeed a shifter operand, but malformed).
-int ARMAsmParser::tryParseShiftRegister(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
@@ -2972,7 +2958,8 @@ int ARMAsmParser::tryParseShiftRegister(
// The source register for the shift has already been added to the
// operand list, so we need to pop it off and combine it into the shifted
// register operand instead.
- std::unique_ptr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val());
+ std::unique_ptr<ARMOperand> PrevOp(
+ (ARMOperand *)Operands.pop_back_val().release());
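// pop_back_val() now returns std::unique_ptr<MCParsedAsmOperand>; unique_ptr
// offers no checked downcast, so ownership is released and the raw pointer
// cast before being re-seated in a unique_ptr<ARMOperand>.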
if (!PrevOp->isReg())
return Error(PrevOp->getStartLoc(), "shift must be of a register");
int SrcReg = PrevOp->getReg();
@@ -3049,8 +3036,7 @@ int ARMAsmParser::tryParseShiftRegister(
///
/// TODO this is likely to change to allow different register types and or to
/// parse for a specific register type.
-bool ARMAsmParser::
-tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
const AsmToken &RegTok = Parser.getTok();
int RegNo = tryParseRegister();
if (RegNo == -1)
@@ -3096,17 +3082,25 @@ tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
/// MatchCoprocessorOperandName - Try to parse a coprocessor-related
-/// instruction with a symbolic operand name. Example: "p1", "p7", "c3",
-/// "c5", ...
+/// instruction with a symbolic operand name.
+/// We accept "crN" syntax for GAS compatibility.
+/// <operand-name> ::= <prefix><number>
+/// If CoprocOp is 'c', then:
+/// <prefix> ::= c | cr
+/// If CoprocOp is 'p', then:
+/// <prefix> ::= p
+/// <number> ::= integer in range [0, 15]
static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
// Use the same layout as the tablegen'erated register name matcher. Ugly,
// but efficient.
+ if (Name.size() < 2 || Name[0] != CoprocOp)
+ return -1;
+ Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front();
+
switch (Name.size()) {
default: return -1;
- case 2:
- if (Name[0] != CoprocOp)
- return -1;
- switch (Name[1]) {
+ case 1:
+ switch (Name[0]) {
default: return -1;
case '0': return 0;
case '1': return 1;
@@ -3119,10 +3113,10 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
case '8': return 8;
case '9': return 9;
}
- case 3:
- if (Name[0] != CoprocOp || Name[1] != '1')
+ case 2:
+ if (Name[0] != '1')
return -1;
- switch (Name[2]) {
+ switch (Name[1]) {
default: return -1;
// p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
case '0': return CoprocOp == 'p'? -1: 10;
@@ -3136,8 +3130,8 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
}
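// Illustrative results for the grammar above:
//   MatchCoprocessorOperandName("p7",   'p') == 7
//   MatchCoprocessorOperandName("c3",   'c') == 3
//   MatchCoprocessorOperandName("cr15", 'c') == 15  // GAS-style "cr" prefix
//   MatchCoprocessorOperandName("p10",  'p') == -1  // reserved for FP/NEON
//   MatchCoprocessorOperandName("cr16", 'c') == -1  // out of [0, 15]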
/// parseITCondCode - Try to parse a condition code for an IT instruction.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseITCondCode(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
@@ -3173,8 +3167,8 @@ parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// parseCoprocNumOperand - Try to parse a coprocessor number operand. The
/// token must be an Identifier when called, and if it is a coprocessor
/// number, the token is eaten and the operand is added to the operand list.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
@@ -3192,8 +3186,8 @@ parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// parseCoprocRegOperand - Try to parse a coprocessor register operand. The
/// token must be an Identifier when called, and if it is a coprocessor
/// register, the token is eaten and the operand is added to the operand list.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
@@ -3210,8 +3204,8 @@ parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// parseCoprocOptionOperand - Try to parse a coprocessor option operand.
/// coproc_option : '{' imm0_255 '}'
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
// If this isn't a '{', this isn't a coprocessor immediate operand.
@@ -3288,8 +3282,7 @@ static unsigned getDRegFromQReg(unsigned QReg) {
}
/// Parse a register list.
-bool ARMAsmParser::
-parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
assert(Parser.getTok().is(AsmToken::LCurly) &&
"Token is not a Left Curly Brace");
SMLoc S = Parser.getTok().getLoc();
@@ -3470,8 +3463,8 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
}
// parse a vector register list
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseVectorList(OperandVector &Operands) {
VectorLaneTy LaneKind;
unsigned LaneIndex;
SMLoc S = Parser.getTok().getLoc();
@@ -3721,8 +3714,8 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
unsigned Opt;
@@ -3792,8 +3785,8 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
unsigned Opt;
@@ -3843,8 +3836,8 @@ parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// parseProcIFlagsOperand - Try to parse iflags from a CPS instruction.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
@@ -3877,8 +3870,8 @@ parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
/// parseMSRMaskOperand - Try to parse mask flags from an MSR instruction.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
@@ -4005,9 +3998,9 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op,
- int Low, int High) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
+ int High) {
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier)) {
Error(Parser.getTok().getLoc(), Op + " operand expected.");
@@ -4053,8 +4046,8 @@ parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op,
return MatchOperand_Success;
}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
const AsmToken &Tok = Parser.getTok();
SMLoc S = Tok.getLoc();
if (Tok.isNot(AsmToken::Identifier)) {
@@ -4082,8 +4075,8 @@ parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// lsl #n 'n' in [0,31]
/// asr #n 'n' in [1,32]
/// n == 32 encoded as n == 0.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseShifterImm(OperandVector &Operands) {
const AsmToken &Tok = Parser.getTok();
SMLoc S = Tok.getLoc();
if (Tok.isNot(AsmToken::Identifier)) {
@@ -4152,8 +4145,8 @@ parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// parseRotImm - Parse the shifter immediate operand for the SXTB/UXTB family
/// of instructions. Legal values are:
/// ror #n 'n' in {0, 8, 16, 24}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseRotImm(OperandVector &Operands) {
const AsmToken &Tok = Parser.getTok();
SMLoc S = Tok.getLoc();
if (Tok.isNot(AsmToken::Identifier))
@@ -4198,8 +4191,8 @@ parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseBitfield(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
// The bitfield descriptor is really two operands, the LSB and the width.
if (Parser.getTok().isNot(AsmToken::Hash) &&
@@ -4266,8 +4259,8 @@ parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parsePostIdxReg(OperandVector &Operands) {
// Check for a post-index addressing register operand. Specifically:
// postidx_reg := '+' register {, shift}
// | '-' register {, shift}
@@ -4315,8 +4308,8 @@ parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
// Check for a post-index addressing register operand. Specifically:
// am3offset := '+' register
// | '-' register
@@ -4388,26 +4381,24 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// Convert parsed operands to MCInst. Needed here because this instruction
/// only has two register operands, but multiplication is commutative so
/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
-void ARMAsmParser::
-cvtThumbMultiply(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
- ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
+void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
+ const OperandVector &Operands) {
+ ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
+ ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
// If we have a three-operand form, make sure to set Rn to be the operand
// that isn't the same as Rd.
unsigned RegOp = 4;
if (Operands.size() == 6 &&
- ((ARMOperand*)Operands[4])->getReg() ==
- ((ARMOperand*)Operands[3])->getReg())
+ ((ARMOperand &)*Operands[4]).getReg() ==
+ ((ARMOperand &)*Operands[3]).getReg())
RegOp = 5;
- ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1);
+ ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
Inst.addOperand(Inst.getOperand(0));
- ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
+ ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
}
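// Worked example: for "muls r0, r0, r1" the parsed operands are
// (mnemonic, cc_out, cond, r0, r0, r1); operand 4 matches Rd (operand 3),
// so RegOp is bumped to 5 and r1 is emitted as Rn. For "muls r0, r1, r0"
// operand 4 already differs from Rd, so RegOp stays 4 and r1 is Rn again;
// both commuted spellings encode identically.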
-void ARMAsmParser::
-cvtThumbBranches(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
+ const OperandVector &Operands) {
int CondOp = -1, ImmOp = -1;
switch(Inst.getOpcode()) {
case ARM::tB:
@@ -4430,7 +4421,7 @@ cvtThumbBranches(MCInst &Inst,
} else {
// outside IT blocks we can only have unconditional branches with AL
// condition code or conditional branches with non-AL condition code
- unsigned Cond = static_cast<ARMOperand*>(Operands[CondOp])->getCondCode();
+ unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
switch(Inst.getOpcode()) {
case ARM::tB:
case ARM::tBcc:
@@ -4447,27 +4438,26 @@ cvtThumbBranches(MCInst &Inst,
switch(Inst.getOpcode()) {
// classify tB as either t2B or t1B based on range of immediate operand
case ARM::tB: {
- ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
- if(!op->isSignedOffset<11, 1>() && isThumbTwo())
+ ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+ if (!op.isSignedOffset<11, 1>() && isThumbTwo())
Inst.setOpcode(ARM::t2B);
break;
}
// classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
case ARM::tBcc: {
- ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
- if(!op->isSignedOffset<8, 1>() && isThumbTwo())
+ ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+ if (!op.isSignedOffset<8, 1>() && isThumbTwo())
Inst.setOpcode(ARM::t2Bcc);
break;
}
}
- ((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1);
- ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2);
+ ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
+ ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
}
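// Classification sketch: a tB whose target no longer fits the 16-bit
// encoding's signed <11, 1> offset is widened to t2B when Thumb2 is
// available; tBcc is widened to t2Bcc under the analogous <8, 1> check.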
/// Parse an ARM memory expression; return false on success, or true (after
/// emitting a diagnostic) on failure. The first token must be a '[' when called.
-bool ARMAsmParser::
-parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool ARMAsmParser::parseMemory(OperandVector &Operands) {
SMLoc S, E;
assert(Parser.getTok().is(AsmToken::LBrac) &&
"Token is not a Left Bracket");
@@ -4717,8 +4707,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
}
/// parseFPImm - A floating point immediate expression operand.
-ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseFPImm(OperandVector &Operands) {
// Anything that can accept a floating point constant as an operand
// needs to go through here, as the regular parseExpression is
// integer only.
@@ -4744,12 +4734,12 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// integer constant. Make sure we don't try to parse an FPImm
// for these:
// vmov.i{8|16|32|64} <dreg|qreg>, #imm
- ARMOperand *TyOp = static_cast<ARMOperand*>(Operands[2]);
- bool isVmovf = TyOp->isToken() && (TyOp->getToken() == ".f32" ||
- TyOp->getToken() == ".f64");
- ARMOperand *Mnemonic = static_cast<ARMOperand*>(Operands[0]);
- bool isFconst = Mnemonic->isToken() && (Mnemonic->getToken() == "fconstd" ||
- Mnemonic->getToken() == "fconsts");
+ ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
+ bool isVmovf = TyOp.isToken() &&
+ (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64");
+ ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
+ bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
+ Mnemonic.getToken() == "fconsts");
if (!(isVmovf || isFconst))
return MatchOperand_NoMatch;
@@ -4798,8 +4788,7 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// Parse an ARM instruction operand. For now this parses the operand regardless
/// of the mnemonic.
-bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic) {
+bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
SMLoc S, E;
// Check if the current operand has a custom associated parser, if so, try to
@@ -5125,7 +5114,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
}
bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandVector &Operands) {
// FIXME: This is all horribly hacky. We really need a better way to deal
// with optional operands like this in the matcher table.
@@ -5138,17 +5127,17 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// conditionally adding the cc_out in the first place because we need
// to check the type of the parsed immediate operand.
if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
- !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() &&
- static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+ !static_cast<ARMOperand &>(*Operands[4]).isARMSOImm() &&
+ static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
return true;
// Register-register 'add' for thumb does not have a cc_out operand
// when there are only two register operands.
if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
return true;
// Register-register 'add' for thumb does not have a cc_out operand
// when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
@@ -5156,13 +5145,12 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// that can handle a different range and has a cc_out operand.
if (((isThumb() && Mnemonic == "add") ||
(isThumbTwo() && Mnemonic == "sub")) &&
- Operands.size() == 6 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
- ((Mnemonic == "add" &&static_cast<ARMOperand*>(Operands[5])->isReg()) ||
- static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4()))
+ Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
+ static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
return true;
// For Thumb2, add/sub immediate does not have a cc_out operand for the
// imm0_4095 variant. That's the least-preferred variant when
@@ -5170,23 +5158,22 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// should remove the cc_out operand, we have to explicitly check that
// it's not one of the other variants. Ugh.
if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
- Operands.size() == 6 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
- static_cast<ARMOperand*>(Operands[5])->isImm()) {
+ Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm()) {
// Nest conditions rather than one big 'if' statement for readability.
//
// If both registers are low, we're in an IT block, and the immediate is
// in range, we should use encoding T1 instead, which has a cc_out.
if (inITBlock() &&
- isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) &&
- isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
- static_cast<ARMOperand*>(Operands[5])->isImm0_7())
+ isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
+ isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
return false;
// Check against T3. If the second register is the PC, this is an
// alternate form of ADR, which uses encoding T4, so check for that too.
- if (static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
- static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+ if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
+ static_cast<ARMOperand &>(*Operands[5]).isT2SOImm())
return false;
// Otherwise, we use encoding T4, which does not have a cc_out
@@ -5198,35 +5185,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
// use the 16-bit encoding or not.
if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
- static_cast<ARMOperand*>(Operands[5])->isReg() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[5]).isReg() &&
// If the registers aren't low regs, the destination reg isn't the
// same as one of the source regs, or the cc_out operand is zero
// outside of an IT block, we have to use the 32-bit encoding, so
// remove the cc_out operand.
- (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
- !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
- !isARMLowRegister(static_cast<ARMOperand*>(Operands[5])->getReg()) ||
- !inITBlock() ||
- (static_cast<ARMOperand*>(Operands[3])->getReg() !=
- static_cast<ARMOperand*>(Operands[5])->getReg() &&
- static_cast<ARMOperand*>(Operands[3])->getReg() !=
- static_cast<ARMOperand*>(Operands[4])->getReg())))
+ (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
+ !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+ static_cast<ARMOperand &>(*Operands[5]).getReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+ static_cast<ARMOperand &>(*Operands[4]).getReg())))
return true;
// Also check the 'mul' syntax variant that doesn't specify an explicit
// destination register.
if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
// If the registers aren't low regs or the cc_out operand is zero
// outside of an IT block, we have to use the 32-bit encoding, so
// remove the cc_out operand.
- (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
- !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
+ (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
!inITBlock()))
return true;
@@ -5239,32 +5225,32 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
// anyway.
if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
(Operands.size() == 5 || Operands.size() == 6) &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP &&
- static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
- (static_cast<ARMOperand*>(Operands[4])->isImm() ||
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
+ static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+ (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
(Operands.size() == 6 &&
- static_cast<ARMOperand*>(Operands[5])->isImm())))
+ static_cast<ARMOperand &>(*Operands[5]).isImm())))
return true;
return false;
}
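// Concrete case for the second rule above: "add r0, r1" in Thumb mode parses
// as (mnemonic, cc_out placeholder, cond, r0, r1), i.e. Operands.size() == 5
// with two register operands and a zero cc_out register, so the function
// returns true and the caller erases the placeholder at index 1.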
-bool ARMAsmParser::shouldOmitPredicateOperand(
- StringRef Mnemonic, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
+ OperandVector &Operands) {
// VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
unsigned RegIdx = 3;
if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
- static_cast<ARMOperand *>(Operands[2])->getToken() == ".f32") {
- if (static_cast<ARMOperand *>(Operands[3])->isToken() &&
- static_cast<ARMOperand *>(Operands[3])->getToken() == ".f32")
+ static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") {
+ if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32")
RegIdx = 4;
- if (static_cast<ARMOperand *>(Operands[RegIdx])->isReg() &&
- (ARMMCRegisterClasses[ARM::DPRRegClassID]
- .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg()) ||
- ARMMCRegisterClasses[ARM::QPRRegClassID]
- .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg())))
+ if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
+ (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
+ static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) ||
+ ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
+ static_cast<ARMOperand &>(*Operands[RegIdx]).getReg())))
return true;
}
return false;
@@ -5309,8 +5295,7 @@ static bool RequiresVFPRegListValidation(StringRef Inst,
/// Parse an arm instruction mnemonic followed by its operands.
bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc NameLoc, OperandVector &Operands) {
// FIXME: Can this be done via tablegen in some fashion?
bool RequireVFPRegisterListCheck;
bool AcceptSinglePrecisionOnly;
@@ -5489,12 +5474,12 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Parser.Lex(); // Consume the EndOfStatement
if (RequireVFPRegisterListCheck) {
- ARMOperand *Op = static_cast<ARMOperand*>(Operands.back());
- if (AcceptSinglePrecisionOnly && !Op->isSPRRegList())
- return Error(Op->getStartLoc(),
+ ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back());
+ if (AcceptSinglePrecisionOnly && !Op.isSPRRegList())
+ return Error(Op.getStartLoc(),
"VFP/Neon single precision register expected");
- if (AcceptDoublePrecisionOnly && !Op->isDPRRegList())
- return Error(Op->getStartLoc(),
+ if (AcceptDoublePrecisionOnly && !Op.isDPRRegList())
+ return Error(Op.getStartLoc(),
"VFP/Neon double precision register expected");
}
@@ -5505,20 +5490,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// try to remove a cc_out operand that was explicitly set on the
// mnemonic, of course (CarrySetting == true). Reason #317 that the
// table-driven matcher doesn't fit well with the ARM instruction set.
- if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) {
- ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+ if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
Operands.erase(Operands.begin() + 1);
- delete Op;
- }
// Some instructions have the same mnemonic, but don't always
// have a predicate. Distinguish them here and delete the
// predicate if needed.
- if (shouldOmitPredicateOperand(Mnemonic, Operands)) {
- ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+ if (shouldOmitPredicateOperand(Mnemonic, Operands))
Operands.erase(Operands.begin() + 1);
- delete Op;
- }
// ARM mode 'blx' need special handling, as the register operand version
// is predicable, but the label operand version is not. So, we can't rely
@@ -5526,11 +5505,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// a k_CondCode operand in the list. If we're trying to match the label
// version, remove the k_CondCode operand here.
if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
- static_cast<ARMOperand*>(Operands[2])->isImm()) {
- ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+ static_cast<ARMOperand &>(*Operands[2]).isImm())
Operands.erase(Operands.begin() + 1);
- delete Op;
- }
// Adjust operands of ldrexd/strexd to MCK_GPRPair.
// ldrexd/strexd require an even/odd GPR pair. To enforce this constraint,
@@ -5543,53 +5519,50 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Mnemonic == "stlexd")) {
bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
unsigned Idx = isLoad ? 2 : 3;
- ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]);
- ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]);
+ ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+ ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
// Adjust only if Op1 and Op2 are GPRs.
- if (Op1->isReg() && Op2->isReg() && MRC.contains(Op1->getReg()) &&
- MRC.contains(Op2->getReg())) {
- unsigned Reg1 = Op1->getReg();
- unsigned Reg2 = Op2->getReg();
+ if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+ MRC.contains(Op2.getReg())) {
+ unsigned Reg1 = Op1.getReg();
+ unsigned Reg2 = Op2.getReg();
unsigned Rt = MRI->getEncodingValue(Reg1);
unsigned Rt2 = MRI->getEncodingValue(Reg2);
// Rt2 must be Rt + 1 and Rt must be even.
if (Rt + 1 != Rt2 || (Rt & 1)) {
- Error(Op2->getStartLoc(), isLoad ?
- "destination operands must be sequential" :
- "source operands must be sequential");
+ Error(Op2.getStartLoc(), isLoad
+ ? "destination operands must be sequential"
+ : "source operands must be sequential");
return true;
}
unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
&(MRI->getRegClass(ARM::GPRPairRegClassID)));
- Operands.erase(Operands.begin() + Idx, Operands.begin() + Idx + 2);
- Operands.insert(Operands.begin() + Idx, ARMOperand::CreateReg(
- NewReg, Op1->getStartLoc(), Op2->getEndLoc()));
- delete Op1;
- delete Op2;
+ Operands[Idx] =
+ ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+ Operands.erase(Operands.begin() + Idx + 1);
}
}
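// Constraint illustration: "ldrexd r0, r1, [r2]" passes (Rt == 0 is even,
// Rt2 == Rt + 1) and r0/r1 collapse into a single GPRPair operand, while
// "ldrexd r1, r2, [r3]" is rejected with "destination operands must be
// sequential" because Rt == 1 is odd.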
// GNU Assembler extension (compatibility)
if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
- ARMOperand *Op2 = static_cast<ARMOperand *>(Operands[2]);
- ARMOperand *Op3 = static_cast<ARMOperand *>(Operands[3]);
- if (Op3->isMem()) {
- assert(Op2->isReg() && "expected register argument");
+ ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+ ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+ if (Op3.isMem()) {
+ assert(Op2.isReg() && "expected register argument");
unsigned SuperReg = MRI->getMatchingSuperReg(
- Op2->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+ Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
assert(SuperReg && "expected register pair");
unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
- Operands.insert(Operands.begin() + 3,
- ARMOperand::CreateReg(PairedReg,
- Op2->getStartLoc(),
- Op2->getEndLoc()));
+ Operands.insert(
+ Operands.begin() + 3,
+ ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
}
}
@@ -5599,19 +5572,13 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// so the Mnemonic is the original name "subs" and delete the predicate
// operand so it will match the table entry.
if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
- static_cast<ARMOperand*>(Operands[3])->isReg() &&
- static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::PC &&
- static_cast<ARMOperand*>(Operands[4])->isReg() &&
- static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::LR &&
- static_cast<ARMOperand*>(Operands[5])->isImm()) {
- ARMOperand *Op0 = static_cast<ARMOperand*>(Operands[0]);
- Operands.erase(Operands.begin());
- delete Op0;
- Operands.insert(Operands.begin(), ARMOperand::CreateToken(Name, NameLoc));
-
- ARMOperand *Op1 = static_cast<ARMOperand*>(Operands[1]);
+ static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
+ static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+ static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
+ static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+ Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
Operands.erase(Operands.begin() + 1);
- delete Op1;
}
return false;
}
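// With OperandVector owning its elements, the erase() calls above suffice on
// their own: erasing drops the unique_ptr, which destroys the ARMOperand. A
// sketch of the manual pattern this replaces:
//
//   ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);  // old: raw
//   Operands.erase(Operands.begin() + 1);
//   delete Op;                                   // forgotten => leak
//
// versus now simply:
//
//   Operands.erase(Operands.begin() + 1);        // unique_ptr cleans up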
@@ -5657,9 +5624,8 @@ static bool instIsBreakpoint(const MCInst &Inst) {
}
// FIXME: We would really like to be able to tablegen'erate this.
-bool ARMAsmParser::
-validateInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool ARMAsmParser::validateInstruction(MCInst &Inst,
+ const OperandVector &Operands) {
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
SMLoc Loc = Operands[0]->getStartLoc();
@@ -5682,7 +5648,7 @@ validateInstruction(MCInst &Inst,
// Find the condition code Operand to get its SMLoc information.
SMLoc CondLoc;
for (unsigned I = 1; I < Operands.size(); ++I)
- if (static_cast<ARMOperand*>(Operands[I])->isCondCode())
+ if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
CondLoc = Operands[I]->getStartLoc();
return Error(CondLoc, "incorrect condition in IT block; got '" +
StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
@@ -5782,8 +5748,8 @@ validateInstruction(MCInst &Inst,
// in the register list.
unsigned Rn = Inst.getOperand(0).getReg();
bool HasWritebackToken =
- (static_cast<ARMOperand*>(Operands[3])->isToken() &&
- static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
bool ListContainsBase;
if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
@@ -5843,11 +5809,10 @@ validateInstruction(MCInst &Inst,
// this first statement is always true for the new Inst. Essentially, the
// destination is unconditionally copied into the second source operand
// without checking to see if it matches what we actually parsed.
- if (Operands.size() == 6 &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[5])->getReg()) &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[4])->getReg())) {
+ if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
+ ((ARMOperand &)*Operands[5]).getReg()) &&
+ (((ARMOperand &)*Operands[3]).getReg() !=
+ ((ARMOperand &)*Operands[4]).getReg())) {
return Error(Operands[3]->getStartLoc(),
"destination register must match source register");
}
@@ -5900,23 +5865,23 @@ validateInstruction(MCInst &Inst,
}
// Final range checking for Thumb unconditional branch instructions.
case ARM::tB:
- if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<11, 1>())
+ if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
return Error(Operands[2]->getStartLoc(), "branch target out of range");
break;
case ARM::t2B: {
int op = (Operands[2]->isImm()) ? 2 : 3;
- if (!(static_cast<ARMOperand*>(Operands[op]))->isSignedOffset<24, 1>())
+ if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
return Error(Operands[op]->getStartLoc(), "branch target out of range");
break;
}
// Final range checking for Thumb conditional branch instructions.
case ARM::tBcc:
- if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<8, 1>())
+ if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
return Error(Operands[2]->getStartLoc(), "branch target out of range");
break;
case ARM::t2Bcc: {
int Op = (Operands[2]->isImm()) ? 2 : 3;
- if (!(static_cast<ARMOperand*>(Operands[Op]))->isSignedOffset<20, 1>())
+ if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
return Error(Operands[Op]->getStartLoc(), "branch target out of range");
break;
}
@@ -5931,19 +5896,19 @@ validateInstruction(MCInst &Inst,
// lead to bugs that are difficult to find since this is an easy mistake
// to make.
int i = (Operands[3]->isImm()) ? 3 : 4;
- ARMOperand *Op = static_cast<ARMOperand*>(Operands[i]);
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
if (CE) break;
- const MCExpr *E = dyn_cast<MCExpr>(Op->getImm());
+ const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
if (!E) break;
const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
- ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) {
- return Error(Op->getStartLoc(),
- "immediate expression for mov requires :lower16: or :upper16");
- break;
- }
- }
+ ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16))
+ return Error(
+ Op.getStartLoc(),
+ "immediate expression for mov requires :lower16: or :upper16");
+ break;
+ }
}
return false;
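// Example diagnostic from the IT-block check at the top of this function:
// after "it eq", writing "addne r0, r0, r1" produces something like
// "incorrect condition in IT block; got 'ne', but expected 'eq'".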
@@ -6205,9 +6170,8 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
}
}
-bool ARMAsmParser::
-processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool ARMAsmParser::processInstruction(MCInst &Inst,
+ const OperandVector &Operands) {
switch (Inst.getOpcode()) {
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
@@ -6264,8 +6228,8 @@ processInstruction(MCInst &Inst,
// Select the narrow version if the immediate will fit.
if (Inst.getOperand(1).getImm() > 0 &&
Inst.getOperand(1).getImm() <= 0xff &&
- !(static_cast<ARMOperand*>(Operands[2])->isToken() &&
- static_cast<ARMOperand*>(Operands[2])->getToken() == ".w"))
+ !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
Inst.setOpcode(ARM::tLDRpci);
else
Inst.setOpcode(ARM::t2LDRpci);
@@ -7355,8 +7319,8 @@ processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
- !(static_cast<ARMOperand*>(Operands[3])->isToken() &&
- static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) {
+ !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -7559,7 +7523,7 @@ processInstruction(MCInst &Inst,
case ARM::LDMIA_UPD:
// If this is a load of a single register via a 'pop', then we should use
// a post-indexed LDR instruction instead, per the ARM ARM.
- if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" &&
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" &&
Inst.getNumOperands() == 5) {
MCInst TmpInst;
TmpInst.setOpcode(ARM::LDR_POST_IMM);
@@ -7577,7 +7541,7 @@ processInstruction(MCInst &Inst,
case ARM::STMDB_UPD:
// If this is a store of a single register via a 'push', then we should use
// a pre-indexed STR instruction instead, per the ARM ARM.
- if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" &&
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" &&
Inst.getNumOperands() == 5) {
MCInst TmpInst;
TmpInst.setOpcode(ARM::STR_PRE_IMM);
@@ -7593,7 +7557,7 @@ processInstruction(MCInst &Inst,
case ARM::t2ADDri12:
// If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
// mnemonic was used (not "addw"), encoding T3 is preferred.
- if (static_cast<ARMOperand*>(Operands[0])->getToken() != "add" ||
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" ||
ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
break;
Inst.setOpcode(ARM::t2ADDri);
@@ -7602,7 +7566,7 @@ processInstruction(MCInst &Inst,
case ARM::t2SUBri12:
// If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
// mnemonic was used (not "subw"), encoding T3 is preferred.
- if (static_cast<ARMOperand*>(Operands[0])->getToken() != "sub" ||
+ if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" ||
ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
break;
Inst.setOpcode(ARM::t2SUBri);
@@ -7638,9 +7602,9 @@ processInstruction(MCInst &Inst,
!isARMLowRegister(Inst.getOperand(0).getReg()) ||
(unsigned)Inst.getOperand(2).getImm() > 255 ||
((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
- (static_cast<ARMOperand*>(Operands[3])->isToken() &&
- static_cast<ARMOperand*>(Operands[3])->getToken() == ".w"))
+ (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
break;
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
@@ -7661,8 +7625,8 @@ processInstruction(MCInst &Inst,
// 'as' behaviour. Make sure the wide encoding wasn't explicit.
if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
Inst.getOperand(5).getReg() != 0 ||
- (static_cast<ARMOperand*>(Operands[3])->isToken() &&
- static_cast<ARMOperand*>(Operands[3])->getToken() == ".w"))
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
break;
MCInst TmpInst;
TmpInst.setOpcode(ARM::tADDhirr);
@@ -7719,8 +7683,8 @@ processInstruction(MCInst &Inst,
// an error in validateInstruction().
unsigned Rn = Inst.getOperand(0).getReg();
bool hasWritebackToken =
- (static_cast<ARMOperand*>(Operands[3])->isToken() &&
- static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+ (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+ static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
bool listContainsBase;
if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
(!listContainsBase && !hasWritebackToken) ||
@@ -7782,10 +7746,10 @@ processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
(unsigned)Inst.getOperand(1).getImm() <= 255 &&
((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
- Inst.getOperand(4).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
- (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
- static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ Inst.getOperand(4).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
// The operands aren't in the same order for tMOVi8...
MCInst TmpInst;
TmpInst.setOpcode(ARM::tMOVi8);
@@ -7806,8 +7770,8 @@ processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == ARMCC::AL &&
Inst.getOperand(4).getReg() == ARM::CPSR &&
- (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
- static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
// The operands aren't the same for tMOV[S]r... (no cc_out)
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
@@ -7829,8 +7793,8 @@ processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == 0 &&
- (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
- static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+ static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("Illegal opcode!");
@@ -7942,9 +7906,10 @@ processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(2).getReg())) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
- !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+ !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+ ".w"))) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -7981,9 +7946,10 @@ processInstruction(MCInst &Inst,
(Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
- !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+ (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+ (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+ !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+ ".w"))) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8063,11 +8029,10 @@ template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
}
static const char *getSubtargetFeatureName(unsigned Val);
-bool ARMAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
unsigned MatchResult;
@@ -8136,7 +8101,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
}
@@ -8144,7 +8109,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction",
- ((ARMOperand*)Operands[0])->getLocRange());
+ ((ARMOperand &)*Operands[0]).getLocRange());
case Match_RequiresNotITBlock:
return Error(IDLoc, "flag setting instruction only valid outside IT block");
case Match_RequiresITBlock:
@@ -8154,12 +8119,12 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_RequiresThumb2:
return Error(IDLoc, "instruction variant requires Thumb2");
case Match_ImmRange0_15: {
- SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
}
case Match_ImmRange0_239: {
- SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
}
@@ -8175,7 +8140,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_DupAlignedMemoryRequires64or128:
case Match_AlignedMemoryRequires64or128or256:
{
- SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getAlignmentLoc();
+ SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
switch (MatchResult) {
default:
@@ -8923,28 +8888,22 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
}
// RAII object to make sure parsed operands are deleted.
- struct CleanupObject {
- SmallVector<MCParsedAsmOperand *, 1> Operands;
- ~CleanupObject() {
- for (unsigned I = 0, E = Operands.size(); I != E; ++I)
- delete Operands[I];
- }
- } CO;
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
// Parse the register list
- if (parseRegisterList(CO.Operands))
+ if (parseRegisterList(Operands))
return false;
- ARMOperand *Op = (ARMOperand*)CO.Operands[0];
- if (!IsVector && !Op->isRegList()) {
+ ARMOperand &Op = (ARMOperand &)*Operands[0];
+ if (!IsVector && !Op.isRegList()) {
Error(L, ".save expects GPR registers");
return false;
}
- if (IsVector && !Op->isDPRRegList()) {
+ if (IsVector && !Op.isDPRRegList()) {
Error(L, ".vsave expects DPR registers");
return false;
}
- getTargetStreamer().emitRegSave(Op->getRegList(), IsVector);
+ getTargetStreamer().emitRegSave(Op.getRegList(), IsVector);
return false;
}
@@ -9468,23 +9427,23 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
// Define this matcher function after the auto-generated include so we
// have the match class enum definitions.
-unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
unsigned Kind) {
- ARMOperand *Op = static_cast<ARMOperand*>(AsmOp);
+ ARMOperand &Op = static_cast<ARMOperand &>(AsmOp);
// If the kind is a token for a literal immediate, check if our asm
// operand matches. This is for InstAliases which have a fixed-value
// immediate in the syntax.
switch (Kind) {
default: break;
case MCK__35_0:
- if (Op->isImm())
- if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()))
+ if (Op.isImm())
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
if (CE->getValue() == 0)
return Match_Success;
break;
case MCK_ARMSOImm:
- if (Op->isImm()) {
- const MCExpr *SOExpr = Op->getImm();
+ if (Op.isImm()) {
+ const MCExpr *SOExpr = Op.getImm();
int64_t Value;
if (!SOExpr->EvaluateAsAbsolute(Value))
return Match_Success;
@@ -9493,8 +9452,8 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
}
break;
case MCK_GPRPair:
- if (Op->isReg() &&
- MRI->getRegClass(ARM::GPRRegClassID).contains(Op->getReg()))
+ if (Op.isReg() &&
+ MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
return Match_Success;
break;
}
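
The mechanical change running through these hunks, from static_cast<ARMOperand*>(Operands[N])-> to static_cast<ARMOperand &>(*Operands[N])., follows from the operand list now holding owning smart pointers (see the OperandVector signature in MatchAndEmitInstruction and the unique_ptr-based cleanup in parseDirectiveRegSave above). A minimal standalone sketch of the pattern, with invented class names rather than LLVM's:

#include <memory>
#include <string>
#include <vector>

struct ParsedOperand {
  virtual ~ParsedOperand() = default;
};

struct TargetOperand : ParsedOperand {
  std::string Token;
  bool isToken() const { return !Token.empty(); }
};

using OperandVector = std::vector<std::unique_ptr<ParsedOperand>>;

static bool startsWithPush(const OperandVector &Operands) {
  // The reference cast replaces the old raw-pointer cast; no manual
  // delete is needed because the unique_ptr owns each operand.
  const TargetOperand &Op =
      static_cast<const TargetOperand &>(*Operands[0]);
  return Op.isToken() && Op.Token == "push";
}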
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index e4b785d..228fb57 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -1092,13 +1092,13 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
if (isSub) {
O << ", "
<< markup("<imm:")
- << "#-" << -OffImm
+ << "#-" << formatImm(-OffImm)
<< markup(">");
}
else if (AlwaysPrintImm0 || OffImm > 0) {
O << ", "
<< markup("<imm:")
- << "#" << OffImm
+ << "#" << formatImm(OffImm)
<< markup(">");
}
O << "]" << markup(">");
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 42a1cbb..1686d76 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -295,7 +295,12 @@ namespace ARMII {
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
- MO_OPTION_MASK = 0x7f,
+ MO_OPTION_MASK = 0x3f,
+
+ /// MO_DLLIMPORT - On a symbol operand, this represents that the reference
+ /// to the symbol is for an import stub. This is used for DLL import
+ /// storage class indication on Windows.
+ MO_DLLIMPORT = 0x40,
/// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
/// represents a symbol which, if indirect, will get special Darwin mangling
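
Shrinking MO_OPTION_MASK from 0x7f to 0x3f is what frees a bit for the new flag: the option field gives up its top bit, and 0x40 becomes addressable as MO_DLLIMPORT. A small sketch of the resulting flag arithmetic (the helper names are ours):

#include <cstdint>

enum : uint8_t {
  MO_OPTION_MASK = 0x3f, // low 6 bits: the mutually exclusive options
  MO_DLLIMPORT   = 0x40, // bit 6, freed by shrinking the mask above
};

static uint8_t getOption(uint8_t TargetFlags) {
  return TargetFlags & MO_OPTION_MASK;
}

static bool isDLLImport(uint8_t TargetFlags) {
  return (TargetFlags & MO_DLLIMPORT) != 0;
}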
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index a4d13ed..7b5d8b0 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -992,7 +992,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
return;
const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol);
- if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift))
+ unsigned Type = MCELF::GetType(SD);
+ if (Type == ELF_STT_Func || Type == ELF_STT_GnuIFunc)
Streamer.EmitThumbFunc(Symbol);
}
@@ -1160,7 +1161,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
- AddValueSymbols(PersonalityRef);
+ visitUsedExpr(*PersonalityRef);
MCDataFragment *DF = getOrCreateDataFragment();
DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
PersonalityRef,
@@ -1332,6 +1333,12 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
return S;
}
+MCStreamer *createARMNullStreamer(MCContext &Ctx) {
+ MCStreamer *S = llvm::createNullStreamer(Ctx);
+ new ARMTargetStreamer(*S);
+ return S;
+}
+
MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
bool RelaxAll, bool NoExecStack,
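
createARMNullStreamer relies on an MC-layer convention: constructing a target streamer attaches it to the generic streamer, which takes ownership. Together with the no-op default callbacks introduced in the ARMTargetStreamer.cpp hunk later in this patch, this yields a null streamer that silently accepts ARM directives. A standalone sketch of the idiom, under invented names:

struct TargetStreamer;

struct Streamer {
  TargetStreamer *Target = nullptr; // adopted by the streamer
};

struct TargetStreamer {
  explicit TargetStreamer(Streamer &S) { S.Target = this; }
  virtual ~TargetStreamer() = default;
  // Defaults are deliberately empty so a bare instance, as created by
  // the null-streamer factory, accepts any directive.
  virtual void emitFnStart() {}
};

static Streamer *createNullStreamer() { return new Streamer(); }

static Streamer *createTargetNullStreamer() {
  Streamer *S = createNullStreamer();
  new TargetStreamer(*S); // registers itself with S, as in the hunk
  return S;
}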
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 5b51a52..b8ee555 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1047,8 +1047,7 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
// we have a movt or a movw, but that led to misleading results.
// This is now disallowed in the AsmParser in validateInstruction()
// so this should never happen.
- assert(0 && "expression without :upper16: or :lower16:");
- return 0;
+ llvm_unreachable("expression without :upper16: or :lower16:");
}
uint32_t ARMMCCodeEmitter::
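
The assert(0) plus dummy return being removed has a known pitfall: the assert compiles away in release builds and the function quietly returns 0. llvm_unreachable instead marks the path as truly unreachable, documenting the invariant and making the dead return unnecessary. A standalone sketch of the difference:

#include <cstdlib>

[[noreturn]] static void unreachable(const char *Msg) {
  (void)Msg; // an asserts build would print this before aborting
  std::abort();
}

static unsigned getImmOpValue(bool HasUpperLowerModifier) {
  if (HasUpperLowerModifier)
    return 1; // stand-in for the real encoding
  // No dummy "return 0;" is needed: control provably ends here.
  unreachable("expression without :upper16: or :lower16:");
}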
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 87ea875..e545e3c 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -41,33 +41,6 @@ ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
return false;
}
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expr!");
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbols_(BE->getLHS(), Asm);
- AddValueSymbols_(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef:
- Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
- break;
- }
-}
-
-void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const {
- AddValueSymbols_(getSubExpr(), Asm);
+void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
}
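
The deleted FIXME states the motivation: AddValueSymbols_ duplicated MCObjectStreamer logic in every target that needed it. The replacement inverts the dependency, so a target expression only forwards its sub-expression to the streamer's single shared visitor. A standalone sketch of that shape (the types are ours):

struct Expr {
  enum Kind { Constant, SymbolRef, Unary, Binary, Target } K;
  const Expr *LHS = nullptr, *RHS = nullptr, *Sub = nullptr;
};

struct Streamer {
  // One generic walk owned by the streamer replaces the per-target copies.
  void visitUsedExpr(const Expr &E) {
    switch (E.K) {
    case Expr::Constant:  break;
    case Expr::SymbolRef: /* record the referenced symbol here */ break;
    case Expr::Unary:     visitUsedExpr(*E.Sub); break;
    case Expr::Binary:    visitUsedExpr(*E.LHS); visitUsedExpr(*E.RHS); break;
    case Expr::Target:    visitUsedExpr(*E.Sub); break; // delegate down
    }
  }
};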
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index d819139..c5c0b10 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -59,7 +59,7 @@ public:
void PrintImpl(raw_ostream &OS) const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const override;
- void AddValueSymbols(MCAssembler *) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
const MCSection *FindAssociatedSection() const override {
return getSubExpr()->FindAssociatedSection();
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 04d63a7..2b3855d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -427,6 +427,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterAsmStreamer(TheThumbLETarget, createMCAsmStreamer);
TargetRegistry::RegisterAsmStreamer(TheThumbBETarget, createMCAsmStreamer);
+ // Register the null streamer.
+ TargetRegistry::RegisterNullStreamer(TheARMLETarget, createARMNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheARMBETarget, createARMNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheThumbLETarget, createARMNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheThumbBETarget, createARMNullStreamer);
+
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheARMLETarget, createARMMCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheARMBETarget, createARMMCInstPrinter);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 8853a8c..5326e56 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -51,6 +51,8 @@ MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
MCInstPrinter *InstPrint, MCCodeEmitter *CE,
MCAsmBackend *TAB, bool ShowInst);
+MCStreamer *createARMNullStreamer(MCContext &Ctx);
+
MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index ecfa4e5..186776a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -32,6 +32,7 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter {
const MCFragment *Fragment,
const MCFixup &Fixup,
MCValue Target,
+ unsigned Type,
unsigned Log2Size,
uint64_t &FixedValue);
void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
@@ -251,11 +252,11 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
const MCFragment *Fragment,
const MCFixup &Fixup,
MCValue Target,
+ unsigned Type,
unsigned Log2Size,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
- unsigned Type = MachO::ARM_RELOC_VANILLA;
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -272,6 +273,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
uint32_t Value2 = 0;
if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
if (!B_SD->getFragment())
@@ -374,7 +376,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
Fixup, Target, FixedValue);
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
- Target, Log2Size, FixedValue);
+ Target, RelocType, Log2Size,
+ FixedValue);
}
// Get the symbol data, if any.
@@ -392,7 +395,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
Offset += 1 << Log2Size;
if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
- Target, Log2Size, FixedValue);
+ Target, RelocType, Log2Size,
+ FixedValue);
// See <reloc.h>.
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index e3cfb05..ad3f1ca 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -11,147 +11,12 @@
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
using namespace llvm;
-
-namespace {
-// A class to keep track of assembler-generated constant pools that are used to
-// implement the ldr-pseudo.
-class ConstantPool {
- typedef SmallVector<std::pair<MCSymbol *, const MCExpr *>, 4> EntryVecTy;
- EntryVecTy Entries;
-
-public:
- // Initialize a new empty constant pool
- ConstantPool() {}
-
- // Add a new entry to the constant pool in the next slot.
- // \param Value is the new entry to put in the constant pool.
- //
- // \returns a MCExpr that references the newly inserted value
- const MCExpr *addEntry(const MCExpr *Value, MCContext &Context);
-
- // Emit the contents of the constant pool using the provided streamer.
- void emitEntries(MCStreamer &Streamer);
-
- // Return true if the constant pool is empty
- bool empty();
-};
-}
-
-namespace llvm {
-class AssemblerConstantPools {
- // Map type used to keep track of per-Section constant pools used by the
- // ldr-pseudo opcode. The map associates a section to its constant pool. The
- // constant pool is a vector of (label, value) pairs. When the ldr
- // pseudo is parsed we insert a new (label, value) pair into the constant pool
- // for the current section and add MCSymbolRefExpr to the new label as
- // an opcode to the ldr. After we have parsed all the user input we
- // output the (label, value) pairs in each constant pool at the end of the
- // section.
- //
- // We use the MapVector for the map type to ensure stable iteration of
- // the sections at the end of the parse. We need to iterate over the
- // sections in a stable order to ensure that we print the
- // constant pools in a deterministic order when printing an assembly
- // file.
- typedef MapVector<const MCSection *, ConstantPool> ConstantPoolMapTy;
- ConstantPoolMapTy ConstantPools;
-
-public:
- AssemblerConstantPools() {}
- ~AssemblerConstantPools() {}
-
- void emitAll(MCStreamer &Streamer);
- void emitForCurrentSection(MCStreamer &Streamer);
- const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr);
-
-private:
- ConstantPool *getConstantPool(const MCSection *Section);
- ConstantPool &getOrCreateConstantPool(const MCSection *Section);
-};
-}
-
-//
-// ConstantPool implementation
-//
-// Emit the contents of the constant pool using the provided streamer.
-void ConstantPool::emitEntries(MCStreamer &Streamer) {
- if (Entries.empty())
- return;
- Streamer.EmitCodeAlignment(4); // align to 4-byte address
- Streamer.EmitDataRegion(MCDR_DataRegion);
- for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
- I != E; ++I) {
- Streamer.EmitLabel(I->first);
- Streamer.EmitValue(I->second, 4);
- }
- Streamer.EmitDataRegion(MCDR_DataRegionEnd);
- Entries.clear();
-}
-
-const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) {
- MCSymbol *CPEntryLabel = Context.CreateTempSymbol();
-
- Entries.push_back(std::make_pair(CPEntryLabel, Value));
- return MCSymbolRefExpr::Create(CPEntryLabel, Context);
-}
-
-bool ConstantPool::empty() { return Entries.empty(); }
-
-//
-// AssemblerConstantPools implementation
-//
-ConstantPool *
-AssemblerConstantPools::getConstantPool(const MCSection *Section) {
- ConstantPoolMapTy::iterator CP = ConstantPools.find(Section);
- if (CP == ConstantPools.end())
- return nullptr;
-
- return &CP->second;
-}
-
-ConstantPool &
-AssemblerConstantPools::getOrCreateConstantPool(const MCSection *Section) {
- return ConstantPools[Section];
-}
-
-static void emitConstantPool(MCStreamer &Streamer, const MCSection *Section,
- ConstantPool &CP) {
- if (!CP.empty()) {
- Streamer.SwitchSection(Section);
- CP.emitEntries(Streamer);
- }
-}
-
-void AssemblerConstantPools::emitAll(MCStreamer &Streamer) {
- // Dump contents of assembler constant pools.
- for (ConstantPoolMapTy::iterator CPI = ConstantPools.begin(),
- CPE = ConstantPools.end();
- CPI != CPE; ++CPI) {
- const MCSection *Section = CPI->first;
- ConstantPool &CP = CPI->second;
-
- emitConstantPool(Streamer, Section, CP);
- }
-}
-
-void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) {
- const MCSection *Section = Streamer.getCurrentSection().first;
- if (ConstantPool *CP = getConstantPool(Section)) {
- emitConstantPool(Streamer, Section, *CP);
- }
-}
-
-const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer,
- const MCExpr *Expr) {
- const MCSection *Section = Streamer.getCurrentSection().first;
- return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext());
-}
-
//
// ARMTargetStreamer Implementation
//
@@ -175,78 +40,34 @@ void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
// The remaining callbacks should be handled separately by each
// streamer.
-void ARMTargetStreamer::emitFnStart() {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitFnEnd() {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitCantUnwind() {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitHandlerData() {
- llvm_unreachable("unimplemented");
-}
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
- int64_t Offset) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitPad(int64_t Offset) {
- llvm_unreachable("unimplemented");
-}
-void
-ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
- bool isVector) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitUnwindRaw(
- int64_t StackOffset, const SmallVectorImpl<uint8_t> &Opcodes) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::switchVendor(StringRef Vendor) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
- llvm_unreachable("unimplemented");
-}
+ int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+ const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
- StringRef String) {
- llvm_unreachable("unimplemented");
-}
+ StringRef String) {}
void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
- unsigned IntValue,
- StringRef StringValue) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitArch(unsigned Arch) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitObjectArch(unsigned Arch) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitFPU(unsigned FPU) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::finishAttributeSection() {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {
- llvm_unreachable("unimplemented");
-}
-void ARMTargetStreamer::AnnotateTLSDescriptorSequence(
- const MCSymbolRefExpr *SRE) {
- llvm_unreachable("unimplemented");
-}
+ unsigned IntValue,
+ StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
-void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
- llvm_unreachable("unimplemented");
-}
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
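
Replacing the llvm_unreachable bodies with empty ones changes the base-class contract: a subclass now overrides only the directives it implements, and everything else is accepted and dropped. That is exactly what the bare ARMTargetStreamer created by the null streamer above depends on. A minimal sketch of the contract, with invented names:

struct TargetStreamer {
  virtual ~TargetStreamer() = default;
  // No-op defaults: directives a subclass does not care about are
  // accepted and ignored rather than aborting the program.
  virtual void emitFnStart() {}
  virtual void emitPad(long Offset) { (void)Offset; }
};

// A concrete streamer overrides only the directives it implements.
struct ELFTargetStreamer : TargetStreamer {
  void emitFnStart() override { /* emit unwind-table state here */ }
};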
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index be29dc5..baa97a7 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -21,6 +21,9 @@
using namespace llvm;
+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+ : ARMFrameLowering(sti) {}
+
bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
const MachineFrameInfo *FFI = MF.getFrameInfo();
unsigned CFSize = FFI->getMaxCallFrameSize();
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index f61874b..a227f8e 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -11,11 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef __THUMB_FRAMEINFO_H_
-#define __THUMB_FRAMEINFO_H_
+#ifndef LLVM_ARM_THUMB1FRAMELOWERING_H
+#define LLVM_ARM_THUMB1FRAMELOWERING_H
#include "ARMFrameLowering.h"
-#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
#include "Thumb1RegisterInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -24,9 +23,7 @@ namespace llvm {
class Thumb1FrameLowering : public ARMFrameLowering {
public:
- explicit Thumb1FrameLowering(const ARMSubtarget &sti)
- : ARMFrameLowering(sti) {
- }
+ explicit Thumb1FrameLowering(const ARMSubtarget &sti);
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 6267ecf..09debe7 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -1010,7 +1010,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
AttributeSet FnAttrs = MF.getFunction()->getAttributes();
OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::OptimizeForSize);
- MinimizeSize = STI->isMinSize();
+ MinimizeSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
BlockInfo.clear();
BlockInfo.resize(MF.getNumBlockIDs());
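
The MinimizeSize change moves the query from a cached subtarget bit (STI->isMinSize()) to the function's own attribute list, so size reduction is decided per function even in a module that mixes minsize and ordinary code. A standalone sketch of the distinction; the types here are ours, not llvm::Function:

#include <set>
#include <string>

struct Function {
  std::set<std::string> Attrs;
  bool hasFnAttribute(const std::string &A) const {
    return Attrs.count(A) != 0;
  }
};

// The pass now asks the function, not a cached subtarget bit, so the
// answer can differ from one function to the next.
static bool shouldMinimizeSize(const Function &F) {
  return F.hasFnAttribute("minsize");
}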
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 15b574d..f610fbb 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -1577,6 +1577,10 @@ void CppWriter::printInstruction(const Instruction *I,
nl(Out) << iName << "->setName(\"";
printEscapedString(cxi->getName());
Out << "\");";
+ nl(Out) << iName << "->setVolatile("
+ << (cxi->isVolatile() ? "true" : "false") << ");";
+ nl(Out) << iName << "->setWeak("
+ << (cxi->isWeak() ? "true" : "false") << ");";
break;
}
case Instruction::AtomicRMW: {
@@ -1607,6 +1611,8 @@ void CppWriter::printInstruction(const Instruction *I,
nl(Out) << iName << "->setName(\"";
printEscapedString(rmwi->getName());
Out << "\");";
+ nl(Out) << iName << "->setVolatile("
+ << (rmwi->isVolatile() ? "true" : "false") << ");";
break;
}
case Instruction::LandingPad: {
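
The CPPBackend prints C++ API calls that reconstruct the input IR. Before this hunk it never emitted the volatile and weak bits of cmpxchg, nor the volatile bit of atomicrmw, so round-tripped modules silently lost them. A standalone sketch of the general bug class:

#include <cstdio>

struct CmpXchg {
  bool Volatile = false;
  bool Weak = false;
};

// Serializing an object to "constructor + setters" must emit every flag
// the constructor does not take, or the flag is lost on reload.
static void printInstruction(const CmpXchg &I, const char *Name) {
  std::printf("%s->setVolatile(%s);\n", Name, I.Volatile ? "true" : "false");
  std::printf("%s->setWeak(%s);\n", Name, I.Weak ? "true" : "false");
}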
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index d551ca9..21df12f 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -165,8 +165,8 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
// versions.
- if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPret
- && !DisableDeallocRet) {
+ if (MF.getTarget().getSubtarget<HexagonSubtarget>().hasV4TOps() &&
+ MBBI->getOpcode() == Hexagon::JMPret && !DisableDeallocRet) {
// Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
// instruction if we encounter it.
MachineBasicBlock::iterator BeforeJMPR =
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 446af16..2d4b0b9 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -11,20 +11,16 @@
#define HEXAGON_FRAMEINFO_H
#include "Hexagon.h"
-#include "HexagonSubtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
class HexagonFrameLowering : public TargetFrameLowering {
private:
- const HexagonSubtarget &STI;
void determineFrameLayout(MachineFunction &MF) const;
public:
- explicit HexagonFrameLowering(const HexagonSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
- }
+ explicit HexagonFrameLowering() : TargetFrameLowering(StackGrowsDown, 8, 0) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index b8e5d24..a460ea4 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -463,9 +463,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+ DAG.getTarget().getRegisterInfo());
SDValue StackPtr =
- DAG.getCopyFromReg(Chain, dl, TM.getRegisterInfo()->getStackRegister(),
- getPointerTy());
+ DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy());
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -720,7 +721,10 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
cast<RegisterSDNode>(Node->getOperand(i))->getReg();
// Check it to be lr
- if (Reg == TM.getRegisterInfo()->getRARegister()) {
+ const HexagonRegisterInfo *QRI =
+ static_cast<const HexagonRegisterInfo *>(
+ DAG.getTarget().getRegisterInfo());
+ if (Reg == QRI->getRARegister()) {
FuncInfo->setHasClobberLR(true);
break;
}
@@ -812,9 +816,9 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// The Sub result contains the new stack start address, so it
// must be placed in the stack pointer register.
- SDValue CopyChain = DAG.getCopyToReg(Chain, dl,
- TM.getRegisterInfo()->getStackRegister(),
- Sub);
+ const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+ DAG.getTarget().getRegisterInfo());
+ SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub);
SDValue Ops[2] = { ArgAdjust, CopyChain };
return DAG.getMergeValues(Ops, dl);
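
The recurring Hexagon pattern in this file, fetching the register info from DAG.getTarget() and static_cast'ing it to HexagonRegisterInfo, reflects call sites no longer having a covariant, Hexagon-typed accessor on a cached target machine. A standalone sketch of that shape; the names and the register number are illustrative only:

struct TargetRegisterInfo {
  virtual ~TargetRegisterInfo() = default;
};

struct HexagonRegisterInfo : TargetRegisterInfo {
  unsigned getStackRegister() const { return 29; } // illustrative value
};

struct TargetMachine {
  HexagonRegisterInfo TRI;
  const TargetRegisterInfo *getRegisterInfo() const { return &TRI; }
};

static unsigned stackReg(const TargetMachine &TM) {
  // The generic accessor returns the base type, so target-specific
  // queries downcast at the call site.
  const auto *QRI =
      static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo());
  return QRI->getStackRegister();
}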
@@ -944,21 +948,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
-HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue CC = Op.getOperand(4);
- SDValue TrueVal = Op.getOperand(2);
- SDValue FalseVal = Op.getOperand(3);
- SDLoc dl(Op);
- SDNode* OpNode = Op.getNode();
- EVT SVT = OpNode->getValueType(0);
-
- SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC);
- return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal);
-}
-
-SDValue
HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
SDLoc dl(Op);
@@ -975,7 +964,7 @@ HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MFI->setReturnAddressIsTaken(true);
@@ -1001,7 +990,8 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
- const HexagonRegisterInfo *TRI = TM.getRegisterInfo();
+ const HexagonRegisterInfo *TRI =
+ static_cast<const HexagonRegisterInfo *>(DAG.getTarget().getRegisterInfo());
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
MFI->setFrameAddressIsTaken(true);
@@ -1053,429 +1043,422 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
-HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
- &targetmachine)
- : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
- TM(targetmachine) {
-
- const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
+ : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
+ TM(targetmachine) {
- // Set up the register classes.
- addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
- addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
-
- if (QRI->Subtarget.hasV5TOps()) {
- addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
- addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
- }
+ const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>();
- addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
- computeRegisterProperties();
+ if (Subtarget.hasV5TOps()) {
+ addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+ addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+ }
- // Align loop entry
- setPrefLoopAlignment(4);
+ addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
- // Limits for inline expansion of memcpy/memmove
- MaxStoresPerMemcpy = 6;
- MaxStoresPerMemmove = 6;
+ computeRegisterProperties();
- //
- // Library calls for unsupported operations
- //
+ // Align loop entry
+ setPrefLoopAlignment(4);
- setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
- setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
-
- setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
- setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
-
- setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
- setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
-
- setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
- setOperationAction(ISD::SDIV, MVT::i32, Expand);
- setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3");
- setOperationAction(ISD::SREM, MVT::i32, Expand);
-
- setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
- setOperationAction(ISD::SDIV, MVT::i64, Expand);
- setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
- setOperationAction(ISD::SREM, MVT::i64, Expand);
-
- setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
- setOperationAction(ISD::UDIV, MVT::i32, Expand);
-
- setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
- setOperationAction(ISD::UDIV, MVT::i64, Expand);
-
- setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
- setOperationAction(ISD::UREM, MVT::i32, Expand);
-
- setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
- setOperationAction(ISD::FDIV, MVT::f32, Expand);
-
- setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
- setOperationAction(ISD::FDIV, MVT::f64, Expand);
-
- setOperationAction(ISD::FSQRT, MVT::f32, Expand);
- setOperationAction(ISD::FSQRT, MVT::f64, Expand);
- setOperationAction(ISD::FSIN, MVT::f32, Expand);
- setOperationAction(ISD::FSIN, MVT::f64, Expand);
-
- if (QRI->Subtarget.hasV5TOps()) {
- // Hexagon V5 Support.
- setOperationAction(ISD::FADD, MVT::f32, Legal);
- setOperationAction(ISD::FADD, MVT::f64, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal);
- setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal);
- setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal);
-
- setCondCodeAction(ISD::SETOGE, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOGE, MVT::f64, Legal);
- setCondCodeAction(ISD::SETUGE, MVT::f32, Legal);
- setCondCodeAction(ISD::SETUGE, MVT::f64, Legal);
-
- setCondCodeAction(ISD::SETOGT, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOGT, MVT::f64, Legal);
- setCondCodeAction(ISD::SETUGT, MVT::f32, Legal);
- setCondCodeAction(ISD::SETUGT, MVT::f64, Legal);
-
- setCondCodeAction(ISD::SETOLE, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOLE, MVT::f64, Legal);
- setCondCodeAction(ISD::SETOLT, MVT::f32, Legal);
- setCondCodeAction(ISD::SETOLT, MVT::f64, Legal);
-
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
- setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
- setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
-
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
-
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f64, Expand);
-
- setOperationAction(ISD::FNEG, MVT::f32, Legal);
- setOperationAction(ISD::FNEG, MVT::f64, Expand);
- } else {
+ // Limits for inline expansion of memcpy/memmove
+ MaxStoresPerMemcpy = 6;
+ MaxStoresPerMemmove = 6;
- // Expand fp<->uint.
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ //
+ // Library calls for unsupported operations
+ //
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
+ setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
+
+ setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
+ setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
+
+ setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
+ setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
+
+ setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3");
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+ setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
+ setOperationAction(ISD::SDIV, MVT::i64, Expand);
+ setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+
+ setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+
+ setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+
+ setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+ setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+ setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
+ setOperationAction(ISD::FDIV, MVT::f32, Expand);
+
+ setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
+ setOperationAction(ISD::FDIV, MVT::f64, Expand);
+
+ setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+
+ if (Subtarget.hasV5TOps()) {
+ // Hexagon V5 Support.
+ setOperationAction(ISD::FADD, MVT::f32, Legal);
+ setOperationAction(ISD::FADD, MVT::f64, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUEQ, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUGE, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Legal);
+
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Legal);
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Legal);
+ setCondCodeAction(ISD::SETOLT, MVT::f64, Legal);
+
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+
+ setOperationAction(ISD::FNEG, MVT::f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ } else {
- setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
- setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
+ // Expand fp<->uint.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
- setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
- setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
- setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
- setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
+ setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
- setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
- setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
+ setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+ setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
- setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
- setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
+ setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
- setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
- setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
+ setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+ setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
- setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
- setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
+ setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+ setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
- setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
- setOperationAction(ISD::FADD, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+ setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
- setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
- setOperationAction(ISD::FADD, MVT::f32, Expand);
+ setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+ setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
- setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
+ setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+ setOperationAction(ISD::FADD, MVT::f64, Expand);
- setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+ setOperationAction(ISD::FADD, MVT::f32, Expand);
- setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand);
- setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
- setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
- setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
- setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
- setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
- setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+ setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
- setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
- setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+ setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
- setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
- setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+ setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
- setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
- setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+ setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
- setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
- setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+ setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
- setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
- setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+ setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
- setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
- setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
+ setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+ setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
- setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
- setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+ setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
- setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
- setOperationAction(ISD::FMUL, MVT::f64, Expand);
+ setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+ setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
- setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
- setOperationAction(ISD::MUL, MVT::f32, Expand);
+ setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
- setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
- setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+ setOperationAction(ISD::FMUL, MVT::f64, Expand);
- setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+ setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+ setOperationAction(ISD::MUL, MVT::f32, Expand);
- setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
- setOperationAction(ISD::SUB, MVT::f64, Expand);
+ setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
- setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
- setOperationAction(ISD::SUB, MVT::f32, Expand);
+ setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
- setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
- setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
+ setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+ setOperationAction(ISD::SUB, MVT::f64, Expand);
- setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
- setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+ setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+ setOperationAction(ISD::SUB, MVT::f32, Expand);
- setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
- setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+ setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
- setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
- setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
+ setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
- setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
- setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
+ setCondCodeAction(ISD::SETO, MVT::f64, Expand);
- setOperationAction(ISD::FABS, MVT::f32, Expand);
- setOperationAction(ISD::FABS, MVT::f64, Expand);
- setOperationAction(ISD::FNEG, MVT::f32, Expand);
- setOperationAction(ISD::FNEG, MVT::f64, Expand);
- }
+ setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
- setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
- setOperationAction(ISD::SREM, MVT::i32, Expand);
-
- setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
- setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
- setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
- setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal);
-
- setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
- setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
- setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
- setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal);
-
- setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
-
- // Turn FP extload into load/fextend.
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
- // Hexagon has a i1 sign extending load.
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
- // Turn FP truncstore into trunc + store.
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
- // Custom legalize GlobalAddress nodes into CONST32.
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
- setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
- // Truncate action?
- setOperationAction(ISD::TRUNCATE, MVT::i64, Expand);
-
- // Hexagon doesn't have sext_inreg, replace them with shl/sra.
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
-
- // Hexagon has no REM or DIVREM operations.
- setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i32, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-
- setOperationAction(ISD::BSWAP, MVT::i64, Expand);
-
- // Lower SELECT_CC to SETCC and SELECT.
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
-
- if (QRI->Subtarget.hasV5TOps()) {
-
- // We need to make the operation type of SELECT node to be Custom,
- // such that we don't go into the infinite loop of
- // select -> setcc -> select_cc -> select loop.
- setOperationAction(ISD::SELECT, MVT::f32, Custom);
- setOperationAction(ISD::SELECT, MVT::f64, Custom);
-
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
- } else {
-
- // Hexagon has no select or setcc: expand to SELECT_CC.
- setOperationAction(ISD::SELECT, MVT::f32, Expand);
- setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::FABS, MVT::f32, Expand);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::f32, Expand);
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ }
- // This is a workaround documented in DAGCombiner.cpp:2892 We don't
- // support SELECT_CC on every type.
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+ setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+ setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+ setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+ setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal);
+
+ setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
+ setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal);
+
+ setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
+ // Turn FP extload into load/fextend.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ // Hexagon has an i1 sign extending load.
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom legalize GlobalAddress nodes into CONST32.
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ // Truncate action?
+ setOperationAction(ISD::TRUNCATE, MVT::i64, Expand);
+
+ // Hexagon doesn't have sext_inreg, replace them with shl/sra.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ // Hexagon has no REM or DIVREM operations.
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ // Lower SELECT_CC to SETCC and SELECT.
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
+ if (Subtarget.hasV5TOps()) {
+
+ // We need to make the operation action of the SELECT node Custom,
+ // so that we don't go into an infinite
+ // select -> setcc -> select_cc -> select loop.
+ setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::f64, Custom);
+
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- }
+ } else {
- if (EmitJumpTables) {
- setOperationAction(ISD::BR_JT, MVT::Other, Custom);
- } else {
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- }
- // Increase jump tables cutover to 5, was 4.
- setMinimumJumpTableEntries(5);
-
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
-
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
-
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FREM , MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
- setOperationAction(ISD::FREM , MVT::f32, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-
- // In V4, we have double word add/sub with carry. The problem with
- // modelling this instruction is that it produces 2 results - Rdd and Px.
- // To model update of Px, we will have to use Defs[p0..p3] which will
- // cause any predicate live range to spill. So, we pretend we dont't
- // have these instructions.
- setOperationAction(ISD::ADDE, MVT::i8, Expand);
- setOperationAction(ISD::ADDE, MVT::i16, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i8, Expand);
- setOperationAction(ISD::SUBE, MVT::i16, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
- setOperationAction(ISD::ADDC, MVT::i8, Expand);
- setOperationAction(ISD::ADDC, MVT::i16, Expand);
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i8, Expand);
- setOperationAction(ISD::SUBC, MVT::i16, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- setOperationAction(ISD::CTTZ , MVT::i32, Expand);
- setOperationAction(ISD::CTTZ , MVT::i64, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::CTLZ , MVT::i32, Expand);
- setOperationAction(ISD::CTLZ , MVT::i64, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
- setOperationAction(ISD::ROTL , MVT::i32, Expand);
- setOperationAction(ISD::ROTR , MVT::i32, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FPOW , MVT::f64, Expand);
- setOperationAction(ISD::FPOW , MVT::f32, Expand);
-
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
- setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
- setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
-
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
- setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
-
- if (TM.getSubtargetImpl()->isSubtargetV2()) {
- setExceptionPointerRegister(Hexagon::R20);
- setExceptionSelectorRegister(Hexagon::R21);
- } else {
- setExceptionPointerRegister(Hexagon::R0);
- setExceptionSelectorRegister(Hexagon::R1);
- }
+ // Hexagon has no select or setcc: expand to SELECT_CC.
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ }
- // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
- setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ if (EmitJumpTables) {
+ setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+ } else {
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ }
+ // Increase the jump-table cutover to 5 (previously 4).
+ setMinimumJumpTableEntries(5);
+
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+ // In V4, we have double-word add/sub with carry. The problem with
+ // modelling this instruction is that it produces two results - Rdd and Px.
+ // To model the update of Px, we would have to use Defs[p0..p3], which
+ // would cause any predicate live range to spill. So we pretend we don't
+ // have these instructions.
+ setOperationAction(ISD::ADDE, MVT::i8, Expand);
+ setOperationAction(ISD::ADDE, MVT::i16, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+ setOperationAction(ISD::SUBE, MVT::i8, Expand);
+ setOperationAction(ISD::SUBE, MVT::i16, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i64, Expand);
+ setOperationAction(ISD::ADDC, MVT::i8, Expand);
+ setOperationAction(ISD::ADDC, MVT::i16, Expand);
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::SUBC, MVT::i8, Expand);
+ setOperationAction(ISD::SUBC, MVT::i16, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+ if (Subtarget.isSubtargetV2()) {
+ setExceptionPointerRegister(Hexagon::R20);
+ setExceptionSelectorRegister(Hexagon::R21);
+ } else {
+ setExceptionPointerRegister(Hexagon::R0);
+ setExceptionSelectorRegister(Hexagon::R1);
+ }
- // Use the default implementation.
- setOperationAction(ISD::VAARG , MVT::Other, Expand);
- setOperationAction(ISD::VACOPY , MVT::Other, Expand);
- setOperationAction(ISD::VAEND , MVT::Other, Expand);
- setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
- setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ // Use the default implementation.
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
- setOperationAction(ISD::INLINEASM , MVT::Other, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
- setMinFunctionAlignment(2);
+ setMinFunctionAlignment(2);
- // Needed for DYNAMIC_STACKALLOC expansion.
- unsigned StackRegister = TM.getRegisterInfo()->getStackRegister();
- setStackPointerRegisterToSaveRestore(StackRegister);
- setSchedulingPreference(Sched::VLIW);
+ // Needed for DYNAMIC_STACKALLOC expansion.
+ const HexagonRegisterInfo *QRI =
+ static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo());
+ setStackPointerRegisterToSaveRestore(QRI->getStackRegister());
+ setSchedulingPreference(Sched::VLIW);
}
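
[Editorial note] The POST_INC indexed load/store actions set earlier in this
constructor advertise Hexagon's auto-increment addressing modes to the DAG
combiner, which can then fold a memory access and the following pointer
update into a single operation. A hedged illustration of the pairing
(hypothetical C source; the DAG shapes are simplified):

  // C source:              v = *p; p += 4;
  // DAG before combining:  t = load p;  p2 = add p, 4
  // DAG after combining:   (t, p2) = post_inc_load p, 4   // one memop
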
-
const char*
HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
@@ -1577,7 +1560,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SELECT: return Op;
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
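
[Editorial note] The `case ISD::SELECT: return Op;` entry above is the other
half of the constructor comment about the select -> setcc -> select_cc ->
select cycle: marking SELECT as Custom routes the node here, and returning it
unchanged tells the legalizer the node is acceptable as-is, so it is never
expanded back into SELECT_CC. A minimal sketch of the idiom, with an invented
target name:

  // Sketch only: a Custom action plus an identity return stops the cycle.
  SDValue MyTargetLowering::LowerOperation(SDValue Op,
                                           SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    case ISD::SELECT:
      return Op; // already representable; do not expand to SELECT_CC
    default:
      llvm_unreachable("unhandled custom lowering");
    }
  }
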
@@ -1641,8 +1623,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(const
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
- return QRI->Subtarget.hasV5TOps();
+ return TM.getSubtarget<HexagonSubtarget>().hasV5TOps();
}
/// isLegalAddressingMode - Return true if the addressing mode represented by
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 4f27c27..ec16cc8 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -74,8 +74,8 @@ namespace llvm {
unsigned& RetSize) const;
public:
- HexagonTargetMachine &TM;
- explicit HexagonTargetLowering(HexagonTargetMachine &targetmachine);
+ const TargetMachine &TM;
+ explicit HexagonTargetLowering(const TargetMachine &targetmachine);
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
@@ -124,7 +124,6 @@ namespace llvm {
const SmallVectorImpl<SDValue> &OutVals,
SDValue Callee) const;
- SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index ea6367a..1c95e06 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1538,14 +1538,13 @@ int HexagonInstrInfo::GetDotOldOp(const int opc) const {
int NewOp = opc;
if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
NewOp = Hexagon::getPredOldOpcode(NewOp);
- if (NewOp < 0)
- assert(0 && "Couldn't change predicate new instruction to its old form.");
+ assert(NewOp >= 0 &&
+ "Couldn't change predicate new instruction to its old form.");
}
if (isNewValueStore(NewOp)) { // Convert into non-new-value format
NewOp = Hexagon::getNonNVStore(NewOp);
- if (NewOp < 0)
- assert(0 && "Couldn't change new-value store to its old form.");
+ assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
}
return NewOp;
}
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 7dd6e95..6fcaa20 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
#define DEBUG_TYPE "misched"
-/// Platform specific modifications to DAG.
+/// Platform-specific modifications to DAG.
void VLIWMachineScheduler::postprocessDAG() {
SUnit* LastSequentialCall = nullptr;
// Currently we only catch the situation when compare gets scheduled
@@ -150,7 +150,7 @@ void VLIWMachineScheduler::schedule() {
buildDAGWithRegPressure();
- // Postprocess the DAG to add platform specific artificial dependencies.
+ // Postprocess the DAG to add platform-specific artificial dependencies.
postprocessDAG();
SmallVector<SUnit*, 8> TopRoots, BotRoots;
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 99100a1..8c41086 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -100,7 +100,7 @@ public:
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
virtual void schedule() override;
- /// Perform platform specific DAG postprocessing.
+ /// Perform platform-specific DAG postprocessing.
void postprocessDAG();
};
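
[Editorial note] postprocessDAG is the hook where the target injects
artificial (non-data) edges before scheduling; per the comment in the .cpp
hunk above, Hexagon uses it to keep a compare adjacent to the sequential call
that consumes it. A hedged sketch of the kind of edge it adds (SDep API as in
include/llvm/CodeGen/ScheduleDAG.h; the surrounding scan loop is omitted):

  // Force SU to be scheduled after the last sequential call by adding an
  // artificial ordering edge the scheduler must respect.
  if (SUnit *Dep = LastSequentialCall)
    SU->addPred(SDep(Dep, SDep::Artificial));
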
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 9e1e0fd..b5db997 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -18,10 +18,8 @@ using namespace llvm;
bool llvm::flag_aligned_memcpy;
-HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine
- &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
}
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 8ba6108..b40b303 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -18,11 +18,9 @@
namespace llvm {
-class HexagonTargetMachine;
-
class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM);
+ explicit HexagonSelectionDAGInfo(const DataLayout &DL);
~HexagonSelectionDAGInfo();
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 70c87fa..657893f 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -48,10 +48,8 @@ EnableIEEERndNear(
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("Generate non-chopped conversion from fp to int."));
-HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
- HexagonGenSubtargetInfo(TT, CPU, FS),
- CPUString(CPU.str()) {
-
+HexagonSubtarget &
+HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
// If the programmer has not specified a Hexagon version, default to -mv4.
if (CPUString.empty())
CPUString = "hexagonv4";
@@ -70,6 +68,15 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
}
ParseSubtargetFeatures(CPUString, FS);
+ return *this;
+}
+
+HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM)
+ : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()),
+ DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32"),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+ TSInfo(DL), FrameLowering() {
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
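
[Editorial note] The initializeSubtargetDependencies idiom introduced above
exists because C++ constructs members in declaration order: the CPU string
and feature bits must be settled before members such as InstrInfo, whose
constructors consult them, are built. A condensed sketch of the idiom, with
invented names (MySubtarget, MyInstrInfo):

  class MySubtarget : public MyGenSubtargetInfo {
    std::string CPUString; // declared (and thus initialized) first
    MyInstrInfo InstrInfo; // safe: features already parsed by then
  public:
    MySubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
      if (CPUString.empty())
        CPUString = "mycpu-default";
      ParseSubtargetFeatures(CPUString, FS); // fill the feature bits
      return *this;
    }
    MySubtarget(StringRef TT, StringRef CPU, StringRef FS)
        : MyGenSubtargetInfo(TT, CPU, FS), CPUString(CPU.str()),
          InstrInfo(initializeSubtargetDependencies(CPU, FS)) {}
  };
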
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 690bef0..b184e62 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -14,6 +14,11 @@
#ifndef Hexagon_SUBTARGET_H
#define Hexagon_SUBTARGET_H
+#include "HexagonFrameLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonISelLowering.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -28,6 +33,7 @@ namespace llvm {
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
+
bool UseMemOps;
bool ModeIEEERndNear;
@@ -37,16 +43,35 @@ public:
};
HexagonArchEnum HexagonArchVersion;
+private:
std::string CPUString;
+ const DataLayout DL; // Calculates type size & alignment.
+ HexagonInstrInfo InstrInfo;
+ HexagonTargetLowering TLInfo;
+ HexagonSelectionDAGInfo TSInfo;
+ HexagonFrameLowering FrameLowering;
InstrItineraryData InstrItins;
public:
- HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+ HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
+ const TargetMachine &TM);
/// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+ const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const HexagonRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const HexagonTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const HexagonFrameLowering *getFrameLowering() const {
+ return &FrameLowering;
+ }
+ const HexagonSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index b923764..7831410 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -67,15 +67,10 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM,
- CodeModel::Model CM,
+ Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- DL("e-m:e-p:32:32-i1:32-i64:64-a:0-n32") ,
- Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
- TSInfo(*this),
- FrameLowering(Subtarget),
- InstrItins(&Subtarget.getInstrItineraryData()) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 70b835e..d88178e 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -14,12 +14,8 @@
#ifndef HexagonTARGETMACHINE_H
#define HexagonTARGETMACHINE_H
-#include "HexagonFrameLowering.h"
-#include "HexagonISelLowering.h"
#include "HexagonInstrInfo.h"
-#include "HexagonSelectionDAGInfo.h"
#include "HexagonSubtarget.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -27,13 +23,7 @@ namespace llvm {
class Module;
class HexagonTargetMachine : public LLVMTargetMachine {
- const DataLayout DL; // Calculates type size & alignment.
HexagonSubtarget Subtarget;
- HexagonInstrInfo InstrInfo;
- HexagonTargetLowering TLInfo;
- HexagonSelectionDAGInfo TSInfo;
- HexagonFrameLowering FrameLowering;
- const InstrItineraryData* InstrItins;
public:
HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU,
@@ -42,33 +32,29 @@ public:
CodeGenOpt::Level OL);
const HexagonInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
+ return getSubtargetImpl()->getInstrInfo();
}
const HexagonSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
const HexagonRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
}
-
const InstrItineraryData* getInstrItineraryData() const override {
- return InstrItins;
+ return &getSubtargetImpl()->getInstrItineraryData();
}
-
-
const HexagonTargetLowering* getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
-
const HexagonFrameLowering* getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
}
-
const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
-
- const DataLayout *getDataLayout() const override { return &DL; }
static unsigned getModuleMatchQuality(const Module &M);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index d464dd9..fadfeed 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -15,20 +15,15 @@
#define MSP430_FRAMEINFO_H
#include "MSP430.h"
-#include "MSP430Subtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
- class MSP430Subtarget;
-
class MSP430FrameLowering : public TargetFrameLowering {
protected:
- const MSP430Subtarget &STI;
public:
- explicit MSP430FrameLowering(const MSP430Subtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2),
- STI(sti) {}
+ explicit MSP430FrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index c5901bc..3d3ee92 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -57,11 +57,8 @@ HWMultMode("msp430-hwmult-mode", cl::Hidden,
"Assume hardware multiplier cannot be used inside interrupts"),
clEnumValEnd));
-MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
- TargetLowering(tm, new TargetLoweringObjectFileELF()),
- Subtarget(*tm.getSubtargetImpl()) {
-
- TD = getDataLayout();
+MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
+ : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
// Set up the register classes.
addRegisterClass(MVT::i8, &MSP430::GR8RegClass);
@@ -1032,7 +1029,7 @@ MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
- uint64_t SlotSize = TD->getPointerSize();
+ uint64_t SlotSize = getDataLayout()->getPointerSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
true);
FuncInfo->setRAIndex(ReturnAddrIndex);
@@ -1055,7 +1052,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
- DAG.getConstant(TD->getPointerSize(), MVT::i16);
+ DAG.getConstant(getDataLayout()->getPointerSize(), MVT::i16);
return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, getPointerTy(),
FrameAddr, Offset),
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 3ced61d..3e2f344 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -66,12 +66,9 @@ namespace llvm {
};
}
- class MSP430Subtarget;
- class MSP430TargetMachine;
-
class MSP430TargetLowering : public TargetLowering {
public:
- explicit MSP430TargetLowering(MSP430TargetMachine &TM);
+ explicit MSP430TargetLowering(const TargetMachine &TM);
MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
@@ -170,9 +167,6 @@ namespace llvm {
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
-
- const MSP430Subtarget &Subtarget;
- const DataLayout *TD;
};
} // namespace llvm
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 0c04ddb..ccb6c09 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -30,9 +30,9 @@ using namespace llvm;
// Pin the vtable to this file.
void MSP430InstrInfo::anchor() {}
-MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
+MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
: MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
- RI(tm) {}
+ RI() {}
void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 1ffcebb..e6baaef 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -22,7 +22,7 @@
namespace llvm {
-class MSP430TargetMachine;
+class MSP430Subtarget;
/// MSP430II - This namespace holds all of the target-specific flags that
/// instruction info tracks.
@@ -44,7 +44,7 @@ class MSP430InstrInfo : public MSP430GenInstrInfo {
const MSP430RegisterInfo RI;
virtual void anchor();
public:
- explicit MSP430InstrInfo(MSP430TargetMachine &TM);
+ explicit MSP430InstrInfo(MSP430Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 341fb64..691bcee 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -32,10 +32,8 @@ using namespace llvm;
#include "MSP430GenRegisterInfo.inc"
// FIXME: Provide proper call frame setup / destroy opcodes.
-MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm)
- : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) {
- StackAlign = TM.getFrameLowering()->getStackAlignment();
-}
+MSP430RegisterInfo::MSP430RegisterInfo()
+ : MSP430GenRegisterInfo(MSP430::PCW) {}
const MCPhysReg*
MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index a607528..cb01961 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -21,18 +21,9 @@
namespace llvm {
-class TargetInstrInfo;
-class MSP430TargetMachine;
-
struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
-private:
- MSP430TargetMachine &TM;
-
- /// StackAlign - Default stack alignment.
- ///
- unsigned StackAlign;
public:
- MSP430RegisterInfo(MSP430TargetMachine &tm);
+ MSP430RegisterInfo();
/// Code Generation virtual methods...
const MCPhysReg *
diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
index c700383..3897ef6 100644
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
+++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
@@ -16,9 +16,8 @@ using namespace llvm;
#define DEBUG_TYPE "msp430-selectiondag-info"
-MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() {
}
diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h
index fa81948..cb04adc 100644
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h
+++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.h
@@ -22,7 +22,7 @@ class MSP430TargetMachine;
class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit MSP430SelectionDAGInfo(const MSP430TargetMachine &TM);
+ explicit MSP430SelectionDAGInfo(const DataLayout &DL);
~MSP430SelectionDAGInfo();
};
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index 68ad091..dbddc52 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -25,12 +25,15 @@ using namespace llvm;
void MSP430Subtarget::anchor() { }
-MSP430Subtarget::MSP430Subtarget(const std::string &TT,
- const std::string &CPU,
- const std::string &FS) :
- MSP430GenSubtargetInfo(TT, CPU, FS) {
- std::string CPUName = "generic";
-
- // Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
+MSP430Subtarget &
+MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+ ParseSubtargetFeatures("generic", FS);
+ return *this;
}
+
+MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM)
+ : MSP430GenSubtargetInfo(TT, CPU, FS),
+ // FIXME: Check DataLayout string.
+ DL("e-m:e-p:16:16-i32:16:32-n8:16"), FrameLowering(),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+ TSInfo(DL) {}
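
[Editorial note] For reference, the layout string that moves into the
subtarget here decodes as follows under LLVM's datalayout grammar (a reading
aid, not part of the patch):

  // "e-m:e-p:16:16-i32:16:32-n8:16"
  //   e           little-endian
  //   m:e         ELF-style symbol mangling
  //   p:16:16     16-bit pointers with 16-bit alignment
  //   i32:16:32   i32 has 16-bit ABI alignment, 32-bit preferred alignment
  //   n8:16       native integer widths are 8 and 16 bits
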
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 4d8792e..0152ad1 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -14,6 +14,12 @@
#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
#define LLVM_TARGET_MSP430_SUBTARGET_H
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430SelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -26,16 +32,33 @@ class StringRef;
class MSP430Subtarget : public MSP430GenSubtargetInfo {
virtual void anchor();
bool ExtendedInsts;
+ const DataLayout DL; // Calculates type size & alignment
+ MSP430FrameLowering FrameLowering;
+ MSP430InstrInfo InstrInfo;
+ MSP430TargetLowering TLInfo;
+ MSP430SelectionDAGInfo TSInfo;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
MSP430Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS);
+ const std::string &FS, const TargetMachine &TM);
+
+ MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const TargetRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const MSP430TargetLowering *getTargetLowering() const { return &TLInfo; }
+ const MSP430SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
};
} // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 50be2be..5ca36f2 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -24,19 +24,13 @@ extern "C" void LLVMInitializeMSP430Target() {
RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
}
-MSP430TargetMachine::MSP430TargetMachine(const Target &T,
- StringRef TT,
- StringRef CPU,
- StringRef FS,
+MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS),
- // FIXME: Check DataLayout string.
- DL("e-m:e-p:16:16-i32:16:32-n8:16"),
- InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(Subtarget) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index ea5d407..efa8403 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -15,13 +15,7 @@
#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
#define LLVM_TARGET_MSP430_TARGETMACHINE_H
-#include "MSP430FrameLowering.h"
-#include "MSP430ISelLowering.h"
-#include "MSP430InstrInfo.h"
-#include "MSP430RegisterInfo.h"
-#include "MSP430SelectionDAGInfo.h"
#include "MSP430Subtarget.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -31,11 +25,6 @@ namespace llvm {
///
class MSP430TargetMachine : public LLVMTargetMachine {
MSP430Subtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- MSP430InstrInfo InstrInfo;
- MSP430TargetLowering TLInfo;
- MSP430SelectionDAGInfo TSInfo;
- MSP430FrameLowering FrameLowering;
public:
MSP430TargetMachine(const Target &T, StringRef TT,
@@ -44,22 +33,25 @@ public:
CodeGenOpt::Level OL);
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const MSP430InstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
+ const MSP430Subtarget *getSubtargetImpl() const override {
+ return &Subtarget;
}
- const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const DataLayout *getDataLayout() const override { return &DL;}
- const MSP430Subtarget *getSubtargetImpl() const override { return &Subtarget; }
-
const TargetRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
}
-
const MSP430TargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
-
- const MSP430SelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
}; // MSP430TargetMachine.
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 4e8831c..9f437f8 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk
@@ -8,6 +8,7 @@ mips_codegen_TBLGEN_TABLES := \
MipsGenMCPseudoLowering.inc \
MipsGenAsmWriter.inc \
MipsGenDAGISel.inc \
+ MipsGenFastISel.inc \
MipsGenCallingConv.inc \
MipsGenSubtargetInfo.inc
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 86fd386..0c06be8 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -72,72 +72,69 @@ class MipsAsmParser : public MCTargetAsmParser {
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
+ unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
/// Parse a register as used in CFI directives
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- bool ParseParenSuffix(StringRef Name,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ bool ParseParenSuffix(StringRef Name, OperandVector &Operands);
- bool ParseBracketSuffix(StringRef Name,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ bool ParseBracketSuffix(StringRef Name, OperandVector &Operands);
- bool
- ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
- MipsAsmParser::OperandMatchResultTy
- parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
-
- MipsAsmParser::OperandMatchResultTy MatchAnyRegisterNameWithoutDollar(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier,
- SMLoc S);
+ MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands);
MipsAsmParser::OperandMatchResultTy
- MatchAnyRegisterWithoutDollar(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- SMLoc S);
+ MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+ StringRef Identifier, SMLoc S);
MipsAsmParser::OperandMatchResultTy
- ParseAnyRegister(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S);
- MipsAsmParser::OperandMatchResultTy
- ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy ParseAnyRegister(OperandVector &Operands);
- MipsAsmParser::OperandMatchResultTy
- ParseJumpTarget(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy ParseImm(OperandVector &Operands);
- MipsAsmParser::OperandMatchResultTy
- parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy ParseJumpTarget(OperandVector &Operands);
- MipsAsmParser::OperandMatchResultTy
- ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands);
- bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+ MipsAsmParser::OperandMatchResultTy ParseLSAImm(OperandVector &Operands);
- bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &,
- StringRef Mnemonic);
+ bool searchSymbolAlias(OperandVector &Operands);
+
+ bool ParseOperand(OperandVector &, StringRef Mnemonic);
bool needsExpansion(MCInst &Inst);
- void expandInstruction(MCInst &Inst, SMLoc IDLoc,
+ // Expands assembly pseudo instructions.
+ // Returns false on success, true otherwise.
+ bool expandInstruction(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
- void expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+
+ bool expandLoadImm(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
- void expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+
+ bool expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
- void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+
+ bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
+
void expandMemInst(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions, bool isLoad,
bool isImmOpnd);
- bool reportParseError(StringRef ErrorMsg);
- bool reportParseError(SMLoc Loc, StringRef ErrorMsg);
+ bool reportParseError(Twine ErrorMsg);
+ bool reportParseError(SMLoc Loc, Twine ErrorMsg);
bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
bool parseRelocOperand(const MCExpr *&Res);
@@ -159,32 +156,20 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetReorderDirective();
bool parseSetNoReorderDirective();
bool parseSetNoMips16Directive();
+ bool parseSetFpDirective();
bool parseSetAssignment();
bool parseDataDirective(unsigned Size, SMLoc L);
bool parseDirectiveGpWord();
bool parseDirectiveGpDWord();
+ bool parseDirectiveModule();
+ bool parseDirectiveModuleFP();
+ bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+ StringRef Directive);
MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
- bool isGP64() const {
- return (STI.getFeatureBits() & Mips::FeatureGP64Bit) != 0;
- }
-
- bool isFP64() const {
- return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
- }
-
- bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
- bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
-
- bool isMicroMips() const {
- return STI.getFeatureBits() & Mips::FeatureMicroMips;
- }
-
- bool parseRegister(unsigned &RegNum);
-
bool eatComma(StringRef ErrorStr);
int matchCPURegisterName(StringRef Symbol);
@@ -205,7 +190,7 @@ class MipsAsmParser : public MCTargetAsmParser {
unsigned getGPR(int RegNo);
- int getATReg();
+ int getATReg(SMLoc Loc);
bool processInstruction(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
@@ -230,23 +215,85 @@ class MipsAsmParser : public MCTargetAsmParser {
}
public:
+ enum MipsMatchResultTy {
+ Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "MipsGenAsmMatcher.inc"
+#undef GET_OPERAND_DIAGNOSTIC_TYPES
+
+ };
+
MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
- const MCInstrInfo &MII,
- const MCTargetOptions &Options)
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(), STI(sti), Parser(parser) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ getTargetStreamer().updateABIInfo(*this);
+
// Assert exactly one ABI was chosen.
assert((((STI.getFeatureBits() & Mips::FeatureO32) != 0) +
((STI.getFeatureBits() & Mips::FeatureEABI) != 0) +
((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
+
+ if (!isABI_O32() && !allowOddSPReg())
+ report_fatal_error("-mno-odd-spreg requires the O32 ABI");
}
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+ /// True if all of $fcc0 - $fcc7 exist for the current ISA.
+ bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
+
+ bool isGP64bit() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; }
+ bool isFP64bit() const { return STI.getFeatureBits() & Mips::FeatureFP64Bit; }
+ bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
+ bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+ bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
+ bool isABI_FPXX() const { return false; } // TODO: add check for FeatureXX
+
+ bool allowOddSPReg() const {
+ return !(STI.getFeatureBits() & Mips::FeatureNoOddSPReg);
+ }
+
+ bool inMicroMipsMode() const {
+ return STI.getFeatureBits() & Mips::FeatureMicroMips;
+ }
+ bool hasMips1() const { return STI.getFeatureBits() & Mips::FeatureMips1; }
+ bool hasMips2() const { return STI.getFeatureBits() & Mips::FeatureMips2; }
+ bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; }
+ bool hasMips4() const { return STI.getFeatureBits() & Mips::FeatureMips4; }
+ bool hasMips5() const { return STI.getFeatureBits() & Mips::FeatureMips5; }
+ bool hasMips32() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips32);
+ }
+ bool hasMips64() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips64);
+ }
+ bool hasMips32r2() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips32r2);
+ }
+ bool hasMips64r2() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips64r2);
+ }
+ bool hasMips32r6() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips32r6);
+ }
+ bool hasMips64r6() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips64r6);
+ }
+ bool hasDSP() const { return (STI.getFeatureBits() & Mips::FeatureDSP); }
+ bool hasDSPR2() const { return (STI.getFeatureBits() & Mips::FeatureDSPR2); }
+ bool hasMSA() const { return (STI.getFeatureBits() & Mips::FeatureMSA); }
+
+ bool inMips16Mode() const {
+ return STI.getFeatureBits() & Mips::FeatureMips16;
+ }
+ // TODO: see how we can get this info.
+ bool mipsSEUsesSoftFloat() const { return false; }
+
/// Warn if RegNo is the current assembler temporary.
void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc);
};
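
[Editorial note] checkTargetMatchPredicate, declared earlier in this hunk, is
where the new Match_RequiresDifferentSrcAndDst result would be produced. Its
body is not part of this excerpt, so the following is only a plausible
sketch, with JALR_HB used as a hypothetical example of an instruction whose
encoding requires distinct source and destination registers:

  // Hedged sketch; the real body is not shown in this diff.
  unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
    switch (Inst.getOpcode()) {
    case Mips::JALR_HB: // hypothetical example: unpredictable if rd == rs
      if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
        return Match_RequiresDifferentSrcAndDst;
      return Match_Success;
    default:
      return Match_Success;
    }
  }
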
@@ -261,9 +308,9 @@ public:
/// Broad categories of register classes
/// The exact class is finalized by the render method.
enum RegKind {
- RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64())
+ RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64bit())
RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and
- /// isFP64())
+ /// isFP64bit())
RegKind_FCC = 4, /// FCC
RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which)
RegKind_MSACtrl = 16, /// MSA control registers
@@ -289,9 +336,11 @@ private:
k_Token /// A simple token
} Kind;
+public:
MipsOperand(KindTy K, MipsAsmParser &Parser)
: MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+private:
/// For diagnostics, and checking the assembler temporary
MipsAsmParser &AsmParser;
@@ -330,10 +379,11 @@ private:
SMLoc StartLoc, EndLoc;
/// Internal constructor for register kinds
- static MipsOperand *CreateReg(unsigned Index, RegKind RegKind,
- const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
- MipsAsmParser &Parser) {
- MipsOperand *Op = new MipsOperand(k_RegisterIndex, Parser);
+ static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, RegKind RegKind,
+ const MCRegisterInfo *RegInfo,
+ SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
+ auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
Op->RegIdx.Index = Index;
Op->RegIdx.RegInfo = RegInfo;
Op->RegIdx.Kind = RegKind;
@@ -521,6 +571,10 @@ public:
void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getFGR32Reg()));
+ // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+ if (!AsmParser.allowOddSPReg() && RegIdx.Index & 1)
+ AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+ "registers");
}
void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
@@ -612,6 +666,12 @@ public:
return Kind == k_Token;
}
bool isMem() const override { return Kind == k_Memory; }
+ bool isConstantMemOff() const {
+ return isMem() && dyn_cast<MCConstantExpr>(getMemOff());
+ }
+ template <unsigned Bits> bool isMemWithSimmOffset() const {
+ return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff());
+ }
bool isInvNum() const { return Kind == k_Immediate; }
bool isLSAImm() const {
if (!isConstantImm())
@@ -656,9 +716,13 @@ public:
return Mem.Off;
}
- static MipsOperand *CreateToken(StringRef Str, SMLoc S,
- MipsAsmParser &Parser) {
- MipsOperand *Op = new MipsOperand(k_Token, Parser);
+ int64_t getConstantMemOff() const {
+ return static_cast<const MCConstantExpr *>(getMemOff())->getValue();
+ }
+
+ static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
+ MipsAsmParser &Parser) {
+ auto Op = make_unique<MipsOperand>(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -668,74 +732,75 @@ public:
/// Create a numeric register (e.g. $1). The exact register remains
/// unresolved until an instruction successfully matches.
- static MipsOperand *CreateNumericReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n");
return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser);
}
/// Create a register that is definitely a GPR.
/// This is typically only used for named registers such as $gp.
- static MipsOperand *CreateGPRReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser);
}
/// Create a register that is definitely a FGR.
/// This is typically only used for named registers such as $f0.
- static MipsOperand *CreateFGRReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an FCC.
/// This is typically only used for named registers such as $fcc0.
- static MipsOperand *CreateFCCReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an ACC.
/// This is typically only used for named registers such as $ac0.
- static MipsOperand *CreateACCReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+ MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an MSA128.
/// This is typically only used for named registers such as $w0.
- static MipsOperand *CreateMSA128Reg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser);
}
/// Create a register that is definitely an MSACtrl.
/// This is typically only used for named registers such as $msaaccess.
- static MipsOperand *CreateMSACtrlReg(unsigned Index,
- const MCRegisterInfo *RegInfo, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
+ static std::unique_ptr<MipsOperand>
+ CreateMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser);
}
- static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
- MipsAsmParser &Parser) {
- MipsOperand *Op = new MipsOperand(k_Immediate, Parser);
+ static std::unique_ptr<MipsOperand>
+ CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+ auto Op = make_unique<MipsOperand>(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static MipsOperand *CreateMem(MipsOperand *Base, const MCExpr *Off, SMLoc S,
- SMLoc E, MipsAsmParser &Parser) {
- MipsOperand *Op = new MipsOperand(k_Memory, Parser);
- Op->Mem.Base = Base;
+ static std::unique_ptr<MipsOperand>
+ CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
+ SMLoc E, MipsAsmParser &Parser) {
+ auto Op = make_unique<MipsOperand>(k_Memory, Parser);
+ Op->Mem.Base = Base.release();
Op->Mem.Off = Off;
Op->StartLoc = S;
Op->EndLoc = E;
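
[Editorial note] Note the Base.release() above: CreateMem now receives the
base operand as a std::unique_ptr but stores it in a raw-pointer field, so
ownership is handed off explicitly and MipsOperand's destructor remains
responsible for deleting it. A minimal sketch of that hand-off, with invented
names:

  struct Node { Operand *Base; };                  // raw field; owns Base
  void adopt(std::unique_ptr<Operand> B, Node &N) {
    N.Base = B.release(); // transfer ownership out of the unique_ptr
  }
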
@@ -756,7 +821,11 @@ public:
return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
}
bool isFCCAsmReg() const {
- return isRegIdx() && RegIdx.Kind & RegKind_FCC && RegIdx.Index <= 7;
+ if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
+ return false;
+ if (!AsmParser.hasEightFccRegisters())
+ return RegIdx.Index == 0;
+ return RegIdx.Index <= 7;
}
bool isACCAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
@@ -849,9 +918,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
Offset = Inst.getOperand(2);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
- if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm()))
+ if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+ if (OffsetToAlignment(Offset.getImm(),
+ 1LL << (inMicroMipsMode() ? 1 : 2)))
return Error(IDLoc, "branch to misaligned address");
break;
case Mips::BGEZ:
@@ -874,14 +944,23 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
Offset = Inst.getOperand(1);
if (!Offset.isImm())
break; // We'll deal with this situation later on when applying fixups.
- if (!isIntN(isMicroMips() ? 17 : 18, Offset.getImm()))
+ if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
return Error(IDLoc, "branch target out of range");
- if (OffsetToAlignment(Offset.getImm(), 1LL << (isMicroMips() ? 1 : 2)))
+ if (OffsetToAlignment(Offset.getImm(),
+ 1LL << (inMicroMipsMode() ? 1 : 2)))
return Error(IDLoc, "branch to misaligned address");
break;
}
}
+ // SSNOP is deprecated on MIPS32r6/MIPS64r6.
+ // We still accept it, but it is treated as a normal nop.
+ if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) {
+ std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+ Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a "
+ "nop instruction");
+ }
+
if (MCID.hasDelaySlot() && Options.isReorder()) {
// If this instruction has a delay slot and .set reorder is active,
// emit a NOP after it.
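
[Editorial note] The isIntN(17/18) checks in the two branch hunks above
encode simple arithmetic: the branch field is a signed 16-bit count of
instructions, scaled by the instruction granularity. A hedged summary:

  // standard MIPS:  byte offset = simm16 << 2 -> fits in 18 signed bits,
  //                 and must be 4-byte aligned
  // microMIPS:      byte offset = simm16 << 1 -> fits in 17 signed bits,
  //                 and must be 2-byte aligned
  // hence isIntN(inMicroMipsMode() ? 17 : 18, Off) plus the
  // OffsetToAlignment(Off, 1LL << (inMicroMipsMode() ? 1 : 2)) check.
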
@@ -930,7 +1009,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
} // if load/store
if (needsExpansion(Inst))
- expandInstruction(Inst, IDLoc, Instructions);
+ return expandInstruction(Inst, IDLoc, Instructions);
else
Instructions.push_back(Inst);
@@ -943,17 +1022,27 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
case Mips::LoadImm32Reg:
case Mips::LoadAddr32Imm:
case Mips::LoadAddr32Reg:
+ case Mips::LoadImm64Reg:
return true;
default:
return false;
}
}
-void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
+bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
switch (Inst.getOpcode()) {
+ default:
+ assert(0 && "unimplemented expansion");
+ return true;
case Mips::LoadImm32Reg:
return expandLoadImm(Inst, IDLoc, Instructions);
+ case Mips::LoadImm64Reg:
+ if (!isGP64bit()) {
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ }
+ return expandLoadImm(Inst, IDLoc, Instructions);
case Mips::LoadAddr32Imm:
return expandLoadAddressImm(Inst, IDLoc, Instructions);
case Mips::LoadAddr32Reg:
@@ -961,7 +1050,31 @@ void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
-void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+namespace {
+template <int Shift, bool PerformShift>
+void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
+ MCInst tmpInst;
+ if (PerformShift) {
+ tmpInst.setOpcode(Mips::DSLL);
+ tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+ tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+ tmpInst.addOperand(MCOperand::CreateImm(16));
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+ tmpInst.clear();
+ }
+ tmpInst.setOpcode(Mips::ORi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+ tmpInst.addOperand(MCOperand::CreateReg(RegNo));
+ tmpInst.addOperand(
+ MCOperand::CreateImm(((Value & (0xffffLL << Shift)) >> Shift)));
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+}
+}
+
+bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
MCInst tmpInst;
const MCOperand &ImmOp = Inst.getOperand(1);
@@ -969,8 +1082,10 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
const MCOperand &RegOp = Inst.getOperand(0);
assert(RegOp.isReg() && "expected register operand kind");
- int ImmValue = ImmOp.getImm();
+ int64_t ImmValue = ImmOp.getImm();
tmpInst.setLoc(IDLoc);
+ // FIXME: gas has a special case for values that are 000...1111, which
+ // it emits as li -1 followed by a dsrl.
if (0 <= ImmValue && ImmValue <= 65535) {
// For 0 <= j <= 65535.
// li d,j => ori d,$zero,j
@@ -987,25 +1102,76 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
tmpInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
Instructions.push_back(tmpInst);
- } else {
- // For any other value of j that is representable as a 32-bit integer.
+ } else if ((ImmValue & 0xffffffff) == ImmValue) {
+ // For any value of j that is representable as a 32-bit integer, create
+ // a sequence of:
// li d,j => lui d,hi16(j)
// ori d,d,lo16(j)
tmpInst.setOpcode(Mips::LUi);
tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
Instructions.push_back(tmpInst);
- tmpInst.clear();
- tmpInst.setOpcode(Mips::ORi);
+ createShiftOr<0, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+ } else if ((ImmValue & (0xffffLL << 48)) == 0) {
+ if (!isGP64bit()) {
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ }
+
+ //            <------- lo32 ------>
+ // <------- hi32 ------>
+ //            <- hi16 -> <- lo16 ->
+ // _________________________________
+ // |          |          |          |
+ // | 16-bits  | 16-bits  | 16-bits  |
+ // |__________|__________|__________|
+ //
+ // For any value of j that is representable as a 48-bit integer, create
+ // a sequence of:
+ // li d,j => lui d,hi16(j)
+ // ori d,d,hi16(lo32(j))
+ // dsll d,d,16
+ // ori d,d,lo16(lo32(j))
+ tmpInst.setOpcode(Mips::LUi);
tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(
+ MCOperand::CreateImm((ImmValue & (0xffffLL << 32)) >> 32));
+ Instructions.push_back(tmpInst);
+ createShiftOr<16, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+ createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+ } else {
+ if (!isGP64bit()) {
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ }
+
+ // <------- hi32 ------> <------- lo32 ------>
+ //                       <- hi16 -> <- lo16 ->
+ // ___________________________________________
+ // |          |          |          |          |
+ // | 16-bits  | 16-bits  | 16-bits  | 16-bits  |
+ // |__________|__________|__________|__________|
+ //
+ // For any value of j that isn't representable as a 48-bit integer, create
+ // a sequence of:
+ // li d,j => lui d,hi16(j)
+ // ori d,d,lo16(hi32(j))
+ // dsll d,d,16
+ // ori d,d,hi16(lo32(j))
+ // dsll d,d,16
+ // ori d,d,lo16(lo32(j))
+ tmpInst.setOpcode(Mips::LUi);
tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
- tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
- tmpInst.setLoc(IDLoc);
+ tmpInst.addOperand(
+ MCOperand::CreateImm((ImmValue & (0xffffLL << 48)) >> 48));
Instructions.push_back(tmpInst);
+ createShiftOr<32, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+ createShiftOr<16, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
+ createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
}
+ return false;
}
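Reviewer note: the expansion above picks one of four lui/ori/dsll schedules depending on how many 16-bit chunks of the immediate are populated; createShiftOr emits the repeated dsll+ori tail. A minimal standalone sketch of the same chunk schedule, using illustrative names rather than the LLVM MC API:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Step { const char *Op; uint16_t Imm; };

    // Sketch: plan the li expansion for a 64-bit immediate, mirroring the
    // four cases of expandLoadImm above.
    static std::vector<Step> planLoadImm(int64_t V) {
      std::vector<Step> Plan;
      if (V >= 0 && V <= 65535) {
        Plan.push_back({"ori   d, $zero,", (uint16_t)V});
      } else if (V < 0 && V >= -32768) {
        Plan.push_back({"addiu d, $zero,", (uint16_t)V});
      } else if ((V & 0xffffffffLL) == V) {           // fits in 32 bits
        Plan.push_back({"lui   d,", (uint16_t)((V >> 16) & 0xffff)});
        Plan.push_back({"ori   d, d,", (uint16_t)(V & 0xffff)});
      } else if ((V & (0xffffLL << 48)) == 0) {       // fits in 48 bits
        Plan.push_back({"lui   d,", (uint16_t)((V >> 32) & 0xffff)});
        Plan.push_back({"ori   d, d,", (uint16_t)((V >> 16) & 0xffff)});
        Plan.push_back({"dsll d, d, 16 ; ori d, d,", (uint16_t)(V & 0xffff)});
      } else {                                        // full 64 bits
        Plan.push_back({"lui   d,", (uint16_t)((V >> 48) & 0xffff)});
        Plan.push_back({"ori   d, d,", (uint16_t)((V >> 32) & 0xffff)});
        Plan.push_back({"dsll d, d, 16 ; ori d, d,", (uint16_t)((V >> 16) & 0xffff)});
        Plan.push_back({"dsll d, d, 16 ; ori d, d,", (uint16_t)(V & 0xffff)});
      }
      return Plan;
    }

    int main() {
      for (const Step &S : planLoadImm(0x123456789abcLL))
        std::printf("%s 0x%04x\n", S.Op, S.Imm);
    }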
-void
+bool
MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
MCInst tmpInst;
@@ -1046,9 +1212,10 @@ MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
Instructions.push_back(tmpInst);
}
+ return false;
}
-void
+bool
MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
MCInst tmpInst;
@@ -1080,6 +1247,7 @@ MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
Instructions.push_back(tmpInst);
}
+ return false;
}
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
@@ -1090,8 +1258,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
unsigned ImmOffset, HiOffset, LoOffset;
const MCExpr *ExprOffset;
unsigned TmpRegNum;
- unsigned AtRegNum = getReg(
- (isGP64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
// 1st operand is either the source or destination register.
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
unsigned RegOpNum = Inst.getOperand(0).getReg();
@@ -1111,10 +1277,46 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
ExprOffset = Inst.getOperand(2).getExpr();
// All instructions will have the same location.
TempInst.setLoc(IDLoc);
- // 1st instruction in expansion is LUi. For load instruction we can use
- // the dst register as a temporary if base and dst are different,
- // but for stores we must use $at.
- TmpRegNum = (isLoad && (BaseRegNum != RegOpNum)) ? RegOpNum : AtRegNum;
+ // These are some of the types of expansions we perform here:
+ // 1) lw $8, sym => lui $8, %hi(sym)
+ // lw $8, %lo(sym)($8)
+ // 2) lw $8, offset($9) => lui $8, %hi(offset)
+ // add $8, $8, $9
+ // lw $8, %lo(offset)($9)
+ // 3) lw $8, offset($8) => lui $at, %hi(offset)
+ // add $at, $at, $8
+ // lw $8, %lo(offset)($at)
+ // 4) sw $8, sym => lui $at, %hi(sym)
+ // sw $8, %lo(sym)($at)
+ // 5) sw $8, offset($8) => lui $at, %hi(offset)
+ // add $at, $at, $8
+ // sw $8, %lo(offset)($at)
+ // 6) ldc1 $f0, sym => lui $at, %hi(sym)
+ // ldc1 $f0, %lo(sym)($at)
+ //
+ // For load instructions we can use the destination register as a temporary
+ // if base and dst are different (examples 1 and 2) and if the destination
+ // register is a GPR. Otherwise we must use $at (example 6) and error out if
+ // it is not available. For stores we must use $at (examples 4 and 5)
+ // because we must not clobber the source register while setting up the
+ // offset.
+ const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
+ int16_t RegClassOp0 = Desc.OpInfo[0].RegClass;
+ unsigned RegClassIDOp0 =
+ getContext().getRegisterInfo()->getRegClass(RegClassOp0).getID();
+ bool IsGPR = (RegClassIDOp0 == Mips::GPR32RegClassID) ||
+ (RegClassIDOp0 == Mips::GPR64RegClassID);
+ if (isLoad && IsGPR && (BaseRegNum != RegOpNum))
+ TmpRegNum = RegOpNum;
+ else {
+ int AT = getATReg(IDLoc);
+ // At this point we need AT to perform the expansions and we exit if it is
+ // not available.
+ if (!AT)
+ return;
+ TmpRegNum = getReg(
+ (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, AT);
+ }
+
TempInst.setOpcode(Mips::LUi);
TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
if (isImmOpnd)
@@ -1164,10 +1366,24 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
TempInst.clear();
}
-bool MipsAsmParser::MatchAndEmitInstruction(
- SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, MCStreamer &Out,
- unsigned &ErrorInfo, bool MatchingInlineAsm) {
+unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ // As described by the Mips32r2 spec, the registers Rd and Rs for
+ // jalr.hb must be different.
+ unsigned Opcode = Inst.getOpcode();
+
+ if (Opcode == Mips::JALR_HB &&
+ (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()))
+ return Match_RequiresDifferentSrcAndDst;
+
+ return Match_Success;
+}
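Reviewer note: checkTargetMatchPredicate gives the target a veto over an otherwise-matched instruction. A minimal sketch of the jalr.hb rule it implements (plain register numbers, illustrative helper name):

    #include <cstdio>

    // Sketch: per the MIPS32r2 spec, "jalr.hb rd, rs" requires rd != rs so
    // the hazard barrier cannot corrupt its own return address.
    static bool jalrHbOperandsValid(unsigned Rd, unsigned Rs) {
      return Rd != Rs;
    }

    int main() {
      std::printf("%d %d\n", jalrHbOperandsValid(31, 25),  // 1: accepted
                  jalrHbOperandsValid(25, 25));            // 0: rejected
    }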
+
+bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+
MCInst Inst;
SmallVector<MCInst, 8> Instructions;
unsigned MatchResult =
@@ -1192,7 +1408,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
@@ -1201,6 +1417,8 @@ bool MipsAsmParser::MatchAndEmitInstruction(
}
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction");
+ case Match_RequiresDifferentSrcAndDst:
+ return Error(IDLoc, "source and destination must be different");
}
return true;
}
@@ -1254,7 +1472,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
.Case("t9", 25)
.Default(-1);
- if (isN32() || isN64()) {
+ if (isABI_N32() || isABI_N64()) {
// Although the SGI documentation just cuts out t0-t3 for n32/n64, GNU
// pushes the values of t0-t3 to override the o32/o64 values for t4-t7.
// We support both cases, so for t0-t3 we'll just map them to t4-t7.
@@ -1354,10 +1572,11 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) {
return true;
}
-int MipsAsmParser::getATReg() {
+int MipsAsmParser::getATReg(SMLoc Loc) {
int AT = Options.getATRegNum();
if (AT == 0)
- TokError("Pseudo instruction requires $at, which is not available");
+ reportParseError(Loc,
+ "Pseudo instruction requires $at, which is not available");
return AT;
}
@@ -1366,7 +1585,7 @@ unsigned MipsAsmParser::getReg(int RC, int RegNo) {
}
unsigned MipsAsmParser::getGPR(int RegNo) {
- return getReg(isGP64() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID,
+ return getReg(isGP64bit() ? Mips::GPR64RegClassID : Mips::GPR32RegClassID,
RegNo);
}
@@ -1378,9 +1597,7 @@ int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
return getReg(RegClass, RegNum);
}
-bool
-MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- StringRef Mnemonic) {
+bool MipsAsmParser::ParseOperand(OperandVector &Operands, StringRef Mnemonic) {
DEBUG(dbgs() << "ParseOperand\n");
// Check if the current operand has a custom associated parser, if so, try to
@@ -1431,6 +1648,7 @@ MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
case AsmToken::Minus:
case AsmToken::Plus:
case AsmToken::Integer:
+ case AsmToken::Tilde:
case AsmToken::String: {
DEBUG(dbgs() << ".. generic integer\n");
OperandMatchResultTy ResTy = ParseImm(Operands);
@@ -1578,11 +1796,11 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
- SmallVector<MCParsedAsmOperand *, 1> Operands;
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
OperandMatchResultTy ResTy = ParseAnyRegister(Operands);
if (ResTy == MatchOperand_Success) {
assert(Operands.size() == 1);
- MipsOperand &Operand = *static_cast<MipsOperand *>(Operands.front());
+ MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
StartLoc = Operand.getStartLoc();
EndLoc = Operand.getEndLoc();
@@ -1592,11 +1810,9 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
// register is a parse error.
if (Operand.isGPRAsmReg()) {
// Resolve to GPR32 or GPR64 appropriately.
- RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
+ RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
}
- delete &Operand;
-
return (RegNo == (unsigned)-1);
}
@@ -1632,8 +1848,8 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
return Result;
}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseMemOperand(OperandVector &Operands) {
DEBUG(dbgs() << "parseMemOperand\n");
const MCExpr *IdVal = nullptr;
SMLoc S;
@@ -1653,8 +1869,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
const AsmToken &Tok = Parser.getTok(); // Get the next token.
if (Tok.isNot(AsmToken::LParen)) {
- MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
- if (Mnemonic->getToken() == "la") {
+ MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
+ if (Mnemonic.getToken() == "la") {
SMLoc E =
SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
@@ -1666,9 +1882,10 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
// Zero register assumed, add a memory operand with ZERO as its base.
// "Base" will be managed by k_Memory.
- MipsOperand *Base = MipsOperand::CreateGPRReg(
- 0, getContext().getRegisterInfo(), S, E, *this);
- Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this));
+ auto Base = MipsOperand::CreateGPRReg(0, getContext().getRegisterInfo(),
+ S, E, *this);
+ Operands.push_back(
+ MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
return MatchOperand_Success;
}
Error(Parser.getTok().getLoc(), "'(' expected");
@@ -1695,7 +1912,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
IdVal = MCConstantExpr::Create(0, getContext());
// Replace the register operand with the memory operand.
- MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
+ std::unique_ptr<MipsOperand> op(
+ static_cast<MipsOperand *>(Operands.back().release()));
// Remove the register from the operands.
// "op" will be managed by k_Memory.
Operands.pop_back();
@@ -1709,12 +1927,11 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
getContext());
}
- Operands.push_back(MipsOperand::CreateMem(op, IdVal, S, E, *this));
+ Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this));
return MatchOperand_Success;
}
-bool MipsAsmParser::searchSymbolAlias(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
if (Sym) {
@@ -1740,9 +1957,8 @@ bool MipsAsmParser::searchSymbolAlias(
} else if (Expr->getKind() == MCExpr::Constant) {
Parser.Lex();
const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
- MipsOperand *op =
- MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this);
- Operands.push_back(op);
+ Operands.push_back(
+ MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc(), *this));
return true;
}
}
@@ -1750,9 +1966,9 @@ bool MipsAsmParser::searchSymbolAlias(
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::MatchAnyRegisterNameWithoutDollar(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, StringRef Identifier,
- SMLoc S) {
+MipsAsmParser::MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+ StringRef Identifier,
+ SMLoc S) {
int Index = matchCPURegisterName(Identifier);
if (Index != -1) {
Operands.push_back(MipsOperand::CreateGPRReg(
@@ -1799,8 +2015,7 @@ MipsAsmParser::MatchAnyRegisterNameWithoutDollar(
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::MatchAnyRegisterWithoutDollar(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands, SMLoc S) {
+MipsAsmParser::MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
auto Token = Parser.getLexer().peekTok(false);
if (Token.is(AsmToken::Identifier)) {
@@ -1822,8 +2037,8 @@ MipsAsmParser::MatchAnyRegisterWithoutDollar(
return MatchOperand_NoMatch;
}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseAnyRegister(OperandVector &Operands) {
DEBUG(dbgs() << "ParseAnyRegister\n");
auto Token = Parser.getTok();
@@ -1850,7 +2065,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseAnyRegister(
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::ParseImm(OperandVector &Operands) {
switch (getLexer().getKind()) {
default:
return MatchOperand_NoMatch;
@@ -1858,6 +2073,7 @@ MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
case AsmToken::Minus:
case AsmToken::Plus:
case AsmToken::Integer:
+ case AsmToken::Tilde:
case AsmToken::String:
break;
}
@@ -1872,8 +2088,8 @@ MipsAsmParser::ParseImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
return MatchOperand_Success;
}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::ParseJumpTarget(OperandVector &Operands) {
DEBUG(dbgs() << "ParseJumpTarget\n");
SMLoc S = getLexer().getLoc();
@@ -1899,7 +2115,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::ParseJumpTarget(
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::parseInvNum(OperandVector &Operands) {
const MCExpr *IdVal;
// If the first token is '$' we may have a register operand.
if (Parser.getTok().is(AsmToken::Dollar))
@@ -1917,7 +2133,7 @@ MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+MipsAsmParser::ParseLSAImm(OperandVector &Operands) {
switch (getLexer().getKind()) {
default:
return MatchOperand_NoMatch;
@@ -1996,8 +2212,7 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
/// ::= '(', register, ')'
/// handle it before we iterate so we don't get tripped up by the lack of
/// a comma.
-bool MipsAsmParser::ParseParenSuffix(
- StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+bool MipsAsmParser::ParseParenSuffix(StringRef Name, OperandVector &Operands) {
if (getLexer().is(AsmToken::LParen)) {
Operands.push_back(
MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
@@ -2025,8 +2240,8 @@ bool MipsAsmParser::ParseParenSuffix(
/// ::= '[', integer, ']'
/// handle it before we iterate so we don't get tripped up by the lack of
/// a comma.
-bool MipsAsmParser::ParseBracketSuffix(
- StringRef Name, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+bool MipsAsmParser::ParseBracketSuffix(StringRef Name,
+ OperandVector &Operands) {
if (getLexer().is(AsmToken::LBrac)) {
Operands.push_back(
MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
@@ -2048,10 +2263,12 @@ bool MipsAsmParser::ParseBracketSuffix(
return false;
}
-bool MipsAsmParser::ParseInstruction(
- ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
DEBUG(dbgs() << "ParseInstruction\n");
+ // We have reached the first instruction; the .module directive is
+ // forbidden after this point.
+ getTargetStreamer().setCanHaveModuleDir(false);
// Check if we have a valid mnemonic.
if (!mnemonicIsValid(Name, 0)) {
Parser.eatToEndOfStatement();
@@ -2098,13 +2315,13 @@ bool MipsAsmParser::ParseInstruction(
return false;
}
-bool MipsAsmParser::reportParseError(StringRef ErrorMsg) {
+bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
SMLoc Loc = getLexer().getLoc();
Parser.eatToEndOfStatement();
return Error(Loc, ErrorMsg);
}
-bool MipsAsmParser::reportParseError(SMLoc Loc, StringRef ErrorMsg) {
+bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
return Error(Loc, ErrorMsg);
}
@@ -2238,6 +2455,32 @@ bool MipsAsmParser::parseSetNoMips16Directive() {
return false;
}
+bool MipsAsmParser::parseSetFpDirective() {
+ MipsABIFlagsSection::FpABIKind FpAbiVal;
+ // Line can be: .set fp=32
+ // .set fp=xx
+ // .set fp=64
+ Parser.Lex(); // Eat fp token
+ AsmToken Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Equal)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Parser.Lex(); // Eat '=' token.
+ Tok = Parser.getTok();
+
+ if (!parseFpABIValue(FpAbiVal, ".set"))
+ return false;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
bool MipsAsmParser::parseSetAssignment() {
StringRef Name;
const MCExpr *Value;
@@ -2296,25 +2539,6 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
return false;
}
-bool MipsAsmParser::parseRegister(unsigned &RegNum) {
- if (!getLexer().is(AsmToken::Dollar))
- return false;
-
- Parser.Lex();
-
- const AsmToken &Reg = Parser.getTok();
- if (Reg.is(AsmToken::Identifier)) {
- RegNum = matchCPURegisterName(Reg.getIdentifier());
- } else if (Reg.is(AsmToken::Integer)) {
- RegNum = Reg.getIntVal();
- } else {
- return false;
- }
-
- Parser.Lex();
- return true;
-}
-
bool MipsAsmParser::eatComma(StringRef ErrorStr) {
if (getLexer().isNot(AsmToken::Comma)) {
SMLoc Loc = getLexer().getLoc();
@@ -2332,21 +2556,20 @@ bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) {
// FIXME: Warn if cpload is used in Mips16 mode.
- SmallVector<MCParsedAsmOperand *, 1> Reg;
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
OperandMatchResultTy ResTy = ParseAnyRegister(Reg);
if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
reportParseError("expected register containing function address");
return false;
}
- MipsOperand *RegOpnd = static_cast<MipsOperand *>(Reg[0]);
- if (!RegOpnd->isGPRAsmReg()) {
- reportParseError(RegOpnd->getStartLoc(), "invalid register");
+ MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+ if (!RegOpnd.isGPRAsmReg()) {
+ reportParseError(RegOpnd.getStartLoc(), "invalid register");
return false;
}
- getTargetStreamer().emitDirectiveCpload(RegOpnd->getGPR32Reg());
- delete RegOpnd;
+ getTargetStreamer().emitDirectiveCpload(RegOpnd.getGPR32Reg());
return false;
}
@@ -2355,23 +2578,48 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
unsigned Save;
bool SaveIsReg = true;
- if (!parseRegister(FuncReg))
- return reportParseError("expected register containing function address");
- FuncReg = getGPR(FuncReg);
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+ OperandMatchResultTy ResTy = ParseAnyRegister(TmpReg);
+ if (ResTy == MatchOperand_NoMatch) {
+ reportParseError("expected register containing function address");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+ if (!FuncRegOpnd.isGPRAsmReg()) {
+ reportParseError(FuncRegOpnd.getStartLoc(), "invalid register");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+
+ FuncReg = FuncRegOpnd.getGPR32Reg();
+ TmpReg.clear();
if (!eatComma("expected comma parsing directive"))
return true;
- if (!parseRegister(Save)) {
+ ResTy = ParseAnyRegister(TmpReg);
+ if (ResTy == MatchOperand_NoMatch) {
const AsmToken &Tok = Parser.getTok();
if (Tok.is(AsmToken::Integer)) {
Save = Tok.getIntVal();
SaveIsReg = false;
Parser.Lex();
- } else
- return reportParseError("expected save register or stack offset");
- } else
- Save = getGPR(Save);
+ } else {
+ reportParseError("expected save register or stack offset");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ } else {
+ MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+ if (!SaveOpnd.isGPRAsmReg()) {
+ reportParseError(SaveOpnd.getStartLoc(), "invalid register");
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ Save = SaveOpnd.getGPR32Reg();
+ }
if (!eatComma("expected comma parsing directive"))
return true;
@@ -2414,6 +2662,8 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetNoAtDirective();
} else if (Tok.getString() == "at") {
return parseSetAtDirective();
+ } else if (Tok.getString() == "fp") {
+ return parseSetFpDirective();
} else if (Tok.getString() == "reorder") {
return parseSetReorderDirective();
} else if (Tok.getString() == "noreorder") {
@@ -2546,6 +2796,134 @@ bool MipsAsmParser::parseDirectiveOption() {
return false;
}
+/// parseDirectiveModule
+/// ::= .module oddspreg
+/// ::= .module nooddspreg
+/// ::= .module fp=value
+bool MipsAsmParser::parseDirectiveModule() {
+ MCAsmLexer &Lexer = getLexer();
+ SMLoc L = Lexer.getLoc();
+
+ if (!getTargetStreamer().getCanHaveModuleDir()) {
+ // TODO: produce a better error message.
+ reportParseError(".module directive must appear before any code");
+ return false;
+ }
+
+ if (Lexer.is(AsmToken::Identifier)) {
+ StringRef Option = Parser.getTok().getString();
+ Parser.Lex();
+
+ if (Option == "oddspreg") {
+ getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
+ clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("Expected end of statement");
+ return false;
+ }
+
+ return false;
+ } else if (Option == "nooddspreg") {
+ if (!isABI_O32()) {
+ Error(L, "'.module nooddspreg' requires the O32 ABI");
+ return false;
+ }
+
+ getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
+ setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("Expected end of statement");
+ return false;
+ }
+
+ return false;
+ } else if (Option == "fp") {
+ return parseDirectiveModuleFP();
+ }
+
+ return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
+ }
+
+ return false;
+}
+
+/// parseDirectiveModuleFP
+/// ::= =32
+/// ::= =xx
+/// ::= =64
+bool MipsAsmParser::parseDirectiveModuleFP() {
+ MCAsmLexer &Lexer = getLexer();
+
+ if (Lexer.isNot(AsmToken::Equal)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Parser.Lex(); // Eat '=' token.
+
+ MipsABIFlagsSection::FpABIKind FpABI;
+ if (!parseFpABIValue(FpABI, ".module"))
+ return false;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+
+ // Emit appropriate flags.
+ getTargetStreamer().emitDirectiveModuleFP(FpABI, isABI_O32());
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+ StringRef Directive) {
+ MCAsmLexer &Lexer = getLexer();
+
+ if (Lexer.is(AsmToken::Identifier)) {
+ StringRef Value = Parser.getTok().getString();
+ Parser.Lex();
+
+ if (Value != "xx") {
+ reportParseError("unsupported value, expected 'xx', '32' or '64'");
+ return false;
+ }
+
+ if (!isABI_O32()) {
+ reportParseError("'" + Directive + " fp=xx' requires the O32 ABI");
+ return false;
+ }
+
+ FpABI = MipsABIFlagsSection::FpABIKind::XX;
+ return true;
+ }
+
+ if (Lexer.is(AsmToken::Integer)) {
+ unsigned Value = Parser.getTok().getIntVal();
+ Parser.Lex();
+
+ if (Value != 32 && Value != 64) {
+ reportParseError("unsupported value, expected 'xx', '32' or '64'");
+ return false;
+ }
+
+ if (Value == 32) {
+ if (!isABI_O32()) {
+ reportParseError("'" + Directive + " fp=32' requires the O32 ABI");
+ return false;
+ }
+
+ FpABI = MipsABIFlagsSection::FpABIKind::S32;
+ } else
+ FpABI = MipsABIFlagsSection::FpABIKind::S64;
+
+ return true;
+ }
+
+ return false;
+}
+
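Reviewer note: parseFpABIValue accepts exactly three spellings, two of which are O32-only. A sketch of the same acceptance logic over plain strings (the integer tags are illustrative, not the FpABIKind enum):

    #include <cstdio>
    #include <string>

    // Sketch: validate a ".set fp=" / ".module fp=" value the way
    // parseFpABIValue above does. Returns an FpABIKind-like tag or -1.
    static int parseFpValue(const std::string &V, bool IsO32) {
      if (V == "xx")
        return IsO32 ? 1 : -1;  // XX requires the O32 ABI
      if (V == "32")
        return IsO32 ? 2 : -1;  // S32 requires the O32 ABI
      if (V == "64")
        return 3;               // S64 is accepted under any ABI
      return -1;                // anything else: "unsupported value" error
    }

    int main() {
      std::printf("%d %d %d\n", parseFpValue("xx", true),
                  parseFpValue("32", false), parseFpValue("64", false));
      // prints: 1 -1 3
    }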
bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -2624,6 +3002,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".cpsetup")
return parseDirectiveCPSetup();
+ if (IDVal == ".module")
+ return parseDirectiveModule();
+
return true;
}
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 95670aa..902b877 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -57,16 +57,24 @@ class MipsDisassembler : public MipsDisassemblerBase {
public:
/// Constructor - Initializes the disassembler.
///
- MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
- bool bigEndian) :
- MipsDisassemblerBase(STI, Ctx, bigEndian) {
- IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
- }
+ MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian)
+ : MipsDisassemblerBase(STI, Ctx, bigEndian) {
+ IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+ }
- bool isMips32r6() const {
+ bool hasMips3() const { return STI.getFeatureBits() & Mips::FeatureMips3; }
+ bool hasMips32() const { return STI.getFeatureBits() & Mips::FeatureMips32; }
+ bool hasMips32r6() const {
return STI.getFeatureBits() & Mips::FeatureMips32r6;
}
+ bool isGP64() const { return STI.getFeatureBits() & Mips::FeatureGP64Bit; }
+
+ bool hasCOP3() const {
+ // Only present in MIPS-I and MIPS-II
+ return !hasMips32() && !hasMips3();
+ }
+
/// getInstruction - See MCDisassembler.
DecodeStatus getInstruction(MCInst &instr,
uint64_t &size,
@@ -149,6 +157,10 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -260,6 +272,11 @@ static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeSimm16(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -285,6 +302,9 @@ static DecodeStatus DecodeExtSize(MCInst &Inst,
static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
/// handle.
template <typename InsnType>
@@ -316,6 +336,11 @@ static DecodeStatus
DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
const void *Decoder);
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+ const void *Decoder);
+
namespace llvm {
extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
TheMips64elTarget;
@@ -511,6 +536,7 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+ bool HasRs = false;
if (Rt == 0)
return MCDisassembler::Fail;
@@ -518,8 +544,14 @@ static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
MI.setOpcode(Mips::BLEZC);
else if (Rs == Rt)
MI.setOpcode(Mips::BGEZC);
- else
- return MCDisassembler::Fail; // FIXME: BGEC is not implemented yet.
+ else {
+ HasRs = true;
+ MI.setOpcode(Mips::BGEC);
+ }
+
+ if (HasRs)
+ MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
Rt)));
@@ -544,6 +576,8 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
// BLTZC if rs == rt && rt != 0
// BLTC if rs != rt && rs != 0 && rt != 0
+ bool HasRs = false;
+
InsnType Rs = fieldFromInstruction(insn, 21, 5);
InsnType Rt = fieldFromInstruction(insn, 16, 5);
InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
@@ -554,8 +588,14 @@ static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
MI.setOpcode(Mips::BGTZC);
else if (Rs == Rt)
MI.setOpcode(Mips::BLTZC);
- else
- return MCDisassembler::Fail; // FIXME: BLTC is not implemented yet.
+ else {
+ MI.setOpcode(Mips::BLTC);
+ HasRs = true;
+ }
+
+ if (HasRs)
+ MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
Rt)));
@@ -595,8 +635,11 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
} else if (Rs == Rt) {
MI.setOpcode(Mips::BLTZALC);
HasRs = true;
- } else
- return MCDisassembler::Fail; // BLTUC not implemented yet
+ } else {
+ MI.setOpcode(Mips::BLTUC);
+ HasRs = true;
+ HasRt = true;
+ }
if (HasRs)
MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
@@ -611,6 +654,48 @@ static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
return MCDisassembler::Success;
}
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
+ uint64_t Address,
+ const void *Decoder) {
+ // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+ // (otherwise we would have matched the BLEZ instruction from the earlier
+ // ISAs instead).
+ //
+ // We have:
+ // 0b000110 sssss ttttt iiiiiiiiiiiiiiii
+ // Invalid if rt == 0
+ // BLEZALC if rs == 0 && rt != 0
+ // BGEZALC if rs == rt && rt != 0
+ // BGEUC if rs != rt && rs != 0 && rt != 0
+
+ InsnType Rs = fieldFromInstruction(insn, 21, 5);
+ InsnType Rt = fieldFromInstruction(insn, 16, 5);
+ InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+ bool HasRs = false;
+
+ if (Rt == 0)
+ return MCDisassembler::Fail;
+ else if (Rs == 0)
+ MI.setOpcode(Mips::BLEZALC);
+ else if (Rs == Rt)
+ MI.setOpcode(Mips::BGEZALC);
+ else {
+ HasRs = true;
+ MI.setOpcode(Mips::BGEUC);
+ }
+
+ if (HasRs)
+ MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rs)));
+ MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+ Rt)));
+
+ MI.addOperand(MCOperand::CreateImm(Imm));
+
+ return MCDisassembler::Success;
+}
+
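Reviewer note: all four *GroupBranch decoders follow the same shape: the (rs, rt) field pair disambiguates several r6 compact branches sharing one major opcode. A standalone sketch of the BLEZ-group classification, returning mnemonic strings instead of MCInst opcodes:

    #include <cstdint>
    #include <cstdio>

    // Sketch: classify a BLEZ-group (major opcode 0b000110) r6 instruction
    // from its rs/rt fields, mirroring DecodeBlezGroupBranch above.
    static const char *classifyBlezGroup(uint32_t Rs, uint32_t Rt) {
      if (Rt == 0)
        return nullptr;        // invalid encoding in this group
      if (Rs == 0)
        return "blezalc";      // compact branch-and-link, rt only
      if (Rs == Rt)
        return "bgezalc";      // rs == rt form
      return "bgeuc";          // distinct rs and rt, both operands encoded
    }

    int main() {
      std::printf("%s\n", classifyBlezGroup(0, 4));  // blezalc
      std::printf("%s\n", classifyBlezGroup(5, 5));  // bgezalc
      std::printf("%s\n", classifyBlezGroup(5, 4));  // bgeuc
    }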
/// readInstruction - read four bytes from the MemoryObject
/// and return the 32-bit word sorted according to the given endianness
static DecodeStatus readInstruction32(const MemoryObject &region,
@@ -670,6 +755,7 @@ MipsDisassembler::getInstruction(MCInst &instr,
return MCDisassembler::Fail;
if (IsMicroMips) {
+ DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
this, STI);
@@ -680,7 +766,28 @@ MipsDisassembler::getInstruction(MCInst &instr,
return MCDisassembler::Fail;
}
- if (isMips32r6()) {
+ if (hasCOP3()) {
+ DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+ Result =
+ decodeInstruction(DecoderTableCOP3_32, instr, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
+ if (hasMips32r6() && isGP64()) {
+ DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+ Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, instr, Insn,
+ Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ }
+
+ if (hasMips32r6()) {
+ DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn,
Address, this, STI);
if (Result != MCDisassembler::Fail) {
@@ -689,6 +796,7 @@ MipsDisassembler::getInstruction(MCInst &instr,
}
}
+ DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
this, STI);
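Reviewer note: getInstruction now tries decoder tables from most to least specific, falling through on a miss. A sketch of that cascade with a hypothetical table type (the real tables are TableGen-generated):

    #include <cstdint>
    #include <vector>

    // Sketch: try decoder tables in priority order, stop at the first hit.
    struct Table { bool (*decode)(uint32_t Insn); };

    static bool decodeWithFallback(uint32_t Insn, bool HasCOP3, bool HasR6,
                                   bool IsGP64, const Table &COP3,
                                   const Table &R6GP64, const Table &R6,
                                   const Table &Generic) {
      std::vector<const Table *> Order;
      if (HasCOP3)         Order.push_back(&COP3);   // MIPS-I/II only
      if (HasR6 && IsGP64) Order.push_back(&R6GP64); // r6 with 64-bit GPRs
      if (HasR6)           Order.push_back(&R6);
      Order.push_back(&Generic);                     // always tried last
      for (const Table *T : Order)
        if (T->decode(Insn))
          return true;
      return false;
    }

    int main() {
      Table Miss{[](uint32_t) { return false; }};
      Table Hit{[](uint32_t) { return true; }};
      return decodeWithFallback(0, true, true, true, Miss, Miss, Miss, Hit)
                 ? 0 : 1; // Generic table catches the word
    }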
@@ -840,6 +948,17 @@ static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeMem(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -965,6 +1084,27 @@ static DecodeStatus DecodeFMem(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff);
+ unsigned Rt = fieldFromInstruction(Insn, 16, 5);
+ unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+ Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ if (Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6) {
+ Inst.addOperand(MCOperand::CreateReg(Rt));
+ }
+
+ Inst.addOperand(MCOperand::CreateReg(Rt));
+ Inst.addOperand(MCOperand::CreateReg(Base));
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+
+ return MCDisassembler::Success;
+}
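Reviewer note: the SPECIAL3 LL/SC encodings carry a 9-bit signed offset at bit 7 instead of the classic 16-bit offset at bit 0. A sketch of the field extraction with plain bit arithmetic (no LLVM helpers):

    #include <cstdint>
    #include <cstdio>

    // Sketch: pull rt, base, and the sign-extended 9-bit offset out of a
    // SPECIAL3 LL/SC word, as DecodeSpecial3LlSc does above.
    static void decodeLlScR6(uint32_t Insn, unsigned &Rt, unsigned &Base,
                             int64_t &Offset) {
      Rt = (Insn >> 16) & 0x1f;
      Base = (Insn >> 21) & 0x1f;
      uint32_t Raw = (Insn >> 7) & 0x1ff;           // 9-bit field at bit 7
      Offset = (int64_t)(int32_t)(Raw << 23) >> 23; // sign-extend from bit 8
    }

    int main() {
      unsigned Rt, Base;
      int64_t Off;
      decodeLlScR6(0x7e280fb6u, Rt, Base, Off); // arbitrary example word
      std::printf("rt=%u base=%u offset=%lld\n", Rt, Base, (long long)Off);
    }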
static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
unsigned RegNo,
@@ -1197,3 +1337,9 @@ static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2));
return MCDisassembler::Success;
}
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) << 3));
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/Mips/MCTargetDesc/Android.mk b/lib/Target/Mips/MCTargetDesc/Android.mk
index 7ee11a1..c8d18fc 100644
--- a/lib/Target/Mips/MCTargetDesc/Android.mk
+++ b/lib/Target/Mips/MCTargetDesc/Android.mk
@@ -7,6 +7,7 @@ mips_mc_desc_TBLGEN_TABLES := \
MipsGenSubtargetInfo.inc
mips_mc_desc_SRC_FILES := \
+ MipsABIFlagsSection.cpp \
MipsAsmBackend.cpp \
MipsELFObjectWriter.cpp \
MipsELFStreamer.cpp \
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index d3e2fd7..c14ee35 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMMipsDesc
+ MipsABIFlagsSection.cpp
MipsAsmBackend.cpp
MipsELFObjectWriter.cpp
MipsELFStreamer.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
new file mode 100644
index 0000000..52d5dd3
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -0,0 +1,60 @@
+//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIFlagsSection.h"
+
+using namespace llvm;
+
+uint8_t MipsABIFlagsSection::getFpABIValue() {
+ switch (FpABI) {
+ case FpABIKind::ANY:
+ return Val_GNU_MIPS_ABI_FP_ANY;
+ case FpABIKind::XX:
+ return Val_GNU_MIPS_ABI_FP_XX;
+ case FpABIKind::S32:
+ return Val_GNU_MIPS_ABI_FP_DOUBLE;
+ case FpABIKind::S64:
+ if (Is32BitABI)
+ return OddSPReg ? Val_GNU_MIPS_ABI_FP_64 : Val_GNU_MIPS_ABI_FP_64A;
+ return Val_GNU_MIPS_ABI_FP_DOUBLE;
+ }
+
+ llvm_unreachable("unexpected fp abi value");
+}
+
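Reviewer note: the FpABIKind-to-ELF mapping above is small but easy to get wrong around FP_64 versus FP_64A. A sketch of the same selection, with the GNU ABI values inlined and an integer tag standing in for FpABIKind:

    #include <cstdint>
    #include <cstdio>

    // Sketch of the fp_abi byte selection above (values from the GNU ABI).
    enum : uint8_t { FP_ANY = 0, FP_DOUBLE = 1, FP_XX = 5, FP_64 = 6,
                     FP_64A = 7 };

    static uint8_t fpAbiByte(int Kind /*0=ANY 1=XX 2=S32 3=S64*/,
                             bool Is32BitABI, bool OddSPReg) {
      switch (Kind) {
      case 0: return FP_ANY;
      case 1: return FP_XX;
      case 2: return FP_DOUBLE;          // S32: doubles in even/odd pairs
      default:                           // S64
        if (Is32BitABI)                  // O32 with 64-bit FP registers
          return OddSPReg ? FP_64 : FP_64A;
        return FP_DOUBLE;                // N32/N64 report plain DOUBLE
      }
    }

    int main() { std::printf("%u\n", fpAbiByte(3, true, false)); } // 7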
+StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) {
+ switch (Value) {
+ case FpABIKind::XX:
+ return "xx";
+ case FpABIKind::S32:
+ return "32";
+ case FpABIKind::S64:
+ return "64";
+ default:
+ llvm_unreachable("unsupported fp abi value");
+ }
+}
+
+namespace llvm {
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
+ // Write out an Elf_Internal_ABIFlags_v0 struct.
+ OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
+ OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level
+ OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev
+ OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size
+ OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size
+ OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size
+ OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi
+ OS.EmitIntValue(ABIFlagsSection.getISAExtensionSetValue(), 4); // isa_ext
+ OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases
+ OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1
+ OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
+ return OS;
+}
+}
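Reviewer note: operator<< streams a fixed 24-byte record one field at a time. A sketch of the equivalent on-disk layout as a plain struct (field names follow the Elf_Internal_ABIFlags_v0 convention; the size assertion assumes typical alignment):

    #include <cstdint>

    // Sketch of the 24-byte .MIPS.abiflags record emitted above.
    struct AbiFlagsV0 {
      uint16_t version;    // 2 bytes: structure version
      uint8_t isa_level;   // 1 byte:  1-5, 32, 64
      uint8_t isa_rev;     // 1 byte:  0, or 1-n for r1..rn
      uint8_t gpr_size;    // 1 byte:  AFL_REG_*
      uint8_t cpr1_size;   // 1 byte:  AFL_REG_*
      uint8_t cpr2_size;   // 1 byte:  AFL_REG_*
      uint8_t fp_abi;      // 1 byte:  Val_GNU_MIPS_ABI_FP_*
      uint32_t isa_ext;    // 4 bytes: AFL_EXT_*
      uint32_t ases;       // 4 bytes: mask of AFL_ASE_*
      uint32_t flags1;     // 4 bytes: AFL_FLAGS1_ODDSPREG etc.
      uint32_t flags2;     // 4 bytes: reserved, emitted as 0
    };
    // Holds on typical ABIs; matches the byte count streamed above.
    static_assert(sizeof(AbiFlagsV0) == 24, "expected a 24-byte record");

    int main() { return sizeof(AbiFlagsV0) == 24 ? 0 : 1; }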
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
new file mode 100644
index 0000000..ab18c44
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -0,0 +1,237 @@
+//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSABIFLAGSSECTION_H
+#define MIPSABIFLAGSSECTION_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+struct MipsABIFlagsSection {
+ // Values for the xxx_size bytes of an ABI flags structure.
+ enum AFL_REG {
+ AFL_REG_NONE = 0x00, // No registers.
+ AFL_REG_32 = 0x01, // 32-bit registers.
+ AFL_REG_64 = 0x02, // 64-bit registers.
+ AFL_REG_128 = 0x03 // 128-bit registers.
+ };
+
+ // Masks for the ases word of an ABI flags structure.
+ enum AFL_ASE {
+ AFL_ASE_DSP = 0x00000001, // DSP ASE.
+ AFL_ASE_DSPR2 = 0x00000002, // DSP R2 ASE.
+ AFL_ASE_EVA = 0x00000004, // Enhanced VA Scheme.
+ AFL_ASE_MCU = 0x00000008, // MCU (MicroController) ASE.
+ AFL_ASE_MDMX = 0x00000010, // MDMX ASE.
+ AFL_ASE_MIPS3D = 0x00000020, // MIPS-3D ASE.
+ AFL_ASE_MT = 0x00000040, // MT ASE.
+ AFL_ASE_SMARTMIPS = 0x00000080, // SmartMIPS ASE.
+ AFL_ASE_VIRT = 0x00000100, // VZ ASE.
+ AFL_ASE_MSA = 0x00000200, // MSA ASE.
+ AFL_ASE_MIPS16 = 0x00000400, // MIPS16 ASE.
+ AFL_ASE_MICROMIPS = 0x00000800, // MICROMIPS ASE.
+ AFL_ASE_XPA = 0x00001000 // XPA ASE.
+ };
+
+ // Values for the isa_ext word of an ABI flags structure.
+ enum AFL_EXT {
+ AFL_EXT_XLR = 1, // RMI Xlr instruction.
+ AFL_EXT_OCTEON2 = 2, // Cavium Networks Octeon2.
+ AFL_EXT_OCTEONP = 3, // Cavium Networks OcteonP.
+ AFL_EXT_LOONGSON_3A = 4, // Loongson 3A.
+ AFL_EXT_OCTEON = 5, // Cavium Networks Octeon.
+ AFL_EXT_5900 = 6, // MIPS R5900 instruction.
+ AFL_EXT_4650 = 7, // MIPS R4650 instruction.
+ AFL_EXT_4010 = 8, // LSI R4010 instruction.
+ AFL_EXT_4100 = 9, // NEC VR4100 instruction.
+ AFL_EXT_3900 = 10, // Toshiba R3900 instruction.
+ AFL_EXT_10000 = 11, // MIPS R10000 instruction.
+ AFL_EXT_SB1 = 12, // Broadcom SB-1 instruction.
+ AFL_EXT_4111 = 13, // NEC VR4111/VR4181 instruction.
+ AFL_EXT_4120 = 14, // NEC VR4120 instruction.
+ AFL_EXT_5400 = 15, // NEC VR5400 instruction.
+ AFL_EXT_5500 = 16, // NEC VR5500 instruction.
+ AFL_EXT_LOONGSON_2E = 17, // ST Microelectronics Loongson 2E.
+ AFL_EXT_LOONGSON_2F = 18 // ST Microelectronics Loongson 2F.
+ };
+
+ // Values for the fp_abi word of an ABI flags structure.
+ enum Val_GNU_MIPS_ABI {
+ Val_GNU_MIPS_ABI_FP_ANY = 0,
+ Val_GNU_MIPS_ABI_FP_DOUBLE = 1,
+ Val_GNU_MIPS_ABI_FP_XX = 5,
+ Val_GNU_MIPS_ABI_FP_64 = 6,
+ Val_GNU_MIPS_ABI_FP_64A = 7
+ };
+
+ enum AFL_FLAGS1 {
+ AFL_FLAGS1_ODDSPREG = 1
+ };
+
+ // Internal representation of the values used in .module fp=value
+ enum class FpABIKind { ANY, XX, S32, S64 };
+
+ // Version of flags structure.
+ uint16_t Version;
+ // The level of the ISA: 1-5, 32, 64.
+ uint8_t ISALevel;
+ // The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
+ uint8_t ISARevision;
+ // The size of general purpose registers.
+ AFL_REG GPRSize;
+ // The size of co-processor 1 registers.
+ AFL_REG CPR1Size;
+ // The size of co-processor 2 registers.
+ AFL_REG CPR2Size;
+ // Processor-specific extension.
+ uint32_t ISAExtensionSet;
+ // Mask of ASEs used.
+ uint32_t ASESet;
+
+ bool OddSPReg;
+
+ bool Is32BitABI;
+
+protected:
+ // The floating-point ABI.
+ FpABIKind FpABI;
+
+public:
+ MipsABIFlagsSection()
+ : Version(0), ISALevel(0), ISARevision(0), GPRSize(AFL_REG_NONE),
+ CPR1Size(AFL_REG_NONE), CPR2Size(AFL_REG_NONE), ISAExtensionSet(0),
+ ASESet(0), OddSPReg(false), Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+
+ uint16_t getVersionValue() { return (uint16_t)Version; }
+ uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
+ uint8_t getISARevisionValue() { return (uint8_t)ISARevision; }
+ uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; }
+ uint8_t getCPR1SizeValue() { return (uint8_t)CPR1Size; }
+ uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
+ uint8_t getFpABIValue();
+ uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; }
+ uint32_t getASESetValue() { return (uint32_t)ASESet; }
+
+ uint32_t getFlags1Value() {
+ uint32_t Value = 0;
+
+ if (OddSPReg)
+ Value |= (uint32_t)AFL_FLAGS1_ODDSPREG;
+
+ return Value;
+ }
+
+ uint32_t getFlags2Value() { return 0; }
+
+ FpABIKind getFpABI() { return FpABI; }
+ void setFpABI(FpABIKind Value, bool IsABI32Bit) {
+ FpABI = Value;
+ Is32BitABI = IsABI32Bit;
+ }
+ StringRef getFpABIString(FpABIKind Value);
+
+ template <class PredicateLibrary>
+ void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) {
+ if (P.hasMips64()) {
+ ISALevel = 64;
+ if (P.hasMips64r6())
+ ISARevision = 6;
+ else if (P.hasMips64r2())
+ ISARevision = 2;
+ else
+ ISARevision = 1;
+ } else if (P.hasMips32()) {
+ ISALevel = 32;
+ if (P.hasMips32r6())
+ ISARevision = 6;
+ else if (P.hasMips32r2())
+ ISARevision = 2;
+ else
+ ISARevision = 1;
+ } else {
+ ISARevision = 0;
+ if (P.hasMips5())
+ ISALevel = 5;
+ else if (P.hasMips4())
+ ISALevel = 4;
+ else if (P.hasMips3())
+ ISALevel = 3;
+ else if (P.hasMips2())
+ ISALevel = 2;
+ else if (P.hasMips1())
+ ISALevel = 1;
+ else
+ llvm_unreachable("Unknown ISA level!");
+ }
+ }
+
+ template <class PredicateLibrary>
+ void setGPRSizeFromPredicates(const PredicateLibrary &P) {
+ GPRSize = P.isGP64bit() ? AFL_REG_64 : AFL_REG_32;
+ }
+
+ template <class PredicateLibrary>
+ void setCPR1SizeFromPredicates(const PredicateLibrary &P) {
+ if (P.mipsSEUsesSoftFloat())
+ CPR1Size = AFL_REG_NONE;
+ else if (P.hasMSA())
+ CPR1Size = AFL_REG_128;
+ else
+ CPR1Size = P.isFP64bit() ? AFL_REG_64 : AFL_REG_32;
+ }
+
+ template <class PredicateLibrary>
+ void setASESetFromPredicates(const PredicateLibrary &P) {
+ ASESet = 0;
+ if (P.hasDSP())
+ ASESet |= AFL_ASE_DSP;
+ if (P.hasDSPR2())
+ ASESet |= AFL_ASE_DSPR2;
+ if (P.hasMSA())
+ ASESet |= AFL_ASE_MSA;
+ if (P.inMicroMipsMode())
+ ASESet |= AFL_ASE_MICROMIPS;
+ if (P.inMips16Mode())
+ ASESet |= AFL_ASE_MIPS16;
+ }
+
+ template <class PredicateLibrary>
+ void setFpAbiFromPredicates(const PredicateLibrary &P) {
+ Is32BitABI = P.isABI_O32();
+
+ FpABI = FpABIKind::ANY;
+ if (P.isABI_N32() || P.isABI_N64())
+ FpABI = FpABIKind::S64;
+ else if (P.isABI_O32()) {
+ if (P.isFP64bit())
+ FpABI = FpABIKind::S64;
+ else if (P.isABI_FPXX())
+ FpABI = FpABIKind::XX;
+ else
+ FpABI = FpABIKind::S32;
+ }
+ }
+
+ template <class PredicateLibrary>
+ void setAllFromPredicates(const PredicateLibrary &P) {
+ setISALevelAndRevisionFromPredicates(P);
+ setGPRSizeFromPredicates(P);
+ setCPR1SizeFromPredicates(P);
+ setASESetFromPredicates(P);
+ setFpAbiFromPredicates(P);
+ }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 5375a00..d8e6128 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -70,6 +70,13 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
if (!isIntN(16, Value) && Ctx)
Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
break;
+ case Mips::fixup_MIPS_PC19_S2:
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 4;
+ // We now check if Value can be encoded as a 19-bit signed immediate.
+ if (!isIntN(19, Value) && Ctx)
+ Ctx->FatalError(Fixup.getLoc(), "out of range PC19 fixup");
+ break;
case Mips::fixup_Mips_26:
// So far we are only using this type for jumps.
// The displacement is then divided by 4 to give us a 28-bit
@@ -104,6 +111,13 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
if (!isIntN(16, Value) && Ctx)
Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
break;
+ case Mips::fixup_MIPS_PC18_S3:
+ // Forcing a signed division because Value can be negative.
+ Value = (int64_t)Value / 8;
+ // We now check if Value can be encoded as an 18-bit signed immediate.
+ if (!isIntN(18, Value) && Ctx)
+ Ctx->FatalError(Fixup.getLoc(), "out of range PC18 fixup");
+ break;
case Mips::fixup_MIPS_PC21_S2:
Value -= 4;
// Forcing a signed division because Value can be negative.
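Reviewer note: the two new fixup cases divide by their scale and range-check the quotient in the same way. A sketch of the shared check (a hand-rolled equivalent of isIntN, not the LLVM helper):

    #include <cstdint>
    #include <cstdio>

    // Sketch: scale a PC-relative value and verify it fits in N signed bits,
    // as the fixup_MIPS_PC19_S2 / fixup_MIPS_PC18_S3 cases above do.
    static bool fitsScaledPCRel(int64_t Value, unsigned Scale, unsigned Bits,
                                int64_t &Encoded) {
      Encoded = Value / (int64_t)Scale;  // signed division: Value may be < 0
      int64_t Min = -(int64_t(1) << (Bits - 1));
      int64_t Max = (int64_t(1) << (Bits - 1)) - 1;
      return Encoded >= Min && Encoded <= Max;
    }

    int main() {
      int64_t Enc;
      std::printf("%d\n", fitsScaledPCRel(-1048576, 4, 19, Enc)); // 1: fits
      std::printf("%d\n", fitsScaledPCRel(1 << 21, 4, 19, Enc));  // 0: range
    }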
@@ -247,6 +261,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_GOT_LO16", 0, 16, 0 },
{ "fixup_Mips_CALL_HI16", 0, 16, 0 },
{ "fixup_Mips_CALL_LO16", 0, 16, 0 },
+ { "fixup_Mips_PC18_S3", 0, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC19_S2", 0, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PC21_S2", 0, 21, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PC26_S2", 0, 26, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PCHI16", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
@@ -308,6 +324,8 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_Mips_GOT_LO16", 16, 16, 0 },
{ "fixup_Mips_CALL_HI16", 16, 16, 0 },
{ "fixup_Mips_CALL_LO16", 16, 16, 0 },
+ { "fixup_Mips_PC18_S3", 14, 18, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MIPS_PC19_S2", 13, 19, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PC21_S2", 11, 21, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PC26_S2", 6, 26, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_MIPS_PCHI16", 16, 16, MCFixupKindInfo::FKF_IsPCRel },
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index bc695e6..d5c3dbc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -65,7 +65,7 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
// FIXME.
- assert(0 && "RelaxInstruction() unimplemented");
+ llvm_unreachable("RelaxInstruction() unimplemented");
return false;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 74c12ff..49ac256 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -193,6 +193,12 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
break;
+ case Mips::fixup_MIPS_PC19_S2:
+ Type = ELF::R_MIPS_PC19_S2;
+ break;
+ case Mips::fixup_MIPS_PC18_S3:
+ Type = ELF::R_MIPS_PC18_S3;
+ break;
case Mips::fixup_MIPS_PC21_S2:
Type = ELF::R_MIPS_PC21_S2;
break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 3079004..05080f0 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,12 @@ namespace Mips {
// resulting in - R_MIPS_CALL_LO16
fixup_Mips_CALL_LO16,
+ // resulting in - R_MIPS_PC18_S3
+ fixup_MIPS_PC18_S3,
+
+ // resulting in - R_MIPS_PC19_S2
+ fixup_MIPS_PC19_S2,
+
// resulting in - R_MIPS_PC21_S2
fixup_MIPS_PC21_S2,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 6aa3c76..e415412 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -38,7 +38,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
ZeroDirective = "\t.space\t";
GPRel32Directive = "\t.gpword\t";
GPRel64Directive = "\t.gpdword\t";
- DebugLabelSuffix = "=.";
+ UseAssignmentForEHBegin = true;
SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
HasLEB128 = true;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 85e0bf1..43fc521 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -621,11 +621,42 @@ unsigned
MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- assert(MI.getOperand(OpNo).isImm());
- // The immediate is encoded as 'immediate << 2'.
- unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
- assert((Res & 3) == 0);
- return Res >> 2;
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ // The immediate is encoded as 'immediate << 2'.
+ unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+ assert((Res & 3) == 0);
+ return Res >> 2;
+ }
+
+ assert(MO.isExpr() &&
+ "getSimm19Lsl2Encoding expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::Create(0, Expr,
+ MCFixupKind(Mips::fixup_MIPS_PC19_S2)));
+ return 0;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm()) {
+ // The immediate is encoded as 'immediate << 3'.
+ unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+ assert((Res & 7) == 0);
+ return Res >> 3;
+ }
+
+ assert(MO.isExpr() &&
+ "getSimm18Lsl2Encoding expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::Create(0, Expr,
+ MCFixupKind(Mips::fixup_MIPS_PC18_S3)));
+ return 0;
}
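Reviewer note: getSimm18Lsl3Encoding mirrors the simm19<<2 variant with a scale of 8; the expression path records a fixup instead. A sketch of the immediate path and its decode round trip (illustrative helpers, not the MC API):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Sketch: the simm18 << 3 immediate encoding used above, plus decode.
    static unsigned encodeSimm18Lsl3(int64_t Imm) {
      assert((Imm & 7) == 0 && "immediate must be 8-byte aligned");
      return (unsigned)((Imm >> 3) & 0x3ffff); // 18 bits after the shift
    }

    static int64_t decodeSimm18Lsl3(unsigned Field) {
      // Sign-extend the 18-bit field, then restore the scale of 8.
      return ((int64_t)(int32_t)(Field << 14) >> 14) << 3;
    }

    int main() {
      int64_t Imm = -1024;
      unsigned Field = encodeSimm18Lsl3(Imm);
      std::printf("field=0x%x back=%lld\n", Field,
                  (long long)decodeSimm18Lsl3(Field));
      return decodeSimm18Lsl3(Field) == Imm ? 0 : 1; // round trip holds
    }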
#include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 3f7daab..304167f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -141,6 +141,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 21ccc3c..5bba3e5 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -11,6 +11,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
using namespace llvm;
@@ -83,33 +84,6 @@ MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
}
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expr!");
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbolsImpl(BE->getLHS(), Asm);
- AddValueSymbolsImpl(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef:
- Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
- break;
- }
-}
-
-void MipsMCExpr::AddValueSymbols(MCAssembler *Asm) const {
- AddValueSymbolsImpl(getSubExpr(), Asm);
+void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 8d7aacd..f193dc9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -49,7 +49,7 @@ public:
void PrintImpl(raw_ostream &OS) const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const override;
- void AddValueSymbols(MCAssembler *) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
const MCSection *FindAssociatedSection() const override {
return getSubExpr()->FindAssociatedSection();
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 660e5a7..d2b929b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -133,6 +133,12 @@ createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
return S;
}
+static MCStreamer *createMipsNullStreamer(MCContext &Ctx) {
+ MCStreamer *S = llvm::createNullStreamer(Ctx);
+ new MipsTargetStreamer(*S);
+ return S;
+}
+
extern "C" void LLVMInitializeMipsTargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo);
@@ -187,6 +193,12 @@ extern "C" void LLVMInitializeMipsTargetMC() {
TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
+ TargetRegistry::RegisterNullStreamer(TheMipsTarget, createMipsNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheMipselTarget, createMipsNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheMips64Target, createMipsNullStreamer);
+ TargetRegistry::RegisterNullStreamer(TheMips64elTarget,
+ createMipsNullStreamer);
+
// Register the asm backend.
TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
createMipsAsmBackendEB32);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index cd6be73..6cde8f9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -48,7 +48,13 @@ private:
bool PendingCall;
bool isIndirectJump(const MCInst &MI) {
- return MI.getOpcode() == Mips::JR || MI.getOpcode() == Mips::RET;
+ if (MI.getOpcode() == Mips::JALR) {
+ // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead.
+ // JALR is an indirect branch if the link register is $0.
+ assert(MI.getOperand(0).isReg());
+ return MI.getOperand(0).getReg() == Mips::ZERO;
+ }
+ return MI.getOpcode() == Mips::JR;
}
bool isStackPointerFirstOperand(const MCInst &MI) {
@@ -56,7 +62,9 @@ private:
&& MI.getOperand(0).getReg() == Mips::SP);
}
- bool isCall(unsigned Opcode, bool *IsIndirectCall) {
+ bool isCall(const MCInst &MI, bool *IsIndirectCall) {
+ unsigned Opcode = MI.getOpcode();
+
*IsIndirectCall = false;
switch (Opcode) {
@@ -64,12 +72,19 @@ private:
return false;
case Mips::JAL:
+ case Mips::BAL:
case Mips::BAL_BR:
case Mips::BLTZAL:
case Mips::BGEZAL:
return true;
case Mips::JALR:
+ // JALR is only a call if the link register is not $0. Otherwise it's an
+ // indirect branch.
+ assert(MI.getOperand(0).isReg());
+ if (MI.getOperand(0).getReg() == Mips::ZERO)
+ return false;
+
*IsIndirectCall = true;
return true;
}
@@ -137,24 +152,23 @@ public:
&IsStore);
bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
if (IsMemAccess || IsSPFirstOperand) {
- if (PendingCall)
- report_fatal_error("Dangerous instruction in branch delay slot!");
-
bool MaskBefore = (IsMemAccess
&& baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
.getReg()));
bool MaskAfter = IsSPFirstOperand && !IsStore;
- if (MaskBefore || MaskAfter)
+ if (MaskBefore || MaskAfter) {
+ if (PendingCall)
+ report_fatal_error("Dangerous instruction in branch delay slot!");
sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
- else
- MipsELFStreamer::EmitInstruction(Inst, STI);
- return;
+ return;
+ }
+ // fallthrough
}
// Sandbox calls by aligning call and branch delay to the bundle end.
// For indirect calls, emit the mask before the call.
bool IsIndirectCall;
- if (isCall(Inst.getOpcode(), &IsIndirectCall)) {
+ if (isCall(Inst, &IsIndirectCall)) {
if (PendingCall)
report_fatal_error("Dangerous instruction in branch delay slot!");
@@ -203,6 +217,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
case Mips::LWC1:
case Mips::LDC1:
case Mips::LL:
+ case Mips::LL_R6:
case Mips::LWL:
case Mips::LWR:
*AddrIdx = 1;
@@ -223,6 +238,7 @@ bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
// Store instructions with base address register in position 2.
case Mips::SC:
+ case Mips::SC_R6:
*AddrIdx = 2;
if (IsStore)
*IsStore = true;
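
Note: MIPS32r6/MIPS64r6 fold JR into JALR, so the NaCl sandboxer's classification above now hinges entirely on the link register. A standalone restatement of the rule (illustration only, not part of the patch):

    // JALR $rd, $rs links to $rd; with $rd == $zero it degenerates into the
    // old JR $rs, i.e. a plain indirect branch rather than a call.
    static bool isR6IndirectBranch(unsigned Opcode, unsigned LinkReg) {
      return Opcode == Mips::JALR && LinkReg == Mips::ZERO;
    }
    static bool isR6IndirectCall(unsigned Opcode, unsigned LinkReg) {
      return Opcode == Mips::JALR && LinkReg != Mips::ZERO;
    }
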
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index a8fa272..fbe375b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -27,10 +27,43 @@
using namespace llvm;
-// Pin vtable to this file.
-void MipsTargetStreamer::anchor() {}
-
-MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S), canHaveModuleDirective(true) {}
+void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetMips16() {}
+void MipsTargetStreamer::emitDirectiveSetNoMips16() {}
+void MipsTargetStreamer::emitDirectiveSetReorder() {}
+void MipsTargetStreamer::emitDirectiveSetNoReorder() {}
+void MipsTargetStreamer::emitDirectiveSetMacro() {}
+void MipsTargetStreamer::emitDirectiveSetNoMacro() {}
+void MipsTargetStreamer::emitDirectiveSetAt() {}
+void MipsTargetStreamer::emitDirectiveSetNoAt() {}
+void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
+void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
+void MipsTargetStreamer::emitDirectiveAbiCalls() {}
+void MipsTargetStreamer::emitDirectiveNaN2008() {}
+void MipsTargetStreamer::emitDirectiveNaNLegacy() {}
+void MipsTargetStreamer::emitDirectiveOptionPic0() {}
+void MipsTargetStreamer::emitDirectiveOptionPic2() {}
+void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+ unsigned ReturnReg) {}
+void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
+void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
+}
+void MipsTargetStreamer::emitDirectiveSetMips32R2() {}
+void MipsTargetStreamer::emitDirectiveSetMips64() {}
+void MipsTargetStreamer::emitDirectiveSetMips64R2() {}
+void MipsTargetStreamer::emitDirectiveSetDsp() {}
+void MipsTargetStreamer::emitDirectiveCpload(unsigned RegNo) {}
+void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+ const MCSymbol &Sym, bool IsReg) {
+}
+void MipsTargetStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+ bool IsO32ABI) {
+ if (!Enabled && !IsO32ABI)
+ report_fatal_error("+nooddspreg is only valid for O32");
+}
MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
@@ -38,42 +71,52 @@ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
OS << "\t.set\tmicromips\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
OS << "\t.set\tnomicromips\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
OS << "\t.set\tmips16\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
OS << "\t.set\tnomips16\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
OS << "\t.set\treorder\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
OS << "\t.set\tnoreorder\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
OS << "\t.set\tmacro\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
OS << "\t.set\tnomacro\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetAt() {
OS << "\t.set\tat\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
OS << "\t.set\tnoat\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
@@ -110,24 +153,28 @@ void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
OS << "\t.set\tmips32r2\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
OS << "\t.set\tmips64\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
OS << "\t.set\tmips64r2\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
OS << "\t.set\tdsp\n";
+ setCanHaveModuleDir(false);
}
// Print a 32-bit hex number, including leading zeros.
static void printHex32(unsigned Value, raw_ostream &OS) {
OS << "0x";
for (int i = 7; i >= 0; i--)
- OS.write_hex((Value & (0xF << (i*4))) >> (i*4));
+ OS.write_hex((Value & (0xF << (i * 4))) >> (i * 4));
}
void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask,
@@ -147,6 +194,7 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) {
OS << "\t.cpload\t$"
<< StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+ setCanHaveModuleDir(false);
}
void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -165,6 +213,34 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
OS << ", ";
OS << Sym.getName() << "\n";
+ setCanHaveModuleDir(false);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleFP(
+ MipsABIFlagsSection::FpABIKind Value, bool Is32BitABI) {
+ MipsTargetStreamer::emitDirectiveModuleFP(Value, Is32BitABI);
+
+ StringRef ModuleValue;
+ OS << "\t.module\tfp=";
+ OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetFp(
+ MipsABIFlagsSection::FpABIKind Value) {
+ StringRef ModuleValue;
+ OS << "\t.set\tfp=";
+ OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitMipsAbiFlags() {
+ // No action required for text output.
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+ bool IsO32ABI) {
+ MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI);
+
+ OS << "\t.module\t" << (Enabled ? "" : "no") << "oddspreg\n";
}
// This part is for ELF object output.
@@ -174,7 +250,7 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
MCAssembler &MCA = getStreamer().getAssembler();
uint64_t Features = STI.getFeatureBits();
Triple T(STI.getTargetTriple());
- Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
+ Pic = (MCA.getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
? true
: false;
@@ -182,16 +258,28 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
unsigned EFlags = 0;
// Architecture
- if (Features & Mips::FeatureMips64r2)
+ if (Features & Mips::FeatureMips64r6)
+ EFlags |= ELF::EF_MIPS_ARCH_64R6;
+ else if (Features & Mips::FeatureMips64r2)
EFlags |= ELF::EF_MIPS_ARCH_64R2;
else if (Features & Mips::FeatureMips64)
EFlags |= ELF::EF_MIPS_ARCH_64;
+ else if (Features & Mips::FeatureMips5)
+ EFlags |= ELF::EF_MIPS_ARCH_5;
else if (Features & Mips::FeatureMips4)
EFlags |= ELF::EF_MIPS_ARCH_4;
+ else if (Features & Mips::FeatureMips3)
+ EFlags |= ELF::EF_MIPS_ARCH_3;
+ else if (Features & Mips::FeatureMips32r6)
+ EFlags |= ELF::EF_MIPS_ARCH_32R6;
else if (Features & Mips::FeatureMips32r2)
EFlags |= ELF::EF_MIPS_ARCH_32R2;
else if (Features & Mips::FeatureMips32)
EFlags |= ELF::EF_MIPS_ARCH_32;
+ else if (Features & Mips::FeatureMips2)
+ EFlags |= ELF::EF_MIPS_ARCH_2;
+ else
+ EFlags |= ELF::EF_MIPS_ARCH_1;
if (T.isArch64Bit()) {
if (Features & Mips::FeatureN32)
@@ -244,17 +332,17 @@ void MipsTargetELFStreamer::finish() {
ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, SectionKind::getMetadata());
OS.SwitchSection(Sec);
- OS.EmitIntValue(1, 1); // kind
+ OS.EmitIntValue(1, 1); // kind
OS.EmitIntValue(40, 1); // size
- OS.EmitIntValue(0, 2); // section
- OS.EmitIntValue(0, 4); // info
- OS.EmitIntValue(0, 4); // ri_gprmask
- OS.EmitIntValue(0, 4); // pad
- OS.EmitIntValue(0, 4); // ri_cpr[0]mask
- OS.EmitIntValue(0, 4); // ri_cpr[1]mask
- OS.EmitIntValue(0, 4); // ri_cpr[2]mask
- OS.EmitIntValue(0, 4); // ri_cpr[3]mask
- OS.EmitIntValue(0, 8); // ri_gp_value
+ OS.EmitIntValue(0, 2); // section
+ OS.EmitIntValue(0, 4); // info
+ OS.EmitIntValue(0, 4); // ri_gprmask
+ OS.EmitIntValue(0, 4); // pad
+ OS.EmitIntValue(0, 4); // ri_cpr[0]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[1]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[2]mask
+ OS.EmitIntValue(0, 4); // ri_cpr[3]mask
+ OS.EmitIntValue(0, 8); // ri_gp_value
} else {
const MCSectionELF *Sec =
Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
@@ -268,6 +356,7 @@ void MipsTargetELFStreamer::finish() {
OS.EmitIntValue(0, 4); // ri_cpr[3]mask
OS.EmitIntValue(0, 4); // ri_gp_value
}
+ emitMipsAbiFlags();
}
void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol,
@@ -276,11 +365,11 @@ void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol,
if (Value->getKind() != MCExpr::SymbolRef)
return;
const MCSymbol &RhsSym =
- static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
+ static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
uint8_t Type = MCELF::GetType(Data);
- if ((Type != ELF::STT_FUNC)
- || !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2)))
+ if ((Type != ELF::STT_FUNC) ||
+ !(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2)))
return;
MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
@@ -305,6 +394,7 @@ void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
MicroMipsEnabled = false;
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetMips16() {
@@ -312,14 +402,17 @@ void MipsTargetELFStreamer::emitDirectiveSetMips16() {
unsigned Flags = MCA.getELFHeaderEFlags();
Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
MCA.setELFHeaderEFlags(Flags);
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetNoMips16() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetReorder() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
@@ -327,22 +420,27 @@ void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
unsigned Flags = MCA.getELFHeaderEFlags();
Flags |= ELF::EF_MIPS_NOREORDER;
MCA.setELFHeaderEFlags(Flags);
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetMacro() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetNoMacro() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetAt() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetNoAt() {
// FIXME: implement.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
@@ -411,19 +509,19 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
}
void MipsTargetELFStreamer::emitDirectiveSetMips32R2() {
- // No action required for ELF output.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetMips64() {
- // No action required for ELF output.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetMips64R2() {
- // No action required for ELF output.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveSetDsp() {
- // No action required for ELF output.
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) {
@@ -473,6 +571,8 @@ void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) {
TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
TmpInst.addOperand(MCOperand::CreateReg(RegNo));
getStreamer().EmitInstruction(TmpInst, STI);
+
+ setCanHaveModuleDir(false);
}
void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -528,4 +628,27 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
Inst.addOperand(MCOperand::CreateReg(Mips::GP));
Inst.addOperand(MCOperand::CreateReg(RegNo));
getStreamer().EmitInstruction(Inst, STI);
+
+ setCanHaveModuleDir(false);
+}
+
+void MipsTargetELFStreamer::emitMipsAbiFlags() {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCContext &Context = MCA.getContext();
+ MCStreamer &OS = getStreamer();
+ const MCSectionELF *Sec =
+ Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS,
+ ELF::SHF_ALLOC, SectionKind::getMetadata());
+ MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec);
+ ABIShndxSD.setAlignment(8);
+ OS.SwitchSection(Sec);
+
+ OS << ABIFlagsSection;
+}
+
+void MipsTargetELFStreamer::emitDirectiveModuleOddSPReg(bool Enabled,
+ bool IsO32ABI) {
+ MipsTargetStreamer::emitDirectiveModuleOddSPReg(Enabled, IsO32ABI);
+
+ ABIFlagsSection.OddSPReg = Enabled;
}
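
Note: taken together, the setCanHaveModuleDir(false) calls sprinkled through this file implement the `.module` scoping rule: `.module` directives are only meaningful before any instruction, `.set`, or ABI-setup directive has been emitted, so every such event permanently clears the flag. A sketch of the check this enables, assuming a getCanHaveModuleDir() accessor paired with the setter used above (the real check would live in the Mips assembly parser):

    // Hypothetical guard before acting on a parsed `.module fp=...`:
    if (!getTargetStreamer().getCanHaveModuleDir())
      return Error(Loc, ".module must appear before any code or '.set'");
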
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index d95f9b0..b93017a 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -24,13 +24,13 @@ def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM_MM<0x2f>;
def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>,
LW_FM_MM<0x2e>;
def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
- LWXC1_FM_MM<0x48>;
+ LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
- SWXC1_FM_MM<0x88>;
+ SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
- LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2;
+ LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6;
def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
- SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2;
+ SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6;
def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
CEQS_FM_MM<0>;
@@ -38,9 +38,9 @@ def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
CEQS_FM_MM<1>;
def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>,
- BC1F_FM_MM<0x1c>;
+ BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6;
def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>,
- BC1F_FM_MM<0x1d>;
+ BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
ROUND_W_FM_MM<0, 0x6c>;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 9904bc6..87a3a3e 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -246,7 +246,6 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
}
def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
- def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>;
/// Branch Instructions
def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index ea16331..dd3bc9b 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -61,6 +61,8 @@ def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
"General Purpose Registers are 64-bit wide.">;
def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
"Support 64-bit FP registers.">;
+def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true",
+ "Support for FPXX.">;
def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
"IEEE 754-2008 NaN encoding.">;
def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
@@ -73,6 +75,9 @@ def FeatureN64 : SubtargetFeature<"n64", "MipsABI", "N64",
"Enable n64 ABI">;
def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI",
"Enable eabi ABI">;
+def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
+ "Disable odd numbered single-precision "
+ "registers">;
def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU",
"true", "Enable vector FPU instructions.">;
def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
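
Note: FeatureFPXX and FeatureNoOddSPReg plug into the usual subtarget machinery, so they can be exercised directly from the command line, e.g. `llc -march=mipsel -mattr=+fpxx,+nooddspreg` (enabling the "nooddspreg" feature sets UseOddSPReg to false). The streamer change earlier in this patch enforces that nooddspreg is rejected outside the O32 ABI.
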
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index c01d03a..93706c2 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -16,6 +16,7 @@
#include "Mips16InstrInfo.h"
#include "MipsInstrInfo.h"
#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -28,6 +29,9 @@
using namespace llvm;
+Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
+ : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 3f7829d..1fb7eda 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -19,8 +19,7 @@
namespace llvm {
class Mips16FrameLowering : public MipsFrameLowering {
public:
- explicit Mips16FrameLowering(const MipsSubtarget &STI)
- : MipsFrameLowering(STI, STI.stackAlignment()) {}
+ explicit Mips16FrameLowering(const MipsSubtarget &STI);
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 4e86a27..6672aef 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
#define DEBUG_TYPE "mips-isel"
bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- if (!Subtarget.inMips16Mode())
+ if (!Subtarget->inMips16Mode())
return false;
return MipsDAGToDAGISel::runOnMachineFunction(MF);
}
@@ -226,9 +226,9 @@ bool Mips16DAGToDAGISel::selectAddr16(
const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
if (LS) {
- if (LS->getMemoryVT() == MVT::f32 && Subtarget.hasMips4_32r2())
+ if (LS->getMemoryVT() == MVT::f32 && Subtarget->hasMips4_32r2())
return false;
- if (LS->getMemoryVT() == MVT::f64 && Subtarget.hasMips4_32r2())
+ if (LS->getMemoryVT() == MVT::f64 && Subtarget->hasMips4_32r2())
return false;
}
}
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 9102450..81a05df 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -120,13 +120,6 @@ static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
: MipsTargetLowering(TM) {
- //
- // set up as if mips32 and then revert so we can test the mechanism
- // for switching
- addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
- addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
- computeRegisterProperties();
- clearRegisterClasses();
// Set up the register classes
addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index df88333..2a5eec5 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef Mips16ISELLOWERING_H
-#define Mips16ISELLOWERING_H
+#ifndef MIPS16ISELLOWERING_H
+#define MIPS16ISELLOWERING_H
#include "MipsISelLowering.h"
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 11166c4..5e4eebb 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -1370,9 +1370,11 @@ def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)),
(Jal16 texternalsym:$dst)>;
// Indirect branch
-def: Mips16Pat<
- (brind CPU16Regs:$rs),
- (JrcRx16 CPU16Regs:$rs)>;
+def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> {
+ // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change
+ // MIPS16's behaviour.
+ let AddedComplexity = 1;
+}
// Jump and Link (Call)
let isCall=1, hasDelaySlot=0 in
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index a3f9df5..e4ec96a 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -39,7 +39,10 @@ def OPGROUP_DAUI : OPGROUP<0b011101>;
def OPGROUP_PCREL : OPGROUP<0b111011>;
def OPGROUP_REGIMM : OPGROUP<0b000001>;
def OPGROUP_SPECIAL : OPGROUP<0b000000>;
+// The spec occasionally names this value LL, LLD, SC, or SCD.
def OPGROUP_SPECIAL3 : OPGROUP<0b011111>;
+// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places.
+def OPGROUP_COP2LDST : OPGROUP<0b010010>;
class OPCODE2<bits<2> Val> {
bits<2> Value = Val;
@@ -48,6 +51,11 @@ def OPCODE2_ADDIUPC : OPCODE2<0b00>;
def OPCODE2_LWPC : OPCODE2<0b01>;
def OPCODE2_LWUPC : OPCODE2<0b10>;
+class OPCODE3<bits<3> Val> {
+ bits<3> Value = Val;
+}
+def OPCODE3_LDPC : OPCODE3<0b110>;
+
class OPCODE5<bits<5> Val> {
bits<5> Value = Val;
}
@@ -59,6 +67,13 @@ def OPCODE5_BC1EQZ : OPCODE5<0b01001>;
def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
+def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE5_LDC2 : OPCODE5<0b01110>;
+def OPCODE5_LWC2 : OPCODE5<0b01010>;
+def OPCODE5_SDC2 : OPCODE5<0b01111>;
+def OPCODE5_SWC2 : OPCODE5<0b01011>;
class OPCODE6<bits<6> Val> {
bits<6> Value = Val;
@@ -67,6 +82,22 @@ def OPCODE6_ALIGN : OPCODE6<0b100000>;
def OPCODE6_DALIGN : OPCODE6<0b100100>;
def OPCODE6_BITSWAP : OPCODE6<0b100000>;
def OPCODE6_DBITSWAP : OPCODE6<0b100100>;
+def OPCODE6_JALR : OPCODE6<0b001001>;
+def OPCODE6_CACHE : OPCODE6<0b100101>;
+def OPCODE6_PREF : OPCODE6<0b110101>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE6_LL : OPCODE6<0b110110>;
+def OPCODE6_LLD : OPCODE6<0b110111>;
+def OPCODE6_SC : OPCODE6<0b100110>;
+def OPCODE6_SCD : OPCODE6<0b100111>;
+def OPCODE6_CLO : OPCODE6<0b010001>;
+def OPCODE6_CLZ : OPCODE6<0b010000>;
+def OPCODE6_DCLO : OPCODE6<0b010011>;
+def OPCODE6_DCLZ : OPCODE6<0b010010>;
+def OPCODE6_LSA : OPCODE6<0b000101>;
+def OPCODE6_DLSA : OPCODE6<0b010101>;
+def OPCODE6_SDBBP : OPCODE6<0b001110>;
class FIELD_FMT<bits<5> Val> {
bits<5> Value = Val;
@@ -77,22 +108,23 @@ def FIELD_FMT_D : FIELD_FMT<0b10001>;
class FIELD_CMP_COND<bits<5> Val> {
bits<5> Value = Val;
}
-def FIELD_CMP_COND_F : FIELD_CMP_COND<0b00000>;
+// Note: The CMP_COND_FMT names differ from the C_COND_FMT names.
+def FIELD_CMP_COND_AF : FIELD_CMP_COND<0b00000>;
def FIELD_CMP_COND_UN : FIELD_CMP_COND<0b00001>;
def FIELD_CMP_COND_EQ : FIELD_CMP_COND<0b00010>;
def FIELD_CMP_COND_UEQ : FIELD_CMP_COND<0b00011>;
-def FIELD_CMP_COND_OLT : FIELD_CMP_COND<0b00100>;
+def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b00100>;
def FIELD_CMP_COND_ULT : FIELD_CMP_COND<0b00101>;
-def FIELD_CMP_COND_OLE : FIELD_CMP_COND<0b00110>;
+def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b00110>;
def FIELD_CMP_COND_ULE : FIELD_CMP_COND<0b00111>;
-def FIELD_CMP_COND_SF : FIELD_CMP_COND<0b01000>;
-def FIELD_CMP_COND_NGLE : FIELD_CMP_COND<0b01001>;
+def FIELD_CMP_COND_SAF : FIELD_CMP_COND<0b01000>;
+def FIELD_CMP_COND_SUN : FIELD_CMP_COND<0b01001>;
def FIELD_CMP_COND_SEQ : FIELD_CMP_COND<0b01010>;
-def FIELD_CMP_COND_NGL : FIELD_CMP_COND<0b01011>;
-def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b01100>;
-def FIELD_CMP_COND_NGE : FIELD_CMP_COND<0b01101>;
-def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b01110>;
-def FIELD_CMP_COND_NGT : FIELD_CMP_COND<0b01111>;
+def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>;
+def FIELD_CMP_COND_SLT : FIELD_CMP_COND<0b01100>;
+def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>;
+def FIELD_CMP_COND_SLE : FIELD_CMP_COND<0b01110>;
+def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>;
class FIELD_CMP_FORMAT<bits<5> Val> {
bits<5> Value = Val;
@@ -139,6 +171,17 @@ class DAUI_FM : AUI_FM {
let Inst{31-26} = OPGROUP_DAUI.Value;
}
+class BAL_FM : MipsR6Inst {
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_REGIMM.Value;
+ let Inst{25-21} = 0b00000;
+ let Inst{20-16} = OPCODE5_BGEZAL.Value;
+ let Inst{15-0} = offset;
+}
+
class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
bits<5> fs;
bits<5> fd;
@@ -216,6 +259,18 @@ class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst {
let Inst{18-0} = imm;
}
+class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst {
+ bits<5> rs;
+ bits<18> imm;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_PCREL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-18} = Operation.Value;
+ let Inst{17-0} = imm;
+}
+
class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
bits<5> rd;
bits<5> rt;
@@ -230,6 +285,36 @@ class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
let Inst{5-0} = Operation.Value;
}
+class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-7} = offset;
+ let Inst{6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0b00001;
+ let Inst{5-0} = Operation.Value;
+}
+
class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
bits<5> rd;
bits<5> rs;
@@ -245,6 +330,16 @@ class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
let Inst{5-0} = funct;
}
+class SPECIAL_SDBBP_FM : MipsR6Inst {
+ bits<20> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-6} = code_;
+ let Inst{5-0} = OPCODE6_SDBBP.Value;
+}
+
// This class is ambiguous with other branches:
// BEQC/BNEC require that rs > rt
class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
@@ -355,6 +450,40 @@ class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
let Inst{5-0} = Operation.Value;
}
+class SPECIAL3_LL_SC_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<9> offset = addr{8-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+ let Inst{25-21} = base;
+ let Inst{20-16} = rt;
+ let Inst{15-7} = offset;
+ let Inst{5-0} = Operation.Value;
+
+ string DecoderMethod = "DecodeSpecial3LlSc";
+}
+
+class SPECIAL_LSA_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+ bits<2> imm2;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000;
+ let Inst{7-6} = imm2;
+ let Inst{5-0} = Operation.Value;
+}
+
class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst {
bits<5> rs;
bits<16> imm;
@@ -384,3 +513,31 @@ class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format,
let Inst{4-0} = Cond.Value;
}
+class JR_HB_R6_FM<OPCODE6 Operation> : MipsR6Inst {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_SPECIAL.Value;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = 0;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = Operation.Value;
+}
+
+class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst {
+ bits<5> rt;
+ bits<21> addr;
+ bits<5> base = addr{20-16};
+ bits<11> offset = addr{10-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = OPGROUP_COP2LDST.Value;
+ let Inst{25-21} = Operation.Value;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = base;
+ let Inst{10-0} = offset;
+}
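
Note: the *_FM classes above are pure bit-layout records; reading one back as a manual encoder makes the field packing concrete. For SPECIAL_LSA_FM (a hypothetical standalone encoder derived from the field assignments above, not part of the patch):

    #include <cstdint>
    // Packs `lsa $rd, $rs, $rt, imm2` per SPECIAL_LSA_FM.
    uint32_t encodeLSA(uint32_t Rd, uint32_t Rs, uint32_t Rt, uint32_t Imm2) {
      return (0x00u << 26)  // Inst{31-26} = OPGROUP_SPECIAL (0b000000)
           | (Rs << 21)     // Inst{25-21} = rs
           | (Rt << 16)     // Inst{20-16} = rt
           | (Rd << 11)     // Inst{15-11} = rd
           | (0x0u << 8)    // Inst{10-8}  = 0b000
           | (Imm2 << 6)    // Inst{7-6}   = imm2
           | 0x05u;         // Inst{5-0}   = OPCODE6_LSA (0b000101)
    }
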
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index ffaf965..d06e5ca 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -14,39 +14,8 @@
include "Mips32r6InstrFormats.td"
// Notes about removals/changes from MIPS32r6:
-// Unclear: ssnop
-// Reencoded: cache, pref
-// Reencoded: clo, clz
// Reencoded: jr -> jalr
// Reencoded: jr.hb -> jalr.hb
-// Reencoded: ldc2
-// Reencoded: ll, sc
-// Reencoded: lwc2
-// Reencoded: sdbbp
-// Reencoded: sdc2
-// Reencoded: swc2
-// Removed: bc1any2, bc1any4
-// Removed: bc2[ft]
-// Removed: bc2f, bc2t
-// Removed: bgezal
-// Removed: bltzal
-// Removed: c.cond.fmt, bc1[ft]
-// Removed: div, divu
-// Removed: jalx
-// Removed: ldxc1
-// Removed: luxc1
-// Removed: lwxc1
-// Removed: madd.[ds], nmadd.[ds], nmsub.[ds], sub.[ds]
-// Removed: mfhi, mflo, mthi, mtlo, madd, maddu, msub, msubu, mul
-// Removed: movf, movt
-// Removed: movf.fmt, movt.fmt, movn.fmt, movz.fmt
-// Removed: movn, movz
-// Removed: mult, multu
-// Removed: prefx
-// Removed: sdxc1
-// Removed: suxc1
-// Removed: swxc1
-// Rencoded: [ls][wd]c2
def brtarget21 : Operand<OtherVT> {
let EncoderMethod = "getBranchTarget21OpValue";
@@ -84,6 +53,7 @@ class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>;
class AUI_ENC : AUI_FM;
class AUIPC_ENC : PCREL16_FM<OPCODE5_AUIPC>;
+class BAL_ENC : BAL_FM;
class BALC_ENC : BRANCH_OFF26_FM<0b111010>;
class BC_ENC : BRANCH_OFF26_FM<0b110010>;
class BEQC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
@@ -97,11 +67,20 @@ class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>,
class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>,
DecodeDisambiguates<"BgtzlGroupBranch">;
+class BGEC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZL>,
+ DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguatedBy<"BlezGroupBranch">;
class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>,
DecodeDisambiguates<"BlezlGroupBranch">;
class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>,
DecodeDisambiguatedBy<"BgtzGroupBranch">;
+class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZL>,
+ DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZ>,
+ DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>,
DecodeDisambiguatedBy<"BlezlGroupBranch">;
class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>,
@@ -110,7 +89,8 @@ class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>,
DecodeDisambiguatedBy<"BgtzlGroupBranch">;
class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>;
-class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>;
+class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguates<"BlezGroupBranch">;
class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>;
class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>;
@@ -120,9 +100,10 @@ class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>;
class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>;
class JIC_ENC : JMP_IDX_COMPACT_FM<0b110110>;
-
+class JR_HB_R6_ENC : JR_HB_R6_FM<OPCODE6_JALR>;
class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>;
-class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>;
+class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>,
+ DecodeDisambiguatedBy<"BlezGroupBranch">;
class BNVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
DecodeDisambiguatedBy<"DaddiGroupBranch">;
class BOVC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
@@ -170,12 +151,23 @@ class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>;
class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>;
class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>;
-class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, RegisterOperand FGROpnd> {
- dag OutOperandList = (outs FGROpnd:$fd);
- dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
- string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
- list<dag> Pattern = [];
-}
+class CACHE_ENC : SPECIAL3_MEM_FM<OPCODE6_CACHE>;
+class PREF_ENC : SPECIAL3_MEM_FM<OPCODE6_PREF>;
+
+class LDC2_R6_ENC : COP2LDST_FM<OPCODE5_LDC2>;
+class LWC2_R6_ENC : COP2LDST_FM<OPCODE5_LWC2>;
+class SDC2_R6_ENC : COP2LDST_FM<OPCODE5_SDC2>;
+class SWC2_R6_ENC : COP2LDST_FM<OPCODE5_SWC2>;
+
+class LSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_LSA>;
+
+class LL_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LL>;
+class SC_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SC>;
+
+class CLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLO>;
+class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
+
+class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
//===----------------------------------------------------------------------===//
//
@@ -183,56 +175,65 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, RegisterOperand FGROpn
//
//===----------------------------------------------------------------------===//
+class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
+ RegisterOperand FGROpnd,
+ SDPatternOperator Op = null_frag> {
+ dag OutOperandList = (outs FGRCCOpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))];
+}
+
multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
RegisterOperand FGROpnd>{
- def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_F>,
- CMP_CONDN_DESC_BASE<"f", Typestr, FGROpnd>,
+ def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+ CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>,
ISA_MIPS32R6;
def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
- CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>,
+ CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>,
ISA_MIPS32R6;
def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
- CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>,
+ CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>,
ISA_MIPS32R6;
def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
- CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_OLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLT>,
- CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd>,
+ CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>,
ISA_MIPS32R6;
+ def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+ CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>,
+ ISA_MIPS32R6;
def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
- CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_OLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLE>,
- CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd>,
+ CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>,
ISA_MIPS32R6;
+ def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+ CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>,
+ ISA_MIPS32R6;
def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
- CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>,
+ CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>,
+ ISA_MIPS32R6;
+ def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>,
+ CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>,
+ ISA_MIPS32R6;
+ def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>,
+ CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>,
ISA_MIPS32R6;
- def CMP_SF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SF>,
- CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_NGLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGLE>,
- CMP_CONDN_DESC_BASE<"ngle", Typestr, FGROpnd>,
- ISA_MIPS32R6;
def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
ISA_MIPS32R6;
- def CMP_NGL_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGL>,
- CMP_CONDN_DESC_BASE<"ngl", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
- CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_NGE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGE>,
- CMP_CONDN_DESC_BASE<"nge", Typestr, FGROpnd>,
+ def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>,
+ CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>,
+ ISA_MIPS32R6;
+ def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>,
+ CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>,
ISA_MIPS32R6;
- def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
- CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>,
- ISA_MIPS32R6;
- def CMP_NGT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGT>,
- CMP_CONDN_DESC_BASE<"ngt", Typestr, FGROpnd>,
+ def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>,
+ CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>,
+ ISA_MIPS32R6;
+ def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>,
+ CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>,
ISA_MIPS32R6;
+ def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>,
+ CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>,
+ ISA_MIPS32R6;
}
//===----------------------------------------------------------------------===//
@@ -241,16 +242,17 @@ multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
//
//===----------------------------------------------------------------------===//
-class PCREL19_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd> {
dag OutOperandList = (outs GPROpnd:$rs);
- dag InOperandList = (ins simm19_lsl2:$imm);
+ dag InOperandList = (ins ImmOpnd:$imm);
string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
list<dag> Pattern = [];
}
-class ADDIUPC_DESC : PCREL19_DESC_BASE<"addiupc", GPR32Opnd>;
-class LWPC_DESC: PCREL19_DESC_BASE<"lwpc", GPR32Opnd>;
-class LWUPC_DESC: PCREL19_DESC_BASE<"lwupc", GPR32Opnd>;
+class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2>;
+class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2>;
+class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2>;
class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
Operand ImmOpnd> {
@@ -318,15 +320,26 @@ class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
list<Register> Defs = [AT];
}
+class BAL_DESC : BC_DESC_BASE<"bal", brtarget> {
+ bit isCall = 1;
+ bit hasDelaySlot = 1;
+ list<Register> Defs = [RA];
+}
+
class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
bit isCall = 1;
list<Register> Defs = [RA];
}
class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
+class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>;
+class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>;
class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>;
class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>;
+class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>;
+class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>;
+
class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>;
class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>;
@@ -380,6 +393,14 @@ class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> {
list<Register> Defs = [AT];
}
+class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+ bit isBranch = 1;
+ bit isIndirectBranch = 1;
+ bit hasDelaySlot = 1;
+ bit isTerminator=1;
+ bit isBarrier=1;
+}
+
class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rt);
@@ -389,17 +410,22 @@ class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>;
-class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ SDPatternOperator Op=null_frag> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
- list<dag> Pattern = [];
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+
+ // This instruction doesn't trap division by zero itself. We must insert
+ // teq instructions as well.
+ bit usesCustomInserter = 1;
}
-class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd>;
-class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd>;
-class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd>;
-class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd>;
+class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd, sdiv>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, udiv>;
+class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd, srem>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, urem>;
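
Note: the usesCustomInserter flag on the DIVMOD descriptions above reflects an R6 semantic change: div/mod no longer trap on a zero divisor, so to preserve pre-R6 behaviour the custom inserter is expected to emit a trap-if-equal (`teq`) comparing the divisor against $zero after each divide, conventionally with break code 7, the divide-by-zero code also used by GNU as.
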
class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
list<Register> Defs = [RA];
@@ -424,28 +450,35 @@ class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> {
class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
list<Register> Defs = [RA];
}
-class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+
+class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ SDPatternOperator Op=null_frag> {
dag OutOperandList = (outs GPROpnd:$rd);
dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
- list<dag> Pattern = [];
+ list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
}
-class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd>;
-class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd>;
-class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd>;
+class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd, mulhs>;
+class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, mulhu>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, mul>;
class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>;
-class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
dag OutOperandList = (outs FGROpnd:$fd);
- dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+ dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
- list<dag> Pattern = [];
+ list<dag> Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in,
+ FGROpnd:$ft,
+ FGROpnd:$fs))];
string Constraints = "$fd_in = $fd";
}
-class SEL_D_DESC : COP1_4R_DESC_BASE<"sel.d", FGR64Opnd>;
-class SEL_S_DESC : COP1_4R_DESC_BASE<"sel.s", FGR32Opnd>;
+class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
+ // We must insert a SUBREG_TO_REG around $fd_in
+ bit usesCustomInserter = 1;
+}
+class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>;
class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
dag OutOperandList = (outs GPROpnd:$rd);
@@ -457,6 +490,14 @@ class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+ dag OutOperandList = (outs FGROpnd:$fd);
+ dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+ string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+ list<dag> Pattern = [];
+ string Constraints = "$fd_in = $fd";
+}
+
class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>;
class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>;
class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>;
@@ -503,6 +544,96 @@ class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>;
+class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
+ RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+ string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+ list<dag> Pattern = [];
+}
+
+class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>;
+class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd>;
+
+class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
+ dag OutOperandList = (outs COPOpnd:$rt);
+ dag InOperandList = (ins mem_simm11:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+}
+
+class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>;
+class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd>;
+
+class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+}
+
+class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>;
+class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd>;
+
+class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+ Operand ImmOpnd> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2");
+ list<dag> Pattern = [];
+}
+
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>;
+
+class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayLoad = 1;
+}
+
+class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd>;
+
+class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$dst);
+ dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+ list<dag> Pattern = [];
+ bit mayStore = 1;
+ string Constraints = "$rt = $dst";
+}
+
+class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd>;
+
+class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+}
+
+class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+ list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))];
+}
+
+class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> :
+ CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd> {
+ list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))];
+}
+
+class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd>;
+class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd>;
+
+class SDBBP_R6_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins uimm20:$code_);
+ string AsmString = "sdbbp\t$code_";
+ list<dag> Pattern = [];
+}
+
//===----------------------------------------------------------------------===//
//
// Instruction Definitions
@@ -514,6 +645,7 @@ def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6;
def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6;
def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6;
def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
+def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6;
def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6;
def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6;
def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6;
@@ -523,8 +655,8 @@ def BC : BC_ENC, BC_DESC, ISA_MIPS32R6;
def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
-def BGEC; // Also aliased to blec with operands swapped
-def BGEUC; // Also aliased to bleuc with operands swapped
+def BGEC : BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
+def BGEUC : BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
@@ -532,8 +664,8 @@ def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
-def BLTC; // Also aliased to bgtc with operands swapped
-def BLTUC; // Also aliased to bgtuc with operands swapped
+def BLTC : BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
+def BLTUC : BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
@@ -541,15 +673,22 @@ def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+def CACHE_R6 : CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6;
def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6;
+def CLO_R6 : CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
+def CLZ_R6 : CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd>;
def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6;
def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6;
-// def LSA; // See MSA
+def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6;
+def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
+def LL_R6 : LL_R6_ENC, LL_R6_DESC, ISA_MIPS32R6;
+def LSA_R6 : LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
+def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6;
@@ -571,13 +710,115 @@ def MUHU : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
def MULU : MULU_ENC, MULU_DESC, ISA_MIPS32R6;
def NAL; // BAL with rd=0
+def PREF_R6 : PREF_ENC, PREF_DESC, ISA_MIPS32R6;
def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6;
def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6;
-def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6;
+def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6;
+def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
+def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6;
def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6;
-def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6;
+def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6;
def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6;
def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6;
def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6;
+def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// f32 comparisons supported via another comparison
+def : MipsPat<(setone f32:$lhs, f32:$rhs),
+ (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f32:$lhs, f32:$rhs),
+ (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f32:$lhs, f32:$rhs),
+ (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setne f32:$lhs, f32:$rhs),
+ (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6;
+
+// f64 comparisons supported via another comparison
+def : MipsPat<(setone f64:$lhs, f64:$rhs),
+ (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seto f64:$lhs, f64:$rhs),
+ (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(setune f64:$lhs, f64:$rhs),
+ (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
+def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>,
+ ISA_MIPS32R6;
+def : MipsPat<(setne f64:$lhs, f64:$rhs),
+ (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6;
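
Note: cmp.cond.fmt writes an all-ones (true) or all-zeros (false) mask, so conditions with no direct encoding are built by complementing a related condition with NOR, as in the patterns above. A semantic model (illustration only; cmpUeq is a hypothetical stand-in for CMP_UEQ_S):

    #include <cstdint>
    uint32_t cmpUeq(float A, float B) {       // unordered-or-equal
      return (A != A || B != B || A == B) ? 0xFFFFFFFFu : 0u;
    }
    uint32_t setOne(float A, float B) {       // ordered-and-not-equal
      return ~cmpUeq(A, B);                   // NOR(x, $zero) == ~x
    }
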
+
+// i32 selects
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+ (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f),
+ (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f),
+ (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+ (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+ (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
+ (OR (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)),
+ (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
+ i32:$f),
+ (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))),
+ (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>,
+ ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)),
+ i32:$t, i32:$f),
+ (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))),
+ (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>,
+ ISA_MIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+ (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz),
+ (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz),
+ (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+ (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f),
+ (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f),
+ (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6;
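
Note: the i32 select patterns above all reduce to one trick: SELNEZ passes its operand through when the condition is non-zero (else produces 0), SELEQZ does the opposite, and OR merges the two since at most one term is live. A semantic model (illustration only, not compiler code):

    #include <cstdint>
    uint32_t selnez(uint32_t V, uint32_t Cond) { return Cond != 0 ? V : 0; }
    uint32_t seleqz(uint32_t V, uint32_t Cond) { return Cond == 0 ? V : 0; }
    // select(Cond, T, F) on MIPS32r6:
    uint32_t selectR6(uint32_t Cond, uint32_t T, uint32_t F) {
      return selnez(T, Cond) | seleqz(F, Cond);  // exactly one term survives
    }
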
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 924b325..f0b6814 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -23,6 +23,8 @@ def uimm16_64 : Operand<i64> {
// Signed Operand
def simm10_64 : Operand<i64>;
+def imm64: Operand<i64>;
+
// Transformation Function - get Imm - 32.
def Subtract32 : SDNodeXForm<imm, [{
return getImm(N, (unsigned)N->getZExtValue() - 32);
@@ -36,6 +38,9 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
def immSExt10_64 : PatLeaf<(i64 imm),
[{ return isInt<10>(N->getSExtValue()); }]>;
+def immZExt16_64 : PatLeaf<(i64 imm),
+ [{ return isUInt<16>(N->getZExtValue()); }]>;
+
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
@@ -62,7 +67,7 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
let DecoderNamespace = "Mips64" in {
/// Arithmetic Instructions (ALU Immediate)
def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>,
- ISA_MIPS3;
+ ISA_MIPS3_NOT_32R6_64R6;
def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU,
immSExt16, add>,
ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
@@ -164,49 +169,58 @@ def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
ISA_MIPS3_NOT_32R6_64R6;
/// Load-linked, Store-conditional
-def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3;
-def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3;
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3_NOT_32R6_64R6;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
-def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
-def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
-def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
-def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
-def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
-def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
+ def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+ def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+ def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+ def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+ def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+ def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+ def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+ def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+ def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+ def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
}
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+
/// Multiply and Divide Instructions.
def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1c>, ISA_MIPS3;
+ MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1d>, ISA_MIPS3;
+ MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
- II_DMULT>;
+ II_DMULT>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
- II_DMULTU>;
+ II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6;
def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1e>, ISA_MIPS3;
+ MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
- MULT_FM<0, 0x1f>, ISA_MIPS3;
+ MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
- II_DDIV, 0, 1, 1>;
+ II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
- II_DDIVU, 0, 1, 1>;
+ II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
let isCodeGenOnly = 1 in {
-def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>;
-def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>;
-def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>;
-def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>;
-def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>;
-def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
-def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>,
+ ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6;
/// Sign Ext In Register Instructions.
def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
@@ -216,8 +230,8 @@ def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
}
/// Count Leading
-def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64;
-def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64;
+def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64_NOT_64R6;
+def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64_NOT_64R6;
/// Double Word Swap Bytes/HalfWords
def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2;
@@ -431,13 +445,13 @@ def : MipsInstAlias<"daddu $rs, $rt, $imm",
0>;
def : MipsInstAlias<"dadd $rs, $rt, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
- 0>;
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
def : MipsInstAlias<"daddu $rs, $imm",
(DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
0>;
def : MipsInstAlias<"dadd $rs, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
- 0>;
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
def : MipsInstAlias<"add $rs, $imm",
(ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
0>;
@@ -450,10 +464,22 @@ def : MipsInstAlias<"dsll $rd, $rt, $rs",
def : MipsInstAlias<"dsubu $rt, $rs, $imm",
(DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
InvertedImOperand64:$imm), 0>;
+def : MipsInstAlias<"dsubi $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsubi $rs, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $rt, $imm",
+ (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+ InvertedImOperand64:$imm),
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
def : MipsInstAlias<"dsub $rs, $imm",
(DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
InvertedImOperand64:$imm),
- 0>;
+ 0>, ISA_MIPS3_NOT_32R6_64R6;
def : MipsInstAlias<"dsubu $rs, $imm",
(DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
InvertedImOperand64:$imm),
@@ -465,6 +491,11 @@ def : MipsInstAlias<"dsrl $rd, $rt, $rs",
(DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
+class LoadImm64<string instr_asm, Operand Od, RegisterOperand RO> :
+ MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
+ !strconcat(instr_asm, "\t$rt, $imm64")>;
+def LoadImm64Reg : LoadImm64<"dli", imm64, GPR64Opnd>;
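+// LoadImm64Reg lets the assembler accept "dli $rd, imm" for arbitrary 64-bit
+// immediates; the pseudo is expanded later into a suitable materialization
+// sequence rather than being encoded directly.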
+
/// Move between CPU and coprocessor registers
let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index f971218..63cf60b 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -13,10 +13,6 @@
// Notes about removals/changes from MIPS32r6:
// Reencoded: dclo, dclz
-// Reencoded: lld, scd
-// Removed: daddi
-// Removed: ddiv, ddivu, dmult, dmultu
-// Removed: div, divu
//===----------------------------------------------------------------------===//
//
@@ -29,14 +25,20 @@ class DAUI_ENC : DAUI_FM;
class DAHI_ENC : REGIMM_FM<OPCODE5_DAHI>;
class DATI_ENC : REGIMM_FM<OPCODE5_DATI>;
class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>;
+class DCLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLO>;
+class DCLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLZ>;
class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>;
class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>;
+class DLSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_DLSA>;
class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>;
class DMODU_ENC : SPECIAL_3R_FM<0b00011, 0b011111>;
-class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b111000>;
-class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b111001>;
-class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>;
-class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>;
+class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b011100>;
+class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011101>;
+class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>;
+class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b011101>;
+class LDPC_ENC : PCREL18_FM<OPCODE3_LDPC>;
+class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
+class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
//===----------------------------------------------------------------------===//
//
@@ -56,14 +58,22 @@ class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>;
class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>;
class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd>;
class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>;
-class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd>;
-class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd>;
-class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd>;
-class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd>;
-class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd>;
-class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd>;
-class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd>;
+class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>;
+class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>;
+class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>;
+class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>;
+class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>;
+class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>;
+class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>;
+class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, mulhu>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, mul>;
class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>;
+class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3>;
+class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd>;
+class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd>;
+class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
+class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
//===----------------------------------------------------------------------===//
//
@@ -76,13 +86,132 @@ def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
+def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
+def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
-// def DLSA; // See MSA
+def DLSA_R6 : DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6;
def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
-def LDPC;
+def LDPC: LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
+def LLD_R6 : LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6;
+def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS64R6;
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+ def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
+ def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// i64 selects
+def : MipsPat<(select i64:$cond, i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, i64:$cond),
+ (SELEQZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, i64:$cond),
+ (SELNEZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, i64:$cond),
+ (SELEQZ64 i64:$f, i64:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+ (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+ (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+ ISA_MIPS64R6;
+def : MipsPat<
+ (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t,
+ (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)),
+ (SELNEZ64 i64:$f,
+ (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)))>,
+ ISA_MIPS64R6;
+def : MipsPat<
+ (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t,
+ (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)),
+ (SELNEZ64 i64:$f,
+ (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+ sub_32)))>,
+ ISA_MIPS64R6;
+
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz),
+ (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz),
+ (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f),
+ (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f),
+ (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+
+// i64 selects from an i32 comparison
+// One complicating factor here is that bits 32-63 of an i32 are undefined.
+// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets.
+// This would allow us to remove the sign-extensions here.
+def : MipsPat<(select i32:$cond, i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+ (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))),
+ (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+ (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))),
+ (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+ immZExt16:$imm))))>,
+ ISA_MIPS64R6;
+
+def : MipsPat<(select i32:$cond, i64:$t, immz),
+ (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz),
+ (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz),
+ (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select i32:$cond, immz, i64:$f),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
+ (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
+ (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
+ ISA_MIPS64R6;
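+
+// Note on SLL64_32 above: on MIPS64 a 32-bit sll sign-extends its result to
+// the full 64-bit register, so shifting the i32 condition left by zero
+// canonicalizes bits 32-63 before it is used as a SELEQZ64/SELNEZ64 operand.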
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 6df90aa..1fb75a2 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -91,7 +91,46 @@ bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
#include "MipsGenMCPseudoLowering.inc"
+// Lower PseudoReturn/PseudoReturn64/PseudoIndirectBranch/
+// PseudoIndirectBranch64 to JR, JR_MM, JALR, or JALR64 as appropriate for the
+// target.
+void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+ const MachineInstr *MI) {
+ bool HasLinkReg = false;
+ MCInst TmpInst0;
+
+ if (Subtarget->hasMips64r6()) {
+ // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+ TmpInst0.setOpcode(Mips::JALR64);
+ HasLinkReg = true;
+ } else if (Subtarget->hasMips32r6()) {
+ // MIPS32r6 should use (JALR ZERO, $rs)
+ TmpInst0.setOpcode(Mips::JALR);
+ HasLinkReg = true;
+ } else if (Subtarget->inMicroMipsMode()) {
+ // microMIPS should use (JR_MM $rs)
+ TmpInst0.setOpcode(Mips::JR_MM);
+ } else {
+ // Everything else should use (JR $rs)
+ TmpInst0.setOpcode(Mips::JR);
+ }
+
+ MCOperand MCOp;
+
+ if (HasLinkReg) {
+ unsigned ZeroReg = Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+ TmpInst0.addOperand(MCOperand::CreateReg(ZeroReg));
+ }
+
+ lowerOperand(MI->getOperand(0), MCOp);
+ TmpInst0.addOperand(MCOp);
+
+ EmitToStreamer(OutStreamer, TmpInst0);
+}
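+
+// For example, on MIPS32r6 "PseudoIndirectBranch $t9" is emitted as
+// "jalr $zero, $t9" (R6 re-encodes jr as jalr with a zero link register),
+// while pre-R6 targets still get a plain "jr $t9".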
+
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ MipsTargetStreamer &TS = getTargetStreamer();
+ TS.setCanHaveModuleDir(false);
+
if (MI->isDebugValue()) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
@@ -141,6 +180,14 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(OutStreamer, &*I))
continue;
+ if (I->getOpcode() == Mips::PseudoReturn ||
+ I->getOpcode() == Mips::PseudoReturn64 ||
+ I->getOpcode() == Mips::PseudoIndirectBranch ||
+ I->getOpcode() == Mips::PseudoIndirectBranch64) {
+ emitPseudoIndirectBranch(OutStreamer, &*I);
+ continue;
+ }
+
// The inMips16Mode() test is not permanent.
// Some instructions are marked as pseudo right now which
// would make the test fail for the wrong reason but
@@ -657,6 +704,13 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
OutContext.getELFSection(".gcc_compiled_long64", ELF::SHT_PROGBITS, 0,
SectionKind::getDataRel()));
}
+
+ getTargetStreamer().updateABIInfo(*Subtarget);
+ getTargetStreamer().emitDirectiveModuleFP();
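+ // These emit ABI description directives such as ".module fp=32",
+ // ".module fp=64" or ".module fp=xx" (and ".module oddspreg" /
+ // ".module nooddspreg" for O32 below).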
+
+ if (Subtarget->isABI_O32())
+ getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(),
+ Subtarget->isABI_O32());
}
void MipsAsmPrinter::EmitJal(MCSymbol *Symbol) {
@@ -852,7 +906,7 @@ void MipsAsmPrinter::EmitFPCallStub(
TS.emitDirectiveSetNoMicroMips();
//
// .ent __call_stub_fp_xxxx
- // .type __call_stub_fp_xxxx,@function
+ // .type __call_stub_fp_xxxx,@function
// __call_stub_fp_xxxx:
//
std::string x = "__call_stub_fp_" + std::string(Symbol);
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index e82b145..967aa0b 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -40,6 +40,12 @@ private:
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
+ // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
+ // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
+ // for the target.
+ void emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index c83d880..007213c 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -239,6 +239,11 @@ def RetCC_Mips : CallingConv<[
def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
(sequence "S%u", 7, 0))>;
+def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+ (sequence "S%u", 7, 0))> {
+ let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2));
+}
+
def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
(sequence "S%u", 7, 0))>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 13fa546..151ef13 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -124,6 +124,7 @@ private:
unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getSimm18Lsl3Encoding(const MachineInstr &MI, unsigned OpNo) const;
/// Expand pseudo instructions with accumulator register operands.
void expandACCInstr(MachineBasicBlock::instr_iterator MI,
@@ -273,6 +274,12 @@ unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
return 0;
}
+unsigned MipsCodeEmitter::getSimm18Lsl3Encoding(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI,
unsigned OpNo) const {
llvm_unreachable("Unimplemented function.");
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 7177f65..690f626 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -104,136 +104,162 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
// Instantiation of instructions.
def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>, INSN_MIPS4_32;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in {
def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
- ADD_FM<0, 0xa>;
+ ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
}
def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>, INSN_MIPS4_32;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in {
def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
- ADD_FM<0, 0xb>;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
- ADD_FM<0, 0xb>;
+ ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
}
def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, INSN_MIPS4_32;
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
- CMov_I_F_FM<18, 16>, AdditionalRequires<[HasMips64]>;
+ CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
+ AdditionalRequires<[HasMips64]>;
def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, INSN_MIPS4_32;
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
- CMov_I_F_FM<19, 16>, AdditionalRequires<[IsGP64bit]>;
+ CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6,
+ AdditionalRequires<[IsGP64bit]>;
def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
II_MOVZ_D>, CMov_I_F_FM<18, 17>,
- INSN_MIPS4_32, FGR_32;
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
II_MOVN_D>, CMov_I_F_FM<19, 17>,
- INSN_MIPS4_32, FGR_32;
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
let DecoderNamespace = "Mips64" in {
def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
- CMov_I_F_FM<18, 17>, INSN_MIPS4_32, FGR_64;
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
- CMov_I_F_FM<19, 17>, INSN_MIPS4_32, FGR_64;
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
let isCodeGenOnly = 1 in {
- def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
- II_MOVZ_D>, CMov_I_F_FM<18, 17>, FGR_64;
- def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
- II_MOVN_D>, CMov_I_F_FM<19, 17>, FGR_64;
+ def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+ CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+ def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+ CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
}
}
def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, INSN_MIPS4_32;
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
- CMov_F_I_FM<1>, AdditionalRequires<[IsGP64bit]>;
+ CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6,
+ AdditionalRequires<[IsGP64bit]>;
def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, INSN_MIPS4_32;
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
let isCodeGenOnly = 1 in
def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
- CMov_F_I_FM<0>, AdditionalRequires<[IsGP64bit]>;
+ CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6,
+ AdditionalRequires<[IsGP64bit]>;
def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
- CMov_F_F_FM<16, 1>, INSN_MIPS4_32;
+ CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
- CMov_F_F_FM<16, 0>, INSN_MIPS4_32;
+ CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
- INSN_MIPS4_32, FGR_32;
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
- INSN_MIPS4_32, FGR_32;
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
let DecoderNamespace = "Mips64" in {
def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
- CMov_F_F_FM<17, 1>, INSN_MIPS4_32, FGR_64;
+ CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
- CMov_F_F_FM<17, 0>, INSN_MIPS4_32, FGR_64;
+ CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
}
// Instantiation of conditional move patterns.
-defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
-defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>;
-defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>;
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
-defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>, GPR_64;
+defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
- GPR_64;
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
GPR_64;
-defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>, GPR_64;
-defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>, GPR_64;
-defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>, GPR_64;
-defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>, GPR_64;
-defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>, GPR_64;
-defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>, GPR_64;
-
-defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>;
-
-defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, GPR_64;
-defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, GPR_64;
-defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, GPR_64;
-defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
-defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>;
-defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>;
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+ GPR_64;
+defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
GPR_64;
-defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, GPR_64;
-defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, GPR_64;
-defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>, FGR_32;
-defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, FGR_32;
-defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, FGR_32;
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_32;
-defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>, FGR_64;
+defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+ INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+ FGR_64;
+defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
FGR_64;
-defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, FGR_64;
-defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>, FGR_64;
-defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, FGR_64;
-defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, FGR_64;
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
index cf09113..b5d52ce 100644
--- a/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-def HasDSP : Predicate<"Subtarget.hasDSP()">,
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
AssemblerPredicate<"FeatureDSP">;
-def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">,
+def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">,
AssemblerPredicate<"FeatureDSPR2">;
// Fields.
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index d6c7cac..bcfbc12 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -177,6 +178,13 @@ namespace {
for (MachineFunction::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI)
Changed |= runOnMachineBasicBlock(*FI);
+
+ // This pass invalidates liveness information when it reorders
+ // instructions to fill delay slot. Without this, -verify-machineinstrs
+ // will fail.
+ if (Changed)
+ F.getRegInfo().invalidateLiveness();
+
return Changed;
}
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 268a0ed..617801b 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -12,6 +12,7 @@
#include "MipsISelLowering.h"
#include "MipsMachineFunction.h"
#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
using namespace llvm;
@@ -36,11 +37,11 @@ class MipsFastISel final : public FastISel {
/// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
/// make the right decision when generating code for different targets.
- const MipsSubtarget *Subtarget;
Module &M;
const TargetMachine &TM;
const TargetInstrInfo &TII;
const TargetLowering &TLI;
+ const MipsSubtarget *Subtarget;
MipsFunctionInfo *MFI;
// Convenience variables to avoid some queries.
@@ -54,8 +55,8 @@ public:
: FastISel(funcInfo, libInfo),
M(const_cast<Module &>(*funcInfo.Fn->getParent())),
TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()),
- TLI(*TM.getTargetLowering()) {
- Subtarget = &TM.getSubtarget<MipsSubtarget>();
+ TLI(*TM.getTargetLowering()),
+ Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
Context = &funcInfo.Fn->getContext();
TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
@@ -68,8 +69,11 @@ public:
bool ComputeAddress(const Value *Obj, Address &Addr);
private:
+ bool EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ unsigned Alignment = 0);
bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
unsigned Alignment = 0);
+ bool SelectLoad(const Instruction *I);
bool SelectRet(const Instruction *I);
bool SelectStore(const Instruction *I);
@@ -80,6 +84,36 @@ private:
unsigned MaterializeGV(const GlobalValue *GV, MVT VT);
unsigned MaterializeInt(const Constant *C, MVT VT);
unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+
+ // For some reason this default is not generated by tablegen,
+ // so we explicitly generate it here.
+ //
+ unsigned FastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill, uint64_t imm1,
+ uint64_t imm2, unsigned Op3, bool Op3IsKill) {
+ return 0;
+ }
+
+ MachineInstrBuilder EmitInst(unsigned Opc) {
+ return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ }
+
+ MachineInstrBuilder EmitInst(unsigned Opc, unsigned DstReg) {
+ return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ DstReg);
+ }
+
+ MachineInstrBuilder EmitInstStore(unsigned Opc, unsigned SrcReg,
+ unsigned MemReg, int64_t MemOffset) {
+ return EmitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
+ }
+
+ MachineInstrBuilder EmitInstLoad(unsigned Opc, unsigned DstReg,
+ unsigned MemReg, int64_t MemOffset) {
+ return EmitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
+ }
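+
+ // For example, EmitInstStore(Mips::SW, SrcReg, BaseReg, Off) is shorthand
+ // for BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW))
+ // .addReg(SrcReg).addReg(BaseReg).addImm(Off).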
+
+#include "MipsGenFastISel.inc"
};
bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
@@ -100,6 +134,8 @@ bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
// We will extend this in a later patch:
// If this is a type that can be sign or zero-extended to a basic operation
// go ahead and accept it now.
+ if (VT == MVT::i8 || VT == MVT::i16)
+ return true;
return false;
}
@@ -116,6 +152,45 @@ bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) {
return Addr.Base.Reg != 0;
}
+bool MipsFastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ unsigned Alignment) {
+ //
+ // more cases will be handled here in following patches.
+ //
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ case MVT::i32: {
+ ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Opc = Mips::LW;
+ break;
+ }
+ case MVT::i16: {
+ ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Opc = Mips::LHu;
+ break;
+ }
+ case MVT::i8: {
+ ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Opc = Mips::LBu;
+ break;
+ }
+ case MVT::f32: {
+ ResultReg = createResultReg(&Mips::FGR32RegClass);
+ Opc = Mips::LWC1;
+ break;
+ }
+ case MVT::f64: {
+ ResultReg = createResultReg(&Mips::AFGR64RegClass);
+ Opc = Mips::LDC1;
+ break;
+ }
+ default:
+ return false;
+ }
+ EmitInstLoad(Opc, ResultReg, Addr.Base.Reg, Addr.Offset);
+ return true;
+}
+
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) {
@@ -141,12 +216,49 @@ bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
//
// more cases will be handled here in following patches.
//
- if (VT != MVT::i32)
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ Opc = Mips::SB;
+ break;
+ case MVT::i16:
+ Opc = Mips::SH;
+ break;
+ case MVT::i32:
+ Opc = Mips::SW;
+ break;
+ case MVT::f32:
+ Opc = Mips::SWC1;
+ break;
+ case MVT::f64:
+ Opc = Mips::SDC1;
+ break;
+ default:
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW))
- .addReg(SrcReg)
- .addReg(Addr.Base.Reg)
- .addImm(Addr.Offset);
+ }
+ EmitInstStore(Opc, SrcReg, Addr.Base.Reg, Addr.Offset);
+ return true;
+}
+
+bool MipsFastISel::SelectLoad(const Instruction *I) {
+ // Atomic loads need special handling.
+ if (cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ // Verify we have a legal type before going any further.
+ MVT VT;
+ if (!isLoadTypeLegal(I->getType(), VT))
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!ComputeAddress(I->getOperand(0), Addr))
+ return false;
+
+ unsigned ResultReg;
+ if (!EmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+ return false;
+ UpdateValueMap(I, ResultReg);
return true;
}
@@ -186,8 +298,7 @@ bool MipsFastISel::SelectRet(const Instruction *I) {
if (Ret->getNumOperands() > 0) {
return false;
}
- unsigned RetOpc = Mips::RetRA;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc));
+ EmitInst(Mips::RetRA);
return true;
}
@@ -197,6 +308,8 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
default:
break;
+ case Instruction::Load:
+ return SelectLoad(I);
case Instruction::Store:
return SelectStore(I);
case Instruction::Ret:
@@ -207,6 +320,22 @@ bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
}
unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) {
+ int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ if (VT == MVT::f32) {
+ const TargetRegisterClass *RC = &Mips::FGR32RegClass;
+ unsigned DestReg = createResultReg(RC);
+ unsigned TempReg = Materialize32BitInt(Imm, &Mips::GPR32RegClass);
+ EmitInst(Mips::MTC1, DestReg).addReg(TempReg);
+ return DestReg;
+ } else if (VT == MVT::f64) {
+ const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
+ unsigned DestReg = createResultReg(RC);
+ unsigned TempReg1 = Materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
+ unsigned TempReg2 =
+ Materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
+ EmitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1);
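+ // (e.g. for 1.0, bit pattern 0x3FF0000000000000: TempReg1 = 0x3FF00000,
+ // TempReg2 = 0x00000000, paired into the 64-bit FP destination)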
+ return DestReg;
+ }
return 0;
}
@@ -221,9 +350,8 @@ unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) {
// TLS not supported at this time.
if (IsThreadLocal)
return 0;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LW), DestReg)
- .addReg(MFI->getGlobalBaseReg())
- .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+ EmitInst(Mips::LW, DestReg).addReg(MFI->getGlobalBaseReg()).addGlobalAddress(
+ GV, 0, MipsII::MO_GOT);
return DestReg;
}
unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) {
@@ -245,15 +373,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
if (isInt<16>(Imm)) {
unsigned Opc = Mips::ADDiu;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Mips::ZERO)
- .addImm(Imm);
+ EmitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm);
return ResultReg;
} else if (isUInt<16>(Imm)) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi),
- ResultReg)
- .addReg(Mips::ZERO)
- .addImm(Imm);
+ EmitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm);
return ResultReg;
}
unsigned Lo = Imm & 0xFFFF;
@@ -261,16 +384,10 @@ unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
if (Lo) {
// Both Lo and Hi have nonzero bits.
unsigned TmpReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi),
- TmpReg).addImm(Hi);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi),
- ResultReg)
- .addReg(TmpReg)
- .addImm(Lo);
-
+ EmitInst(Mips::LUi, TmpReg).addImm(Hi);
+ EmitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
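+ // (e.g. Imm = 0x12345678 becomes "lui $tmp, 0x1234; ori $rd, $tmp, 0x5678")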
} else {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi),
- ResultReg).addImm(Hi);
+ EmitInst(Mips::LUi, ResultReg).addImm(Hi);
}
return ResultReg;
}
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index e10a3a5..8e9196c 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -15,7 +15,6 @@
#define MIPS_FRAMEINFO_H
#include "Mips.h"
-#include "MipsSubtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 90cff63..0bdabf3 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -47,6 +47,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+ Subtarget = &TM.getSubtarget<MipsSubtarget>();
bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
processFunctionAfterISel(MF);
@@ -202,7 +203,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
#ifndef NDEBUG
case ISD::LOAD:
case ISD::STORE:
- assert((Subtarget.systemSupportsUnalignedAccess() ||
+ assert((Subtarget->systemSupportsUnalignedAccess() ||
cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
cast<MemSDNode>(Node)->getAlignment()) &&
"Unexpected unaligned loads/stores.");
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 13becb6..2a6c875 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -32,7 +32,7 @@ namespace llvm {
class MipsDAGToDAGISel : public SelectionDAGISel {
public:
explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
- : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
+ : SelectionDAGISel(TM), Subtarget(&TM.getSubtarget<MipsSubtarget>()) {}
// Pass Name
const char *getPassName() const override {
@@ -46,7 +46,7 @@ protected:
/// Keep a pointer to the MipsSubtarget around so that we can make the right
/// decision when generating code for different targets.
- const MipsSubtarget &Subtarget;
+ const MipsSubtarget *Subtarget;
private:
// Include the pieces autogenerated from the target description.
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index bfe5ea1..b7af2d4 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -215,6 +215,11 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
// setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+ // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
+ // does. Integer booleans still use 0 and 1.
+ if (Subtarget->hasMips32r6())
+ setBooleanContents(ZeroOrOneBooleanContent,
+ ZeroOrNegativeOneBooleanContent);
// Load extended operations for i1 types must be promoted
setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
@@ -251,7 +256,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
- if (isGP64bit()) {
+ if (Subtarget->isGP64bit()) {
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
@@ -263,14 +268,14 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
}
- if (!isGP64bit()) {
+ if (!Subtarget->isGP64bit()) {
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
setOperationAction(ISD::ADD, MVT::i32, Custom);
- if (isGP64bit())
+ if (Subtarget->isGP64bit())
setOperationAction(ISD::ADD, MVT::i64, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -287,7 +292,8 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
@@ -368,7 +374,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
if (!Subtarget->hasMips64r2())
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
- if (isGP64bit()) {
+ if (Subtarget->isGP64bit()) {
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
@@ -384,12 +390,13 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
- setMinFunctionAlignment(isGP64bit() ? 3 : 2);
+ setMinFunctionAlignment(Subtarget->isGP64bit() ? 3 : 2);
- setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP);
+ setStackPointerRegisterToSaveRestore(Subtarget->isABI_N64() ? Mips::SP_64
+ : Mips::SP);
- setExceptionPointerRegister(isN64() ? Mips::A0_64 : Mips::A0);
- setExceptionSelectorRegister(isN64() ? Mips::A1_64 : Mips::A1);
+ setExceptionPointerRegister(Subtarget->isABI_N64() ? Mips::A0_64 : Mips::A0);
+ setExceptionSelectorRegister(Subtarget->isABI_N64() ? Mips::A1_64 : Mips::A1);
MaxStoresPerMemcpy = 16;
@@ -815,10 +822,10 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
return VReg;
}
-static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
- MachineBasicBlock &MBB,
- const TargetInstrInfo &TII,
- bool Is64Bit) {
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr *MI,
+ MachineBasicBlock &MBB,
+ const TargetInstrInfo &TII,
+ bool Is64Bit) {
if (NoZeroDivCheck)
return &MBB;
@@ -836,6 +843,10 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
// Clear Divisor's kill flag.
Divisor.setIsKill(false);
+
+ // We would normally delete the original instruction here but in this case
+ // we only needed to inject an additional instruction rather than replace it.
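+ //
+ // The injected guard is a trap-if-equal against $zero placed right after
+ // the division, roughly:
+ //   div ...
+ //   teq $divisor, $zero, 7   # trap when the divisor is zero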
+
return &MBB;
}
@@ -918,10 +929,22 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return emitAtomicCmpSwap(MI, BB, 8);
case Mips::PseudoSDIV:
case Mips::PseudoUDIV:
- return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), false);
+ case Mips::DIV:
+ case Mips::DIVU:
+ case Mips::MOD:
+ case Mips::MODU:
+ return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+ false);
case Mips::PseudoDSDIV:
case Mips::PseudoDUDIV:
- return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), true);
+ case Mips::DDIV:
+ case Mips::DDIVU:
+ case Mips::DMOD:
+ case Mips::DMODU:
+ return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
+ true);
+ case Mips::SEL_D:
+ return emitSEL_D(MI, BB);
}
}
@@ -941,16 +964,20 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned LL, SC, AND, NOR, ZERO, BEQ;
if (Size == 4) {
- LL = isMicroMips ? Mips::LL_MM : Mips::LL;
- SC = isMicroMips ? Mips::SC_MM : Mips::SC;
+ if (isMicroMips) {
+ LL = Mips::LL_MM;
+ SC = Mips::SC_MM;
+ } else {
+ LL = Subtarget->hasMips32r6() ? Mips::LL_R6 : Mips::LL;
+ SC = Subtarget->hasMips32r6() ? Mips::SC_R6 : Mips::SC;
+ }
AND = Mips::AND;
NOR = Mips::NOR;
ZERO = Mips::ZERO;
BEQ = Mips::BEQ;
- }
- else {
- LL = Mips::LLD;
- SC = Mips::SCD;
+ } else {
+ LL = Subtarget->hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+ SC = Subtarget->hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
AND = Mips::AND64;
NOR = Mips::NOR64;
ZERO = Mips::ZERO_64;
@@ -1012,11 +1039,39 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
return exitMBB;
}
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode,
- bool Nand) const {
+MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
+ MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+ unsigned SrcReg) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Subtarget->hasMips32r2() && Size == 1) {
+ BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
+ return BB;
+ }
+
+ if (Subtarget->hasMips32r2() && Size == 2) {
+ BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
+ return BB;
+ }
+
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+ unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+
+ assert(Size < 32);
+ int64_t ShiftImm = 32 - (Size * 8);
+
+ BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
+ BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);
+
+ return BB;
+}
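+
+// For example, Size == 1 without mips32r2 emits "sll $tmp, $src, 24" then
+// "sra $dst, $tmp, 24", while mips32r2 and later use the single-instruction
+// seb/seh forms.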
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
+ MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+ bool Nand) const {
assert((Size == 1 || Size == 2) &&
"Unsupported size for EmitAtomicBinaryPartial.");
@@ -1046,7 +1101,6 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned SllRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
@@ -1152,19 +1206,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
// sinkMBB:
// and maskedoldval1,oldval,mask
// srl srlres,maskedoldval1,shiftamt
- // sll sllres,srlres,24
- // sra dest,sllres,24
+ // sign_extend dest,srlres
BB = sinkMBB;
- int64_t ShiftImm = (Size == 1) ? 24 : 16;
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
.addReg(OldVal).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
.addReg(MaskedOldVal1).addReg(ShiftAmt);
- BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
- .addReg(SrlRes).addImm(ShiftImm);
- BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
- .addReg(SllRes).addImm(ShiftImm);
+ BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
MI->eraseFromParent(); // The instruction is gone now.
@@ -1285,7 +1334,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
unsigned StoreVal = RegInfo.createVirtualRegister(RC);
unsigned SrlRes = RegInfo.createVirtualRegister(RC);
- unsigned SllRes = RegInfo.createVirtualRegister(RC);
unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
@@ -1382,23 +1430,44 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// sinkMBB:
// srl srlres,maskedoldval0,shiftamt
- // sll sllres,srlres,24
- // sra dest,sllres,24
+ // sign_extend dest,srlres
BB = sinkMBB;
- int64_t ShiftImm = (Size == 1) ? 24 : 16;
BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
.addReg(MaskedOldVal0).addReg(ShiftAmt);
- BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
- .addReg(SrlRes).addImm(ShiftImm);
- BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
- .addReg(SllRes).addImm(ShiftImm);
+ BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
MI->eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock::iterator II(MI);
+
+ unsigned Fc = MI->getOperand(1).getReg();
+ const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
+
+ unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
+
+ BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2)
+ .addImm(0)
+ .addReg(Fc)
+ .addImm(Mips::sub_lo);
+
+ // We don't erase the original instruction, we just replace the condition
+ // register with the 64-bit super-register.
+ MI->getOperand(1).setReg(Fc2);
+
+ return BB;
+}
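+
+// The condition produced by cmp.cond.fmt lives in a 32-bit FGR, while SEL_D
+// consumes a 64-bit register, so instead of re-emitting the compare we widen
+// the condition with SUBREG_TO_REG through the sub_lo subregister index.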
+
//===----------------------------------------------------------------------===//
// Misc Lower Operation implementation
//===----------------------------------------------------------------------===//
@@ -1421,7 +1490,8 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
0);
Chain = Addr.getValue(1);
- if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || isN64()) {
+ if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ||
+ Subtarget->isABI_N64()) {
// For PIC, the sequence is:
// BRIND(load(Jumptable + index) + RelocBase)
// RelocBase can be JumpTable, GOT or some sort of global base.
@@ -1439,6 +1509,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc DL(Op);
+ assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
// Return if flag is not set by a floating point comparison.
@@ -1458,6 +1529,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue MipsTargetLowering::
lowerSELECT(SDValue Op, SelectionDAG &DAG) const
{
+ assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
// Return if flag is not set by a floating point comparison.
@@ -1483,6 +1555,7 @@ lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
}
SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
SDValue Cond = createFPCmp(DAG, Op);
assert(Cond.getOpcode() == MipsISD::FPCmp &&
@@ -1502,7 +1575,8 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64()) {
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+ !Subtarget->isABI_N64()) {
const MipsTargetObjectFile &TLOF =
(const MipsTargetObjectFile&)getObjFileLowering();
@@ -1521,15 +1595,18 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
}
if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
- return getAddrLocal(N, Ty, DAG, isN32() || isN64());
+ return getAddrLocal(N, Ty, DAG,
+ Subtarget->isABI_N32() || Subtarget->isABI_N64());
if (LargeGOT)
return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
MipsII::MO_GOT_LO16, DAG.getEntryNode(),
MachinePointerInfo::getGOT());
- return getAddrGlobal(N, Ty, DAG, (isN32() || isN64()) ? MipsII::MO_GOT_DISP
- : MipsII::MO_GOT16,
+ return getAddrGlobal(N, Ty, DAG,
+ (Subtarget->isABI_N32() || Subtarget->isABI_N64())
+ ? MipsII::MO_GOT_DISP
+ : MipsII::MO_GOT16,
DAG.getEntryNode(), MachinePointerInfo::getGOT());
}
@@ -1538,10 +1615,12 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+ !Subtarget->isABI_N64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, isN32() || isN64());
+ return getAddrLocal(N, Ty, DAG,
+ Subtarget->isABI_N32() || Subtarget->isABI_N64());
}
SDValue MipsTargetLowering::
@@ -1579,7 +1658,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, PtrTy, TlsGetAddr, &Args, 0);
+ .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ret = CallResult.first;
@@ -1629,10 +1708,12 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+ !Subtarget->isABI_N64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, isN32() || isN64());
+ return getAddrLocal(N, Ty, DAG,
+ Subtarget->isABI_N32() || Subtarget->isABI_N64());
}
SDValue MipsTargetLowering::
@@ -1650,10 +1731,12 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
EVT Ty = Op.getValueType();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !isN64())
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
+ !Subtarget->isABI_N64())
return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(N, Ty, DAG, isN32() || isN64());
+ return getAddrLocal(N, Ty, DAG,
+ Subtarget->isABI_N32() || Subtarget->isABI_N64());
}
SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1784,8 +1867,9 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MFI->setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
- SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
- isN64() ? Mips::FP_64 : Mips::FP, VT);
+ SDValue FrameAddr =
+ DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+ Subtarget->isABI_N64() ? Mips::FP_64 : Mips::FP, VT);
return FrameAddr;
}
@@ -1801,7 +1885,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
MVT VT = Op.getSimpleValueType();
- unsigned RA = isN64() ? Mips::RA_64 : Mips::RA;
+ unsigned RA = Subtarget->isABI_N64() ? Mips::RA_64 : Mips::RA;
MFI->setReturnAddressIsTaken(true);
// Return RA, which contains the return address. Mark it an implicit live-in.
@@ -1823,12 +1907,12 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc DL(Op);
- EVT Ty = isN64() ? MVT::i64 : MVT::i32;
+ EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32;
// Store stack offset in V1, store jump target in V0. Glue CopyToReg and
// EH_RETURN nodes, so that instructions are emitted back-to-back.
- unsigned OffsetReg = isN64() ? Mips::V1_64 : Mips::V1;
- unsigned AddrReg = isN64() ? Mips::V0_64 : Mips::V0;
+ unsigned OffsetReg = Subtarget->isABI_N64() ? Mips::V1_64 : Mips::V1;
+ unsigned AddrReg = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0;
Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
@@ -2256,8 +2340,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
// in PIC mode) allow symbols to be resolved via lazy binding.
// The lazy binding stub requires GP to point to the GOT.
if (IsPICCall && !InternalLinkage) {
- unsigned GPReg = isN64() ? Mips::GP_64 : Mips::GP;
- EVT Ty = isN64() ? MVT::i64 : MVT::i32;
+ unsigned GPReg = Subtarget->isABI_N64() ? Mips::GP_64 : Mips::GP;
+ EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32;
RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
}
@@ -2326,8 +2410,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
getTargetMachine(), ArgLocs, *DAG.getContext());
MipsCC::SpecialCallingConvType SpecialCallingConv =
getSpecialCallingConv(Callee);
- MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo,
- SpecialCallingConv);
+ MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
+ CCInfo, SpecialCallingConv);
MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
Subtarget->mipsSEUsesSoftFloat(),
@@ -2360,7 +2444,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
SDValue StackPtr = DAG.getCopyFromReg(
- Chain, DL, isN64() ? Mips::SP_64 : Mips::SP, getPointerTy());
+ Chain, DL, Subtarget->isABI_N64() ? Mips::SP_64 : Mips::SP,
+ getPointerTy());
// With EABI it is possible to have 16 args in registers.
std::deque< std::pair<unsigned, SDValue> > RegsToPass;
@@ -2446,8 +2531,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- bool IsPICCall = (isN64() || IsPIC); // true if calls are translated to
- // jalr $25
+ bool IsPICCall =
+ (Subtarget->isABI_N64() || IsPIC); // true if calls are translated to
+ // jalr $25
bool GlobalOrExternal = false, InternalLinkage = false;
SDValue CalleeLo;
EVT Ty = Callee.getValueType();
@@ -2458,7 +2544,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InternalLinkage = Val->hasInternalLinkage();
if (InternalLinkage)
- Callee = getAddrLocal(G, Ty, DAG, isN32() || isN64());
+ Callee = getAddrLocal(G, Ty, DAG,
+ Subtarget->isABI_N32() || Subtarget->isABI_N64());
else if (LargeGOT)
Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
@@ -2474,7 +2561,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- if (!isN64() && !IsPIC) // !N64 && static
+ if (!Subtarget->isABI_N64() && !IsPIC) // !N64 && static
Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
MipsII::MO_NO_FLAG);
else if (LargeGOT)
@@ -2525,7 +2612,8 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), RVLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
+ CCInfo);
MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
CallNode, RetTy);
@@ -2572,7 +2660,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
+ CCInfo);
Function::const_arg_iterator FuncArg =
DAG.getMachineFunction().getFunction()->arg_begin();
bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
@@ -2634,7 +2723,8 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
(RegVT == MVT::i64 && ValVT == MVT::f64) ||
(RegVT == MVT::f64 && ValVT == MVT::i64))
ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
- else if (isO32() && RegVT == MVT::i32 && ValVT == MVT::f64) {
+ else if (Subtarget->isABI_O32() && RegVT == MVT::i32 &&
+ ValVT == MVT::f64) {
unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
getNextIntArgReg(ArgReg), RC);
SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
@@ -2672,7 +2762,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
unsigned Reg = MipsFI->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(
- getRegClassFor(isN64() ? MVT::i64 : MVT::i32));
+ getRegClassFor(Subtarget->isABI_N64() ? MVT::i64 : MVT::i32));
MipsFI->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
@@ -2723,7 +2813,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
*DAG.getContext());
- MipsCC MipsCCInfo(CallConv, isO32(), Subtarget->isFP64bit(), CCInfo);
+ MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
+ CCInfo);
// Analyze return values.
MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
@@ -2759,7 +2850,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
- unsigned V0 = isN64() ? Mips::V0_64 : Mips::V0;
+ unsigned V0 = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0;
Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
Flag = Chain.getValue(1);
@@ -2980,9 +3071,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
return std::make_pair(0U, &Mips::CPU16RegsRegClass);
return std::make_pair(0U, &Mips::GPR32RegClass);
}
- if (VT == MVT::i64 && !isGP64bit())
+ if (VT == MVT::i64 && !Subtarget->isGP64bit())
return std::make_pair(0U, &Mips::GPR32RegClass);
- if (VT == MVT::i64 && isGP64bit())
+ if (VT == MVT::i64 && Subtarget->isGP64bit())
return std::make_pair(0U, &Mips::GPR64RegClass);
// This will generate an error message
return std::make_pair(0U, nullptr);
@@ -3169,7 +3260,7 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
unsigned MipsTargetLowering::getJumpTableEncoding() const {
- if (isN64())
+ if (Subtarget->isABI_N64())
return MachineJumpTableInfo::EK_GPRel64BlockAddress;
return TargetLowering::getJumpTableEncoding();
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 4ac33bf..4701bc4 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -17,7 +17,6 @@
#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips.h"
-#include "MipsSubtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
@@ -210,6 +209,7 @@ namespace llvm {
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
class MipsFunctionInfo;
+ class MipsSubtarget;
class MipsTargetLowering : public TargetLowering {
bool isMicroMips;
@@ -438,12 +438,6 @@ namespace llvm {
// Subtarget Info
const MipsSubtarget *Subtarget;
- bool hasMips64() const { return Subtarget->hasMips64(); }
- bool isGP64bit() const { return Subtarget->isGP64bit(); }
- bool isO32() const { return Subtarget->isABI_O32(); }
- bool isN32() const { return Subtarget->isABI_N32(); }
- bool isN64() const { return Subtarget->isABI_N64(); }
-
private:
// Create a TargetGlobalAddress node.
SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
@@ -598,6 +592,12 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
+ /// Emit a sign-extension using sll/sra, seb, or seh appropriately.
+ MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Size, unsigned DstReg,
+                                              unsigned SrcReg) const;
+
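A hedged sketch of the choice the new helper makes, reconstructed from the
sll/sra pair it replaces and the seb/seh mention in the comment above; the
helper name is made up:

  static void pickSignExtendToI32(bool HasSeInReg, unsigned Size,
                                  unsigned &Opc, int64_t &ShiftImm) {
    if (HasSeInReg) {            // seb/seh exist from MIPS32r2 onwards
      Opc = (Size == 1) ? Mips::SEB : Mips::SEH;
      ShiftImm = 0;
    } else {                     // classic pair: sll then sra
      Opc = Mips::SLL;           // followed by Mips::SRA, same immediate
      ShiftImm = (Size == 1) ? 24 : 16;
    }
  }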
MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Size, unsigned BinOpcode, bool Nand = false) const;
MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI,
@@ -607,6 +607,7 @@ namespace llvm {
MachineBasicBlock *BB, unsigned Size) const;
MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
MachineBasicBlock *BB, unsigned Size) const;
+ MachineBasicBlock *emitSEL_D(MachineInstr *MI, MachineBasicBlock *BB) const;
};
/// Create MipsTargetLowering objects.
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 32cda3b..2260d53 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -57,13 +57,13 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
// Feature predicates.
//===----------------------------------------------------------------------===//
-def IsFP64bit : Predicate<"Subtarget.isFP64bit()">,
+def IsFP64bit : Predicate<"Subtarget->isFP64bit()">,
AssemblerPredicate<"FeatureFP64Bit">;
-def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">,
+def NotFP64bit : Predicate<"!Subtarget->isFP64bit()">,
AssemblerPredicate<"!FeatureFP64Bit">;
-def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">,
+def IsSingleFloat : Predicate<"Subtarget->isSingleFloat()">,
AssemblerPredicate<"FeatureSingleFloat">;
-def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">,
+def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
AssemblerPredicate<"!FeatureSingleFloat">;
//===----------------------------------------------------------------------===//
@@ -153,6 +153,15 @@ class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
[(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>;
+class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+ InstrItinClass Itin> :
+ InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt),
+ !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr> {
+ // $fs_in is part of a white lie to work around a widespread bug in the FPU
+ // implementation. See expandBuildPairF64 for details.
+ let Constraints = "$fs = $fs_in";
+}
+
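A hedged sketch of why the tied operand matters when building an f64 from two
GPRs; the helper and register names are hypothetical, not this patch's code:

  static unsigned buildPairWithMTHC1(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     DebugLoc DL, const TargetInstrInfo *TII,
                                     MachineRegisterInfo &MRI,
                                     unsigned LoPair, unsigned HiGPR) {
    // Without the "$fs = $fs_in" tie, nothing records that the low half
    // already written into LoPair flows into the mthc1 result, so that
    // earlier write could be scheduled away or treated as dead.
    unsigned Dst = MRI.createVirtualRegister(&Mips::AFGR64RegClass);
    BuildMI(MBB, I, DL, TII->get(Mips::MTHC1_D32), Dst)
        .addReg(LoPair)  // $fs_in, tied to the result
        .addReg(HiGPR);  // $rt, the GPR holding the upper 32 bits
    return Dst;
  }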
class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> :
InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
@@ -249,11 +258,11 @@ multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
}
-defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>;
-defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>,
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
AdditionalRequires<[NotFP64bit]>;
let DecoderNamespace = "Mips64" in
-defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>,
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
AdditionalRequires<[IsFP64bit]>;
//===----------------------------------------------------------------------===//
@@ -355,8 +364,12 @@ def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
bitconvert>, MFC1_FM<4>;
def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2;
-def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
- MFC1_FM<7>, ISA_MIPS32R2;
+def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+               MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+               MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
+ let DecoderNamespace = "Mips64";
+}
def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
bitconvert>, MFC1_FM<1>, ISA_MIPS3;
def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
@@ -390,56 +403,64 @@ def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
// Cop2 Memory Instructions
// FIXME: These aren't really FPU instructions and as such don't belong in this
// file
-def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
-def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
-def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, ISA_MIPS2;
-def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, ISA_MIPS2;
+def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>,
+ ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>,
+ ISA_MIPS2_NOT_32R6_64R6;
// Cop3 Memory Instructions
// FIXME: These aren't really FPU instructions and as such don't belong in this
// file
-def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
-def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
-def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, ISA_MIPS2;
-def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, ISA_MIPS2;
+let DecoderNamespace = "COP3_" in {
+ def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
+ def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
+ def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>,
+ ISA_MIPS2;
+ def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
+ ISA_MIPS2;
+}
// Indexed loads and stores.
// Base register + offset register addressing mode (indicated by "x" in the
// instruction mnemonic) is disallowed under NaCl.
let AdditionalPredicates = [IsNotNaCl] in {
def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
- INSN_MIPS4_32R2;
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
- INSN_MIPS4_32R2;
+ INSN_MIPS4_32R2_NOT_32R6_64R6;
}
let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
- INSN_MIPS4_32R2, FGR_32;
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
- INSN_MIPS4_32R2, FGR_32;
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
}
let DecoderNamespace="Mips64" in {
def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
- INSN_MIPS4_32R2, FGR_64;
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
- INSN_MIPS4_32R2, FGR_64;
+ INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
}
// Load/store doubleword indexed unaligned.
let AdditionalPredicates = [IsNotNaCl] in {
def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
- INSN_MIPS5_32R2, FGR_32;
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
- INSN_MIPS5_32R2, FGR_32;
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
}
let DecoderNamespace="Mips64" in {
def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
- INSN_MIPS5_32R2, FGR_64;
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
- INSN_MIPS5_32R2, FGR_64;
+ INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
}
/// Floating-point Arithmetic
@@ -457,42 +478,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM<4, 0>, ISA_MIPS32R2;
+ MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM<5, 0>, ISA_MIPS32R2;
+ MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
let AdditionalPredicates = [NoNaNsFPMath] in {
def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
- MADDS_FM<6, 0>, ISA_MIPS32R2;
+ MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
- MADDS_FM<7, 0>, ISA_MIPS32R2;
+ MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
}
def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_32;
+ MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_32;
+ MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
let AdditionalPredicates = [NoNaNsFPMath] in {
def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_32;
+ MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_32;
+ MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
}
let isCodeGenOnly=1 in {
def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_64;
+ MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_64;
+ MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
}
let AdditionalPredicates = [NoNaNsFPMath],
isCodeGenOnly=1 in {
def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
- MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_64;
+ MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
- MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_64;
+ MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
}
//===----------------------------------------------------------------------===//
@@ -504,9 +525,9 @@ def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>,
- BC1F_FM<0, 0>;
+ BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>,
- BC1F_FM<0, 1>;
+ BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
//===----------------------------------------------------------------------===//
// Floating Point Flag Conditions
@@ -531,12 +552,13 @@ def MIPS_FCOND_LE : PatLeaf<(i32 14)>;
def MIPS_FCOND_NGT : PatLeaf<(i32 15)>;
/// Floating Point Compare
-def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>;
+def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
+ ISA_MIPS1_NOT_32R6_64R6;
def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
- AdditionalRequires<[NotFP64bit]>;
+ ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[NotFP64bit]>;
let DecoderNamespace = "Mips64" in
def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
- AdditionalRequires<[IsFP64bit]>;
+ ISA_MIPS1_NOT_32R6_64R6, AdditionalRequires<[IsFP64bit]>;
//===----------------------------------------------------------------------===//
// Floating Point Pseudo-Instructions
@@ -569,8 +591,10 @@ def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
//===----------------------------------------------------------------------===//
// InstAliases.
//===----------------------------------------------------------------------===//
-def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>;
-def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>;
+def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>,
+ ISA_MIPS1_NOT_32R6_64R6;
//===----------------------------------------------------------------------===//
// Floating Point Patterns
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 0377eab..6a01ae5 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -844,6 +844,44 @@ class BARRIER_FM<bits<5> op> : StdArch {
let Inst{5-0} = 0; // SLL
}
+class SDBBP_FM : StdArch {
+ bits<20> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0b011100; // SPECIAL2
+ let Inst{25-6} = code_;
+ let Inst{5-0} = 0b111111; // SDBBP
+}
+
+class JR_HB_FM<bits<6> op> : StdArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = rs;
+ let Inst{20-11} = 0;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = op;
+}
+
+class JALR_HB_FM<bits<6> op> : StdArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0; // SPECIAL
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10} = 1;
+ let Inst{9-6} = 0;
+ let Inst{5-0} = op;
+}
+
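A worked encoding example for the hazard-barrier format above; the helper is
purely illustrative:

  #include <cstdint>

  // "jr.hb $t9" under JR_HB_FM<8>: SPECIAL opcode, rs = 25 ($t9), the
  // hazard-barrier bit at Inst{10}, and function field 8 (JR).
  constexpr uint32_t encodeJRHB(uint32_t Rs) {
    return (0u << 26) | (Rs << 21) | (1u << 10) | 8u;
  }
  static_assert(encodeJRHB(25) == 0x03200408, "jr.hb $t9");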
class COP0_TLB_FM<bits<6> op> : StdArch {
bits<32> Inst;
@@ -852,3 +890,17 @@ class COP0_TLB_FM<bits<6> op> : StdArch {
let Inst{24-6} = 0;
let Inst{5-0} = op; // Operation
}
+
+class CACHEOP_FM<bits<6> op> : StdArch {
+ bits<21> addr;
+ bits<5> hint;
+ bits<5> base = addr{20-16};
+ bits<16> offset = addr{15-0};
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = base;
+ let Inst{20-16} = hint;
+ let Inst{15-0} = offset;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 0d3cb75..8e9472c 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -146,61 +146,61 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
//===----------------------------------------------------------------------===//
// Mips Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def HasMips2 : Predicate<"Subtarget.hasMips2()">,
+def HasMips2 : Predicate<"Subtarget->hasMips2()">,
AssemblerPredicate<"FeatureMips2">;
-def HasMips3_32 : Predicate<"Subtarget.hasMips3_32()">,
+def HasMips3_32 : Predicate<"Subtarget->hasMips3_32()">,
AssemblerPredicate<"FeatureMips3_32">;
-def HasMips3_32r2 : Predicate<"Subtarget.hasMips3_32r2()">,
+def HasMips3_32r2 : Predicate<"Subtarget->hasMips3_32r2()">,
AssemblerPredicate<"FeatureMips3_32r2">;
-def HasMips3 : Predicate<"Subtarget.hasMips3()">,
+def HasMips3 : Predicate<"Subtarget->hasMips3()">,
AssemblerPredicate<"FeatureMips3">;
-def HasMips4_32 : Predicate<"Subtarget.hasMips4_32()">,
+def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">,
AssemblerPredicate<"FeatureMips4_32">;
-def HasMips4_32r2 : Predicate<"Subtarget.hasMips4_32r2()">,
+def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">,
AssemblerPredicate<"FeatureMips4_32r2">;
-def HasMips5_32r2 : Predicate<"Subtarget.hasMips5_32r2()">,
+def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">,
AssemblerPredicate<"FeatureMips5_32r2">;
-def HasMips32 : Predicate<"Subtarget.hasMips32()">,
+def HasMips32 : Predicate<"Subtarget->hasMips32()">,
AssemblerPredicate<"FeatureMips32">;
-def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">,
+def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">,
AssemblerPredicate<"FeatureMips32r2">;
-def HasMips32r6 : Predicate<"Subtarget.hasMips32r6()">,
+def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">,
AssemblerPredicate<"FeatureMips32r6">;
-def NotMips32r6 : Predicate<"!Subtarget.hasMips32r6()">,
+def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">,
AssemblerPredicate<"!FeatureMips32r6">;
-def IsGP64bit : Predicate<"Subtarget.isGP64bit()">,
+def IsGP64bit : Predicate<"Subtarget->isGP64bit()">,
AssemblerPredicate<"FeatureGP64Bit">;
-def IsGP32bit : Predicate<"!Subtarget.isGP64bit()">,
+def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">,
AssemblerPredicate<"!FeatureGP64Bit">;
-def HasMips64 : Predicate<"Subtarget.hasMips64()">,
+def HasMips64 : Predicate<"Subtarget->hasMips64()">,
AssemblerPredicate<"FeatureMips64">;
-def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">,
+def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
AssemblerPredicate<"FeatureMips64r2">;
-def HasMips64r6 : Predicate<"Subtarget.hasMips64r6()">,
+def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
AssemblerPredicate<"FeatureMips64r6">;
-def NotMips64r6 : Predicate<"!Subtarget.hasMips64r6()">,
+def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
AssemblerPredicate<"!FeatureMips64r6">;
-def IsN64 : Predicate<"Subtarget.isABI_N64()">,
+def IsN64 : Predicate<"Subtarget->isABI_N64()">,
AssemblerPredicate<"FeatureN64">;
-def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">,
+def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
AssemblerPredicate<"FeatureMips16">;
-def HasCnMips : Predicate<"Subtarget.hasCnMips()">,
+def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">,
AssemblerPredicate<"FeatureMips32">;
def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
AssemblerPredicate<"FeatureMips32">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
-def HasStdEnc : Predicate<"Subtarget.hasStandardEncoding()">,
+def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
AssemblerPredicate<"!FeatureMips16">;
-def NotDSP : Predicate<"!Subtarget.hasDSP()">;
-def InMicroMips : Predicate<"Subtarget.inMicroMipsMode()">,
+def NotDSP : Predicate<"!Subtarget->hasDSP()">;
+def InMicroMips : Predicate<"Subtarget->inMicroMipsMode()">,
AssemblerPredicate<"FeatureMicroMips">;
-def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">,
+def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">,
AssemblerPredicate<"!FeatureMicroMips">;
-def IsLE : Predicate<"Subtarget.isLittle()">;
-def IsBE : Predicate<"!Subtarget.isLittle()">;
-def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">;
+def IsLE : Predicate<"Subtarget->isLittle()">;
+def IsBE : Predicate<"!Subtarget->isLittle()">;
+def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
//===----------------------------------------------------------------------===//
// Mips GPR size adjectives.
@@ -232,8 +232,17 @@ class ISA_MIPS3_NOT_32R6_64R6 {
list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
}
class ISA_MIPS32 { list<Predicate> InsnPredicates = [HasMips32]; }
+class ISA_MIPS32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+}
class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
+class ISA_MIPS32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+}
class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; }
+class ISA_MIPS64_NOT_64R6 {
+ list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+}
class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
@@ -241,17 +250,32 @@ class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
// The portions of MIPS-III that were also added to MIPS32
class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+// The portions of MIPS-III that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS3_32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+}
+
// The portions of MIPS-III that were also added to MIPS32r2
class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
-// The portions of MIPS-IV that were also added to MIPS32
-class INSN_MIPS4_32 { list<Predicate> InsnPredicates = [HasMips4_32]; }
+// The portions of MIPS-IV that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+}
-// The portions of MIPS-IV that were also added to MIPS32R2
-class INSN_MIPS4_32R2 { list<Predicate> InsnPredicates = [HasMips4_32r2]; }
+// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+}
-// The portions of MIPS-V that were also added to MIPS32R2
-class INSN_MIPS5_32R2 { list<Predicate> InsnPredicates = [HasMips5_32r2]; }
+// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS5_32R2_NOT_32R6_64R6 {
+ list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+}
//===----------------------------------------------------------------------===//
@@ -328,7 +352,9 @@ def calltarget : Operand<iPTR> {
let ParserMatchClass = MipsJumpTargetAsmOperand;
}
+def simm9 : Operand<i32>;
def simm10 : Operand<i32>;
+def simm11 : Operand<i32>;
def simm16 : Operand<i32> {
let DecoderMethod= "DecodeSimm16";
@@ -337,6 +363,13 @@ def simm16 : Operand<i32> {
def simm19_lsl2 : Operand<i32> {
let EncoderMethod = "getSimm19Lsl2Encoding";
let DecoderMethod = "DecodeSimm19Lsl2";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm18_lsl3 : Operand<i32> {
+ let EncoderMethod = "getSimm18Lsl3Encoding";
+ let DecoderMethod = "DecodeSimm18Lsl3";
+ let ParserMatchClass = MipsJumpTargetAsmOperand;
}
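A quick worked check of the reach of the scaled operand above (the decode
helper is made up): 18 signed bits scaled by 8 (i.e. shifted left by 3)
cover roughly +/-1 MiB in 8-byte steps.

  #include <cstdint>

  constexpr int32_t decodeSimm18Lsl3(int32_t Imm18) { return Imm18 * 8; }
  static_assert(decodeSimm18Lsl3(131071) == 1048568, "max forward reach");
  static_assert(decodeSimm18Lsl3(-131072) == -1048576, "max backward reach");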
def simm20 : Operand<i32> {
@@ -386,6 +419,15 @@ def MipsMemAsmOperand : AsmOperandClass {
let ParserMethod = "parseMemOperand";
}
+def MipsMemSimm11AsmOperand : AsmOperandClass {
+ let Name = "MemOffsetSimm11";
+ let SuperClasses = [MipsMemAsmOperand];
+ let RenderMethod = "addMemOperands";
+ let ParserMethod = "parseMemOperand";
+ let PredicateMethod = "isMemWithSimmOffset<11>";
+ //let DiagnosticType = "Simm11";
+}
+
def MipsInvertedImmoperand : AsmOperandClass {
let Name = "InvNum";
let RenderMethod = "addImmOperands";
@@ -417,6 +459,17 @@ def mem_msa : mem_generic {
let EncoderMethod = "getMSAMemEncoding";
}
+def mem_simm9 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm9);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def mem_simm11 : mem_generic {
+ let MIOperandInfo = (ops ptr_rc, simm11);
+ let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemSimm11AsmOperand;
+}
+
def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops ptr_rc, simm16);
@@ -690,20 +743,11 @@ class JumpFR<string opstr, RegisterOperand RO,
FrmR, opstr>;
// Indirect branch
-class IndirectBranch<string opstr, RegisterOperand RO> :
- JumpFR<opstr, RO, brind> {
+class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
let isBranch = 1;
let isIndirectBranch = 1;
}
-// Return instruction
-class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> {
- let isReturn = 1;
- let isCodeGenOnly = 1;
- let hasCtrlDep = 1;
- let hasExtraSrcRegAllocReq = 1;
-}
-
// Jump and Link (Call)
let isCall=1, hasDelaySlot=1, Defs = [RA] in {
class JumpLink<string opstr, DAGOperand opnd> :
@@ -1042,7 +1086,7 @@ def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
ADD_FM<0, 0x23>;
let Defs = [HI0, LO0] in
def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
- ADD_FM<0x1c, 2>, ISA_MIPS32;
+ ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
@@ -1103,7 +1147,7 @@ def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
ISA_MIPS1_NOT_32R6_64R6;
}
-def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM;
+def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
@@ -1127,6 +1171,7 @@ def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>,
def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>;
def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
def TRAP : TrapBase<BREAK>;
+def SDBBP : SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
@@ -1139,8 +1184,8 @@ let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably
def WAIT : WAIT_FT<"wait">, WAIT_FM;
/// Load-linked, Store-conditional
-def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2;
-def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2;
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2_NOT_32R6_64R6;
}
/// Jump and Branch Instructions
@@ -1161,17 +1206,49 @@ def B : UncondBranch<BEQ>;
def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
let AdditionalPredicates = [NotInMicroMips] in {
-def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
-def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+ def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+ def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
}
-def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>;
-def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
-def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
+
+// FIXME: JALX really requires either MIPS16 or microMIPS in addition to MIPS32.
+def JALX : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
def TAILCALL : TailCall<J>;
def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
-def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
+// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
+// and then expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
+class PseudoIndirectBranchBase<RegisterOperand RO> :
+ MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> {
+ let isTerminator=1;
+ let isBarrier=1;
+ let hasDelaySlot = 1;
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+}
+
+def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+
+// Return instructions are matched as a RetRA instruction, then are expanded
+// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
+// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
+// ISA.
+class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
+ [], IIBranch> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let hasDelaySlot = 1;
+ let isReturn = 1;
+ let isCodeGenOnly = 1;
+ let hasCtrlDep = 1;
+ let hasExtraSrcRegAllocReq = 1;
+}
+
+def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
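A hedged sketch of the final MipsAsmPrinter step described above; the opcode
choice follows the comment, while the helper and operand details are
assumptions:

  #include "llvm/MC/MCInst.h"

  static MCInst lowerPseudoReturn(unsigned RetReg, bool IsR6) {
    MCInst Inst;
    if (IsR6) {
      Inst.setOpcode(Mips::JALR);  // jalr $zero, $rs on MIPS32r6/MIPS64r6
      Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
      Inst.addOperand(MCOperand::CreateReg(RetReg));
    } else {
      Inst.setOpcode(Mips::JR);    // jr $rs elsewhere
      Inst.addOperand(MCOperand::CreateReg(RetReg));
    }
    return Inst;
  }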
// Exception handling related node and instructions.
// The conversion sequence is:
@@ -1196,20 +1273,24 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
/// Multiply and Divide Instructions.
def MULT : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x18>;
+ MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x19>;
+ MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
def SDIV : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x1a>;
+ MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
- MULT_FM<0, 0x1b>;
+ MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
-def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
-def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+ ISA_MIPS1_NOT_32R6_64R6;
let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
AdditionalPredicates = [NotInMicroMips] in {
-def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
-def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+ ISA_MIPS1_NOT_32R6_64R6;
}
/// Sign Ext In Register Instructions.
@@ -1219,8 +1300,10 @@ def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
/// Count Leading
-def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, ISA_MIPS32;
-def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32;
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>,
+ ISA_MIPS32_NOT_32R6_64R6;
/// Word Swap Bytes Within Halfwords
def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2;
@@ -1235,27 +1318,37 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
// MADD*/MSUB*
-def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, ISA_MIPS32;
-def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, ISA_MIPS32;
-def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, ISA_MIPS32;
-def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, ISA_MIPS32;
+def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+ ISA_MIPS32_NOT_32R6_64R6;
let AdditionalPredicates = [NotDSP] in {
-def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>;
-def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>;
-def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
-def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>;
-def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>;
-def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>;
-def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>;
-def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>;
-def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>;
+def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd, II_MADD>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>,
+ ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>,
+ ISA_MIPS32_NOT_32R6_64R6;
}
def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
- 0, 1, 1>;
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
- 0, 1, 1>;
+ 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
@@ -1274,6 +1367,46 @@ def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>;
def EHB : Barrier<"ehb">, BARRIER_FM<3>;
def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+// JR_HB and JALR_HB are defined here using the new-style naming scheme
+// because some of this code is shared with Mips32r6InstrInfo.td. To avoid
+// mixing old and new styles within one definition, the new style was
+// chosen even though the rest of this file uses the old one.
+class JR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rs");
+ list<dag> Pattern = [];
+}
+
+class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+ dag OutOperandList = (outs GPROpnd:$rd);
+ dag InOperandList = (ins GPROpnd:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ list<dag> Pattern = [];
+}
+
+class JR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+ JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+ let isBranch=1;
+ let isIndirectBranch=1;
+ let hasDelaySlot=1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+
+class JALR_HB_DESC : InstSE<(outs), (ins), "", [], NoItinerary, FrmJ>,
+ JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
+ let isIndirectBranch=1;
+ let hasDelaySlot=1;
+}
+
+class JR_HB_ENC : JR_HB_FM<8>;
+class JALR_HB_ENC : JALR_HB_FM<9>;
+
+def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
+def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
+
class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
FrmOther>;
def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>;
@@ -1281,6 +1414,15 @@ def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>;
def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>;
def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>;
+class CacheOp<string instr_asm, Operand MemOpnd, RegisterOperand GPROpnd> :
+ InstSE<(outs), (ins MemOpnd:$addr, uimm5:$hint),
+ !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther>;
+
+def CACHE : CacheOp<"cache", mem, GPR32Opnd>, CACHEOP_FM<0b101111>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+def PREF : CacheOp<"pref", mem, GPR32Opnd>, CACHEOP_FM<0b110011>,
+ INSN_MIPS3_32_NOT_32R6_64R6;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
@@ -1289,19 +1431,23 @@ def : MipsInstAlias<"move $dst, $src",
GPR_32 {
let AdditionalPredicates = [NotInMicroMips];
}
-def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
def : MipsInstAlias<"addu $rs, $rt, $imm",
(ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
def : MipsInstAlias<"add $rs, $rt, $imm",
(ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
def : MipsInstAlias<"and $rs, $rt, $imm",
(ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"and $rs, $imm",
+ (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
let Predicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
}
def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
def : MipsInstAlias<"not $rt, $rs",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<"neg $rt, $rs",
@@ -1318,6 +1464,8 @@ def : MipsInstAlias<"xor $rs, $rt, $imm",
(XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
def : MipsInstAlias<"or $rs, $rt, $imm",
(ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"or $rs, $imm",
+ (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
@@ -1360,6 +1508,9 @@ def : MipsInstAlias<"sra $rd, $rt, $rs",
(SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"srl $rd, $rt, $rs",
(SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
+def : MipsInstAlias<"sync", (SYNC 0), 1>, ISA_MIPS2;
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1412,6 +1563,10 @@ let AdditionalPredicates = [NotDSP] in {
(ADDiu GPR32:$src, imm:$imm)>;
}
+// SYNC
+def : MipsPat<(MipsSync (i32 immz)),
+ (SYNC 0)>, ISA_MIPS2;
+
// Call
def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
(JAL tglobaladdr:$dst)>;
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index acfe76e..c6838a3 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -15,6 +15,7 @@
#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -64,7 +65,8 @@ namespace {
: MachineFunctionPass(ID), TM(tm),
IsPIC(TM.getRelocationModel() == Reloc::PIC_),
ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
- LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 : 9)) {}
+ LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 :
+ (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl() ? 9 : 10))) {}
const char *getPassName() const override {
return "Mips Long Branch";
@@ -264,6 +266,13 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
LongBrMBB->addSuccessor(BalTgtMBB);
BalTgtMBB->addSuccessor(TgtMBB);
+ // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
+  // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is a
+  // pseudo-instruction wrapping BGEZAL).
+
+ const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+ unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
+
if (ABI != MipsSubtarget::N64) {
// $longbr:
// addiu $sp, $sp, -8
@@ -305,9 +314,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
.addMBB(TgtMBB).addMBB(BalTgtMBB);
MIBundleBuilder(*LongBrMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
- .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
- .addReg(Mips::AT).addMBB(TgtMBB).addMBB(BalTgtMBB));
+ .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+ .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+ .addReg(Mips::AT)
+ .addMBB(TgtMBB)
+ .addMBB(BalTgtMBB));
Pos = BalTgtMBB->begin();
@@ -316,10 +327,23 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
.addReg(Mips::SP).addImm(0);
- MIBundleBuilder(*BalTgtMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
- .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
- .addReg(Mips::SP).addImm(8));
+ if (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+ MIBundleBuilder(*BalTgtMBB, Pos)
+ .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+ .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8));
+ } else {
+      // In NaCl, modifying the sp is not allowed in a branch delay slot.
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8);
+
+ MIBundleBuilder(*BalTgtMBB, Pos)
+ .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+ .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+ // Bundle-align the target of indirect branch JR.
+ TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ }
} else {
// $longbr:
// daddiu $sp, $sp, -16
@@ -364,11 +388,12 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
.addReg(Mips::AT_64).addImm(16);
MIBundleBuilder(*LongBrMBB, Pos)
- .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
- .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
- Mips::AT_64).addReg(Mips::AT_64)
- .addMBB(TgtMBB, MipsII::MO_ABS_LO)
- .addMBB(BalTgtMBB));
+ .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+ .append(
+ BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64)
+ .addMBB(TgtMBB, MipsII::MO_ABS_LO)
+ .addMBB(BalTgtMBB));
Pos = BalTgtMBB->begin();
@@ -450,9 +475,18 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
continue;
int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+ int64_t Offset = computeOffset(I->Br) / ShVal;
+
+ if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) {
+ // The offset calculation does not include sandboxing instructions
+ // that will be added later in the MC layer. Since at this point we
+ // don't know the exact amount of code that "sandboxing" will add, we
+ // conservatively estimate that code will not grow more than 100%.
+ Offset *= 2;
+ }
// Check if offset fits into 16-bit immediate field of branches.
- if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal))
+ if (!ForceLongBranch && isInt<16>(Offset))
continue;
I->HasLongBranch = true;
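A standalone worked check of the estimate above (the helper name is made up):
doubling the instruction-count offset effectively halves the distance that
still fits the 16-bit branch field.

  #include <cstdint>

  constexpr bool fitsShortBranch(int64_t Offset, bool IsNaCl) {
    // Model 100% code growth from sandboxing by doubling the offset.
    return (IsNaCl ? Offset * 2 : Offset) >= INT16_MIN &&
           (IsNaCl ? Offset * 2 : Offset) <= INT16_MAX;
  }
  static_assert(fitsShortBranch(20000, false), "fits without sandboxing");
  static_assert(!fitsShortBranch(20000, true), "40000 exceeds 32767");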
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
index 6bd0366..bff2d0f 100644
--- a/lib/Target/Mips/MipsMSAInstrFormats.td
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-def HasMSA : Predicate<"Subtarget.hasMSA()">,
+def HasMSA : Predicate<"Subtarget->hasMSA()">,
AssemblerPredicate<"FeatureMSA">;
class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index e9101cc..8c16f82 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -15,7 +15,6 @@
#define MIPS_MACHINE_FUNCTION_INFO_H
#include "Mips16HardFloatInfo.h"
-#include "MipsSubtarget.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 83d25ab..084449b 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -93,6 +93,9 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (Subtarget.isFP64bit())
return CSR_O32_FP64_SaveList;
+ if (Subtarget.isFPXX())
+ return CSR_O32_FPXX_SaveList;
+
return CSR_O32_SaveList;
}
@@ -110,6 +113,9 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
if (Subtarget.isFP64bit())
return CSR_O32_FP64_RegMask;
+ if (Subtarget.isFPXX())
+ return CSR_O32_FPXX_RegMask;
+
return CSR_O32_RegMask;
}
@@ -201,6 +207,11 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::GP_64);
}
+ if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) {
+ for (const auto &Reg : Mips::OddSPRegClass)
+ Reserved.set(Reg);
+ }
+
return Reserved;
}
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 875a596..6323da3 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -340,6 +340,12 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
+// Used to reserve odd registers when given -mattr=+nooddspreg
+def OddSP : RegisterClass<"Mips", [f32], 32,
+ (add (decimate (sequence "F%u", 1, 31), 2),
+ (decimate (sequence "F_HI%u", 1, 31), 2))>,
+ Unallocatable;
+
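For reference, the decimated sequences above select every odd-numbered
register; a throwaway sketch that enumerates the members:

  #include <cstdio>

  int main() {
    // Members of OddSP: F1, F3, ..., F31 and F_HI1, F_HI3, ..., F_HI31,
    // i.e. the registers -mattr=+nooddspreg must keep unallocatable.
    for (int R = 1; R < 32; R += 2)
      std::printf("F%d F_HI%d\n", R, R);
    return 0;
  }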
// FP control registers.
def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
Unallocatable;
@@ -348,6 +354,10 @@ def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
Unallocatable;
+// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers.
+// This class allows us to represent this in codegen patterns.
+def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>;
+
def MSA128B: RegisterClass<"Mips", [v16i8], 128,
(sequence "W%u", 0, 31)>;
def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
@@ -512,6 +522,12 @@ def FGR32Opnd : RegisterOperand<FGR32> {
let ParserMatchClass = FGR32AsmOperand;
}
+def FGRCCOpnd : RegisterOperand<FGRCC> {
+ // The assembler doesn't use register classes so we can re-use
+ // FGR32AsmOperand.
+ let ParserMatchClass = FGR32AsmOperand;
+}
+
def FGRH32Opnd : RegisterOperand<FGRH32> {
let ParserMatchClass = FGRH32AsmOperand;
}
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 6ad5821..6573070 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -16,6 +16,7 @@
#include "MipsAnalyzeImmediate.h"
#include "MipsMachineFunction.h"
#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -257,6 +258,9 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
return true;
}
+MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
+ : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const {
static const unsigned EhDataReg[] = {
Mips::A0, Mips::A1, Mips::A2, Mips::A3
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 5d2801f..e832848 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -20,8 +20,7 @@ namespace llvm {
class MipsSEFrameLowering : public MipsFrameLowering {
public:
- explicit MipsSEFrameLowering(const MipsSubtarget &STI)
- : MipsFrameLowering(STI, STI.stackAlignment()) {}
+ explicit MipsSEFrameLowering(const MipsSubtarget &STI);
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
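Moving this constructor out of line lets the header drop its MipsSubtarget.h include; a forward declaration suffices once no inline code needs the complete type. A small illustration of the idiom, with hypothetical names:

    // widget_frame.h: a forward declaration is enough here.
    class Subtarget;
    class WidgetFrameLowering {
      const Subtarget &STI; // references to incomplete types are fine
    public:
      explicit WidgetFrameLowering(const Subtarget &STI); // defined in the .cpp
    };

    // widget_frame.cpp: the complete type is visible where it is used.
    //   #include "Subtarget.h"
    //   WidgetFrameLowering::WidgetFrameLowering(const Subtarget &S) : STI(S) {}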
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index d5385be..6f35947 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
#define DEBUG_TYPE "mips-isel"
bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- if (Subtarget.inMips16Mode())
+ if (Subtarget->inMips16Mode())
return false;
return MipsDAGToDAGISel::runOnMachineFunction(MF);
}
@@ -134,7 +134,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
const TargetRegisterClass *RC;
- if (Subtarget.isABI_N64())
+ if (Subtarget->isABI_N64())
RC = (const TargetRegisterClass*)&Mips::GPR64RegClass;
else
RC = (const TargetRegisterClass*)&Mips::GPR32RegClass;
@@ -142,7 +142,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
V0 = RegInfo.createVirtualRegister(RC);
V1 = RegInfo.createVirtualRegister(RC);
- if (Subtarget.isABI_N64()) {
+ if (Subtarget->isABI_N64()) {
MF.getRegInfo().addLiveIn(Mips::T9_64);
MBB.addLiveIn(Mips::T9_64);
@@ -174,7 +174,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
MF.getRegInfo().addLiveIn(Mips::T9);
MBB.addLiveIn(Mips::T9);
- if (Subtarget.isABI_N32()) {
+ if (Subtarget->isABI_N32()) {
// lui $v0, %hi(%neg(%gp_rel(fname)))
// addu $v1, $v0, $t9
// addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
@@ -187,7 +187,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
return;
}
- assert(Subtarget.isABI_O32());
+ assert(Subtarget->isABI_O32());
// For O32 ABI, the following instruction sequence is emitted to initialize
// the global base register:
@@ -408,7 +408,7 @@ bool MipsSEDAGToDAGISel::selectIntAddrMSA(SDValue Addr, SDValue &Base,
// * MSA is enabled
// * N is a ISD::BUILD_VECTOR representing a constant splat
bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
- if (!Subtarget.hasMSA())
+ if (!Subtarget->hasMSA())
return false;
BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
@@ -422,7 +422,7 @@ bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, 8,
- !Subtarget.isLittle()))
+ !Subtarget->isLittle()))
return false;
Imm = SplatValue;
@@ -648,7 +648,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
}
case ISD::ADDE: {
- if (Subtarget.hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+ if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
break;
SDValue InFlag = Node->getOperand(2);
Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
@@ -658,11 +658,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
case ISD::ConstantFP: {
ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
- if (Subtarget.isGP64bit()) {
+ if (Subtarget->isGP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO_64, MVT::i64);
Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
- } else if (Subtarget.isFP64bit()) {
+ } else if (Subtarget->isFP64bit()) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO, MVT::i32);
Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
@@ -813,12 +813,12 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
EVT ResVecTy = BVN->getValueType(0);
EVT ViaVecTy;
- if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
+ if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
return std::make_pair(false, nullptr);
if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, 8,
- !Subtarget.isLittle()))
+ !Subtarget->isLittle()))
return std::make_pair(false, nullptr);
switch (SplatBitSize) {
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 969d730..be4ca86 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -39,7 +39,7 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
// Set up the register classes
addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
- if (isGP64bit())
+ if (Subtarget->isGP64bit())
addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
@@ -120,10 +120,10 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
if (Subtarget->hasCnMips())
setOperationAction(ISD::MUL, MVT::i64, Legal);
- else if (isGP64bit())
+ else if (Subtarget->isGP64bit())
setOperationAction(ISD::MUL, MVT::i64, Custom);
- if (isGP64bit()) {
+ if (Subtarget->isGP64bit()) {
setOperationAction(ISD::MULHS, MVT::i64, Custom);
setOperationAction(ISD::MULHU, MVT::i64, Custom);
}
@@ -152,6 +152,76 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::STORE, MVT::f64, Custom);
}
+ if (Subtarget->hasMips32r6()) {
+ // MIPS32r6 replaces the accumulator-based multiplies with three-register
+ // instructions.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::MUL, MVT::i32, Legal);
+ setOperationAction(ISD::MULHS, MVT::i32, Legal);
+ setOperationAction(ISD::MULHU, MVT::i32, Legal);
+
+ // MIPS32r6 replaces the accumulator-based division/remainder with separate
+ // three-register division and remainder instructions.
+ setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i32, Legal);
+ setOperationAction(ISD::UDIV, MVT::i32, Legal);
+ setOperationAction(ISD::SREM, MVT::i32, Legal);
+ setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+ // MIPS32r6 replaces conditional moves with an equivalent that removes the
+ // need for three GPR read ports.
+ setOperationAction(ISD::SETCC, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+
+ setOperationAction(ISD::SETCC, MVT::f32, Legal);
+ setOperationAction(ISD::SELECT, MVT::f32, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
+ assert(Subtarget->isFP64bit() && "FR=1 is required for MIPS32r6");
+ setOperationAction(ISD::SETCC, MVT::f64, Legal);
+ setOperationAction(ISD::SELECT, MVT::f64, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+ setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+ // Floating point > and >= are supported via < and <=
+ setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+
+ setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+ }
+
+ if (Subtarget->hasMips64r6()) {
+ // MIPS64r6 replaces the accumulator-based multiplies with three-register
+ // instructions.
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
+ setOperationAction(ISD::MULHS, MVT::i64, Legal);
+ setOperationAction(ISD::MULHU, MVT::i64, Legal);
+
+ // MIPS64r6 replaces the accumulator-based division/remainder with separate
+ // three-register division and remainder instructions.
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Legal);
+ setOperationAction(ISD::UDIV, MVT::i64, Legal);
+ setOperationAction(ISD::SREM, MVT::i64, Legal);
+ setOperationAction(ISD::UREM, MVT::i64, Legal);
+
+ // MIPS64r6 replaces conditional moves with an equivalent that removes the
+ // need for three GPR read ports.
+ setOperationAction(ISD::SETCC, MVT::i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::i64, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ }
+
computeRegisterProperties();
}
@@ -160,6 +230,14 @@ llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
return new MipsSETargetLowering(TM);
}
+const TargetRegisterClass *
+MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
+ if (VT == MVT::Untyped)
+ return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
+
+ return TargetLowering::getRepRegClassFor(VT);
+}
+
// Enable MSA support for the given integer type and Register class.
void MipsSETargetLowering::
addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
@@ -449,8 +527,8 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize())
return SDValue();
- if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
- selectMADD(N, &DAG))
+ if (Subtarget->hasMips32() && !Subtarget->hasMips32r6() &&
+ N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
return SDValue(N, 0);
return SDValue();
@@ -1178,6 +1256,9 @@ SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
bool HasLo, bool HasHi,
SelectionDAG &DAG) const {
+ // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
+ assert(!Subtarget->hasMips32r6());
+
EVT Ty = Op.getOperand(0).getValueType();
SDLoc DL(Op);
SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
@@ -1651,7 +1732,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_copy_s_w:
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
case Intrinsic::mips_copy_s_d:
- if (hasMips64())
+ if (Subtarget->hasMips64())
// Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
else {
@@ -1666,7 +1747,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_copy_u_w:
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
case Intrinsic::mips_copy_u_d:
- if (hasMips64())
+ if (Subtarget->hasMips64())
// Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
else {
@@ -2943,8 +3024,8 @@ MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
unsigned SrcValReg = MI->getOperand(3).getReg();
const TargetRegisterClass *VecRC = nullptr;
- const TargetRegisterClass *GPRRC = isGP64bit() ? &Mips::GPR64RegClass
- : &Mips::GPR32RegClass;
+ const TargetRegisterClass *GPRRC =
+ Subtarget->isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
unsigned EltLog2Size;
unsigned InsertOp = 0;
unsigned InsveOp = 0;
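For reference, a hedged sketch of how the SelectionDAG legalizer reads the actions set in the hunks above (standard legalizer behaviour, not code from this patch):

    setOperationAction(ISD::MUL, MVT::i32, Legal);        // select as-is (r6 has MUL)
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);   // split into SDIV + SREM
    setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); // rewritten as SETCC + SELECT
    setOperationAction(ISD::MULHS, MVT::i64, Custom);     // routed to LowerOperation()
    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);     // legalized as SETOLT with the
                                                          // operands swapped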
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index 03a20ef..13ef6fc 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MipsSEISELLOWERING_H
-#define MipsSEISELLOWERING_H
+#ifndef MIPSSEISELLOWERING_H
+#define MIPSSEISELLOWERING_H
#include "MipsISelLowering.h"
#include "MipsRegisterInfo.h"
@@ -46,13 +46,7 @@ namespace llvm {
return false;
}
- const TargetRegisterClass *getRepRegClassFor(MVT VT) const override {
- if (VT == MVT::Untyped)
- return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass :
- &Mips::ACC64RegClass;
-
- return TargetLowering::getRepRegClassFor(VT);
- }
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
private:
bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index f6f364f..32da749 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -272,7 +272,7 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
default:
return false;
case Mips::RetRA:
- expandRetRA(MBB, MI, Mips::RET);
+ expandRetRA(MBB, MI);
break;
case Mips::PseudoMFHI:
Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
@@ -428,9 +428,14 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
}
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned Opc) const {
- BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA);
+ MachineBasicBlock::iterator I) const {
+ const auto &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+ if (Subtarget.isGP64bit())
+ BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+ .addReg(Mips::RA_64);
+ else
+ BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
}
std::pair<bool, bool>
@@ -542,20 +547,31 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
DebugLoc dl = I->getDebugLoc();
const TargetRegisterInfo &TRI = getRegisterInfo();
+ bool HasMTHC1 = TM.getSubtarget<MipsSubtarget>().hasMips32r2() ||
+ TM.getSubtarget<MipsSubtarget>().hasMips32r6();
- // For FP32 mode:
- // mtc1 Lo, $fp
- // mtc1 Hi, $fp + 1
- // For FP64 mode:
+ // When mthc1 is available, use:
// mtc1 Lo, $fp
// mthc1 Hi, $fp
+ //
+ // Otherwise, for FP64:
+ // spill + reload via ldc1
+ // This has not been implemented since FP64 on MIPS32 and earlier is not
+ // supported.
+ //
+ // Otherwise, for FP32:
+ // mtc1 Lo, $fp
+ // mtc1 Hi, $fp + 1
BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
.addReg(LoReg);
- if (FP64) {
- // FIXME: The .addReg(DstReg, RegState::Implicit) is a white lie used to
- // temporarily work around a widespread bug in the -mfp64 support.
+ if (HasMTHC1 || FP64) {
+ assert(TM.getSubtarget<MipsSubtarget>().hasMips32r2() &&
+ "MTHC1 requires MIPS32r2");
+
+ // FIXME: The .addReg(DstReg) is a white lie used to temporarily work
+ // around a widespread bug in the -mfp64 support.
// The problem is that none of the 32-bit fpu ops mention the fact
// that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
// requires a major overhaul of the FPU implementation which can't
@@ -565,9 +581,9 @@ void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
// We therefore pretend that it reads the bottom 32-bits to
// artificially create a dependency and prevent the scheduler
// changing the behaviour of the code.
- BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi))
- .addReg(HiReg)
- .addReg(DstReg, RegState::Implicit);
+ BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
+ .addReg(DstReg)
+ .addReg(HiReg);
} else
BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
.addReg(HiReg);
@@ -580,17 +596,16 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
// indirect jump to TargetReg
const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned JR = STI.isABI_N64() ? Mips::JR64 : Mips::JR;
- unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
- unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA;
- unsigned T9 = STI.isABI_N64() ? Mips::T9_64 : Mips::T9;
- unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ unsigned SP = STI.isGP64bit() ? Mips::SP_64 : Mips::SP;
+ unsigned RA = STI.isGP64bit() ? Mips::RA_64 : Mips::RA;
+ unsigned T9 = STI.isGP64bit() ? Mips::T9_64 : Mips::T9;
+ unsigned ZERO = STI.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
unsigned OffsetReg = I->getOperand(0).getReg();
unsigned TargetReg = I->getOperand(1).getReg();
// addu $ra, $v0, $zero
// addu $sp, $sp, $v1
- // jr $ra
+ // jr $ra (via RetRA)
if (TM.getRelocationModel() == Reloc::PIC_)
BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9)
.addReg(TargetReg).addReg(ZERO);
@@ -598,7 +613,7 @@ void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
.addReg(TargetReg).addReg(ZERO);
BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP)
.addReg(SP).addReg(OffsetReg);
- BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(JR)).addReg(RA);
+ expandRetRA(MBB, I);
}
const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) {
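The MTHC1 expansion above lists DstReg as both a def and a use because the instruction only overwrites the upper 32 bits of the 64-bit FPR. A sketch of modelling such a read-modify-write with MachineInstrBuilder (the opcode and register names are placeholders):

    // Adding the destination as a use tells the scheduler the lower half
    // must already hold a value, creating the dependency described in the
    // FIXME comment above.
    BuildMI(MBB, I, DL, TII->get(MyTarget::WriteHiOp), DstReg) // def: DstReg
        .addReg(DstReg)  // use: preserved lower half
        .addReg(HiReg);  // use: new upper half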
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index aa68552..9ac94ce 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -81,8 +81,7 @@ public:
private:
unsigned getAnalyzableBrOpc(unsigned Opc) const override;
- void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned Opc) const;
+ void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
std::pair<bool, bool> compareOpndSize(unsigned Opc,
const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp
index 0d4398e..edd8f67 100644
--- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp
+++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp
@@ -16,9 +16,8 @@ using namespace llvm;
#define DEBUG_TYPE "mips-selectiondag-info"
-MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+MipsSelectionDAGInfo::MipsSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
MipsSelectionDAGInfo::~MipsSelectionDAGInfo() {
}
diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h
index 6cafb55..2b3d527 100644
--- a/lib/Target/Mips/MipsSelectionDAGInfo.h
+++ b/lib/Target/Mips/MipsSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class MipsTargetMachine;
class MipsSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit MipsSelectionDAGInfo(const MipsTargetMachine &TM);
+ explicit MipsSelectionDAGInfo(const DataLayout &DL);
~MipsSelectionDAGInfo();
};
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 74ec064..693daa3 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -60,11 +60,9 @@ Mips16ConstantIslands(
/// Select the Mips CPU for the given triple and cpu name.
/// FIXME: Merge with the copy in MipsMCTargetDesc.cpp
-static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
+static StringRef selectMipsCPU(Triple TT, StringRef CPU) {
if (CPU.empty() || CPU == "generic") {
- Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::mips ||
- TheTriple.getArch() == Triple::mipsel)
+ if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
CPU = "mips32";
else
CPU = "mips64";
@@ -74,39 +72,56 @@ static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
void MipsSubtarget::anchor() { }
+static std::string computeDataLayout(const MipsSubtarget &ST) {
+ std::string Ret = "";
+
+ // There are both little and big endian mips.
+ if (ST.isLittle())
+ Ret += "e";
+ else
+ Ret += "E";
+
+ Ret += "-m:m";
+
+ // Pointers are 32 bit on some ABIs.
+ if (!ST.isABI_N64())
+ Ret += "-p:32:32";
+
+ // 8 and 16 bit integers only need to have natural alignment, but try to
+ // align them to 32 bits. 64 bit integers have natural alignment.
+ Ret += "-i8:8:32-i16:16:32-i64:64";
+
+ // 32 bit registers are always available and the stack is at least 64 bit
+ // aligned. On N64 64 bit registers are also available and the stack is
+ // 128 bit aligned.
+ if (ST.isABI_N64() || ST.isABI_N32())
+ Ret += "-n32:64-S128";
+ else
+ Ret += "-n32-S64";
+
+ return Ret;
+}
+
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool little,
Reloc::Model _RM, MipsTargetMachine *_TM)
: MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32),
MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false),
- IsFP64bit(false), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false),
- HasCnMips(false), IsLinux(true), HasMips3_32(false), HasMips3_32r2(false),
- HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
- InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
- InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
- AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
- RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) {
- std::string CPUName = CPU;
- CPUName = selectMipsCPU(TT, CPUName);
-
- // Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
-
- if (InMips16Mode && !TM->Options.UseSoftFloat) {
- // Hard float for mips16 means essentially to compile as soft float
- // but to use a runtime library for soft float that is written with
- // native mips32 floating point instructions (those runtime routines
- // run in mips32 hard float mode).
- TM->Options.UseSoftFloat = true;
- TM->Options.FloatABIType = FloatABI::Soft;
- InMips16HardFloat = true;
- }
+ IsFPXX(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
+ IsGP64bit(false), HasVFPU(false), HasCnMips(false), IsLinux(true),
+ HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+ HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+ InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+ HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+ HasMSA(false), RM(_RM), OverrideMode(NoOverride), TM(_TM),
+ TargetTriple(TT),
+ DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))),
+ TSInfo(DL), JITInfo(), InstrInfo(MipsInstrInfo::create(*TM)),
+ FrameLowering(MipsFrameLowering::create(*TM, *this)),
+ TLInfo(MipsTargetLowering::create(*TM)) {
PreviousInMips16Mode = InMips16Mode;
- // Initialize scheduling itinerary for the specified CPU.
- InstrItins = getInstrItineraryForCPU(CPUName);
-
// Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and
// MIPS-V. They have not been tested and currently exist for the integrated
// assembler only.
@@ -137,6 +152,11 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
"See -mattr=+fp64.",
false);
+ if (!isABI_O32() && !useOddSPReg())
+ report_fatal_error("-mattr=+nooddspreg is not currently permitted for a "
+ "the O32 ABI.",
+ false);
+
if (hasMips32r6()) {
StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
@@ -167,6 +187,29 @@ MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
return OptLevel >= CodeGenOpt::Aggressive;
}
+MipsSubtarget &
+MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+ const TargetMachine *TM) {
+ std::string CPUName = selectMipsCPU(TargetTriple, CPU);
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, FS);
+ // Initialize scheduling itinerary for the specified CPU.
+ InstrItins = getInstrItineraryForCPU(CPUName);
+
+ if (InMips16Mode && !TM->Options.UseSoftFloat) {
+ // Hard float for mips16 means essentially to compile as soft float
+ // but to use a runtime library for soft float that is written with
+ // native mips32 floating point instructions (those runtime routines
+ // run in mips32 hard float mode).
+ TM->Options.UseSoftFloat = true;
+ TM->Options.FloatABIType = FloatABI::Soft;
+ InMips16HardFloat = true;
+ }
+
+ return *this;
+}
+
// FIXME: This logic for resetting the subtarget along with
// the helper classes can probably be simplified but there are a lot of
// cases so we will defer rewriting this to later.
@@ -186,14 +229,14 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
return;
OverrideMode = Mips16Override;
PreviousInMips16Mode = true;
- TM->setHelperClassesMips16();
+ setHelperClassesMips16();
return;
} else if (ChangeToNoMips16) {
if (!PreviousInMips16Mode)
return;
OverrideMode = NoMips16Override;
PreviousInMips16Mode = false;
- TM->setHelperClassesMipsSE();
+ setHelperClassesMipsSE();
return;
} else {
if (OverrideMode == NoOverride)
@@ -201,16 +244,52 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
OverrideMode = NoOverride;
DEBUG(dbgs() << "back to default" << "\n");
if (inMips16Mode() && !PreviousInMips16Mode) {
- TM->setHelperClassesMips16();
+ setHelperClassesMips16();
PreviousInMips16Mode = true;
} else if (!inMips16Mode() && PreviousInMips16Mode) {
- TM->setHelperClassesMipsSE();
+ setHelperClassesMipsSE();
PreviousInMips16Mode = false;
}
return;
}
}
+void MipsSubtarget::setHelperClassesMips16() {
+ InstrInfoSE.swap(InstrInfo);
+ FrameLoweringSE.swap(FrameLowering);
+ TLInfoSE.swap(TLInfo);
+ if (!InstrInfo16) {
+ InstrInfo.reset(MipsInstrInfo::create(*TM));
+ FrameLowering.reset(MipsFrameLowering::create(*TM, *this));
+ TLInfo.reset(MipsTargetLowering::create(*TM));
+ } else {
+ InstrInfo16.swap(InstrInfo);
+ FrameLowering16.swap(FrameLowering);
+ TLInfo16.swap(TLInfo);
+ }
+ assert(TLInfo && "null target lowering 16");
+ assert(InstrInfo && "null instr info 16");
+ assert(FrameLowering && "null frame lowering 16");
+}
+
+void MipsSubtarget::setHelperClassesMipsSE() {
+ InstrInfo16.swap(InstrInfo);
+ FrameLowering16.swap(FrameLowering);
+ TLInfo16.swap(TLInfo);
+ if (!InstrInfoSE) {
+ InstrInfo.reset(MipsInstrInfo::create(*TM));
+ FrameLowering.reset(MipsFrameLowering::create(*TM, *this));
+ TLInfo.reset(MipsTargetLowering::create(*TM));
+ } else {
+ InstrInfoSE.swap(InstrInfo);
+ FrameLoweringSE.swap(FrameLowering);
+ TLInfoSE.swap(TLInfo);
+ }
+ assert(TLInfo && "null target lowering in SE");
+ assert(InstrInfo && "null instr info SE");
+ assert(FrameLowering && "null frame lowering SE");
+}
+
bool MipsSubtarget::mipsSEUsesSoftFloat() const {
return TM->Options.UseSoftFloat && !InMips16HardFloat;
}
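Because computeDataLayout() concatenates a fixed set of fragments, the strings it can produce are easy to enumerate. Derived directly from the code above:

    // little-endian O32: "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
    // big-endian O32:    "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
    // little-endian N32: "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128"
    // little-endian N64: "e-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128"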
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 373f481..a3dcf03 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -14,6 +14,12 @@
#ifndef MIPSSUBTARGET_H
#define MIPSSUBTARGET_H
+#include "MipsFrameLowering.h"
+#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsJITInfo.h"
+#include "MipsSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -56,9 +62,16 @@ protected:
// floating point registers instead of only using even ones.
bool IsSingleFloat;
+ // IsFPXX - MIPS O32 modeless ABI.
+ bool IsFPXX;
+
// IsFP64bit - The target processor has 64-bit floating point registers.
bool IsFP64bit;
+ /// Are odd single-precision registers permitted?
+ /// This corresponds to -modd-spreg and -mno-odd-spreg
+ bool UseOddSPReg;
+
// IsNan2008 - IEEE 754-2008 NaN encoding.
bool IsNaN2008bit;
@@ -132,6 +145,20 @@ protected:
MipsTargetMachine *TM;
Triple TargetTriple;
+
+ const DataLayout DL; // Calculates type size & alignment
+ const MipsSelectionDAGInfo TSInfo;
+ MipsJITInfo JITInfo;
+ std::unique_ptr<const MipsInstrInfo> InstrInfo;
+ std::unique_ptr<const MipsFrameLowering> FrameLowering;
+ std::unique_ptr<const MipsTargetLowering> TLInfo;
+ std::unique_ptr<const MipsInstrInfo> InstrInfo16;
+ std::unique_ptr<const MipsFrameLowering> FrameLowering16;
+ std::unique_ptr<const MipsTargetLowering> TLInfo16;
+ std::unique_ptr<const MipsInstrInfo> InstrInfoSE;
+ std::unique_ptr<const MipsFrameLowering> FrameLoweringSE;
+ std::unique_ptr<const MipsTargetLowering> TLInfoSE;
+
public:
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
AntiDepBreakMode& Mode,
@@ -142,6 +169,7 @@ public:
bool isABI_N64() const { return MipsABI == N64; }
bool isABI_N32() const { return MipsABI == N32; }
bool isABI_O32() const { return MipsABI == O32; }
+ bool isABI_FPXX() const { return false; } // TODO: add check for FPXX
unsigned getTargetABI() const { return MipsABI; }
/// This constructor initializes the data members to match that
@@ -154,23 +182,36 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ bool hasMips1() const { return MipsArchVersion >= Mips1; }
bool hasMips2() const { return MipsArchVersion >= Mips2; }
bool hasMips3() const { return MipsArchVersion >= Mips3; }
+ bool hasMips4() const { return MipsArchVersion >= Mips4; }
+ bool hasMips5() const { return MipsArchVersion >= Mips5; }
bool hasMips4_32() const { return HasMips4_32; }
bool hasMips4_32r2() const { return HasMips4_32r2; }
- bool hasMips32() const { return MipsArchVersion >= Mips32; }
- bool hasMips32r2() const { return MipsArchVersion == Mips32r2 ||
- MipsArchVersion == Mips64r2; }
- bool hasMips32r6() const { return MipsArchVersion == Mips32r6 ||
- MipsArchVersion == Mips64r6; }
+ bool hasMips32() const {
+ return MipsArchVersion >= Mips32 && MipsArchVersion != Mips3 &&
+ MipsArchVersion != Mips4 && MipsArchVersion != Mips5;
+ }
+ bool hasMips32r2() const {
+ return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips32r6 ||
+ MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
+ }
+ bool hasMips32r6() const {
+ return MipsArchVersion == Mips32r6 || MipsArchVersion == Mips64r6;
+ }
bool hasMips64() const { return MipsArchVersion >= Mips64; }
- bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
+ bool hasMips64r2() const {
+ return MipsArchVersion == Mips64r2 || MipsArchVersion == Mips64r6;
+ }
bool hasMips64r6() const { return MipsArchVersion == Mips64r6; }
bool hasCnMips() const { return HasCnMips; }
bool isLittle() const { return IsLittle; }
+ bool isFPXX() const { return IsFPXX; }
bool isFP64bit() const { return IsFP64bit; }
+ bool useOddSPReg() const { return UseOddSPReg; }
bool isNaN2008() const { return IsNaN2008bit; }
bool isNotFP64bit() const { return !IsFP64bit; }
bool isGP64bit() const { return IsGP64bit; }
@@ -234,12 +275,31 @@ public:
/// \brief Reset the subtarget for the Mips target.
void resetSubtarget(MachineFunction *MF);
+ MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+ const TargetMachine *TM);
+
/// Does the system support unaligned memory access.
///
/// MIPS32r6/MIPS64r6 require full unaligned access support but does not
/// specify which component of the system provides it. Hardware, software, and
/// hybrid implementations are all valid.
bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+
+ // Set helper classes
+ void setHelperClassesMips16();
+ void setHelperClassesMipsSE();
+
+ MipsJITInfo *getJITInfo() { return &JITInfo; }
+ const MipsSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const MipsInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
+ const TargetFrameLowering *getFrameLowering() const {
+ return FrameLowering.get();
+ }
+ const MipsRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo->getRegisterInfo();
+ }
+ const MipsTargetLowering *getTargetLowering() const { return TLInfo.get(); }
};
} // End llvm namespace
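The constructor in MipsSubtarget.cpp relies on C++ member-initialization order: initializeSubtargetDependencies() runs inside the initializer of DL, the first member that depends on the parsed features, so ParseSubtargetFeatures has finished before DL, TSInfo, and the helper classes are built. A self-contained sketch of the idiom (all names hypothetical):

    #include <string>

    struct ExampleSubtarget {
      bool HasFeat = false;  // declared, and thus initialized, before DL
      const std::string DL;  // depends on HasFeat

      explicit ExampleSubtarget(const std::string &FS)
          : DL(computeDL(parseFeatures(FS))) {}

      ExampleSubtarget &parseFeatures(const std::string &FS) {
        HasFeat = FS.find("+feat") != std::string::npos;
        return *this; // returning *this lets the call nest in an initializer
      }

      static std::string computeDL(const ExampleSubtarget &ST) {
        return ST.HasFeat ? "layout-a" : "layout-b";
      }
    };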
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 984c58e..425dbf1 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -45,93 +45,21 @@ extern "C" void LLVMInitializeMipsTarget() {
RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
}
-static std::string computeDataLayout(const MipsSubtarget &ST) {
- std::string Ret = "";
-
- // There are both little and big endian mips.
- if (ST.isLittle())
- Ret += "e";
- else
- Ret += "E";
-
- Ret += "-m:m";
-
- // Pointers are 32 bit on some ABIs.
- if (!ST.isABI_N64())
- Ret += "-p:32:32";
-
- // 8 and 16 bit integers only need no have natural alignment, but try to
- // align them to 32 bits. 64 bit integers have natural alignment.
- Ret += "-i8:8:32-i16:16:32-i64:64";
-
- // 32 bit registers are always available and the stack is at least 64 bit
- // aligned. On N64 64 bit registers are also available and the stack is
- // 128 bit aligned.
- if (ST.isABI_N64() || ST.isABI_N32())
- Ret += "-n32:64-S128";
- else
- Ret += "-n32-S64";
-
- return Ret;
-}
-
// On function prologue, the stack is created by decrementing
// its pointer. Once decremented, all references are done with positive
// offset from the stack/frame pointer, using StackGrowsUp enables
// an easier handling.
// Using CodeModel::Large enables different CALL behavior.
-MipsTargetMachine::
-MipsTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, isLittle, RM, this),
- DL(computeDataLayout(Subtarget)),
- InstrInfo(MipsInstrInfo::create(*this)),
- FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
- TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData()), JITInfo() {
+MipsTargetMachine::MipsTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool isLittle)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, isLittle, RM, this) {
initAsmInfo();
}
-
-void MipsTargetMachine::setHelperClassesMips16() {
- InstrInfoSE.swap(InstrInfo);
- FrameLoweringSE.swap(FrameLowering);
- TLInfoSE.swap(TLInfo);
- if (!InstrInfo16) {
- InstrInfo.reset(MipsInstrInfo::create(*this));
- FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget));
- TLInfo.reset(MipsTargetLowering::create(*this));
- } else {
- InstrInfo16.swap(InstrInfo);
- FrameLowering16.swap(FrameLowering);
- TLInfo16.swap(TLInfo);
- }
- assert(TLInfo && "null target lowering 16");
- assert(InstrInfo && "null instr info 16");
- assert(FrameLowering && "null frame lowering 16");
-}
-
-void MipsTargetMachine::setHelperClassesMipsSE() {
- InstrInfo16.swap(InstrInfo);
- FrameLowering16.swap(FrameLowering);
- TLInfo16.swap(TLInfo);
- if (!InstrInfoSE) {
- InstrInfo.reset(MipsInstrInfo::create(*this));
- FrameLowering.reset(MipsFrameLowering::create(*this, Subtarget));
- TLInfo.reset(MipsTargetLowering::create(*this));
- } else {
- InstrInfoSE.swap(InstrInfo);
- FrameLoweringSE.swap(FrameLowering);
- TLInfoSE.swap(TLInfo);
- }
- assert(TLInfo && "null target lowering in SE");
- assert(InstrInfo && "null instr info SE");
- assert(FrameLowering && "null frame lowering SE");
-}
void MipsebTargetMachine::anchor() { }
MipsebTargetMachine::
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index a5aa39b..a0e7d43 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -14,15 +14,9 @@
#ifndef MIPSTARGETMACHINE_H
#define MIPSTARGETMACHINE_H
-#include "MipsFrameLowering.h"
-#include "MipsISelLowering.h"
-#include "MipsInstrInfo.h"
-#include "MipsJITInfo.h"
-#include "MipsSelectionDAGInfo.h"
#include "MipsSubtarget.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -32,68 +26,47 @@ class MipsRegisterInfo;
class MipsTargetMachine : public LLVMTargetMachine {
MipsSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- std::unique_ptr<const MipsInstrInfo> InstrInfo;
- std::unique_ptr<const MipsFrameLowering> FrameLowering;
- std::unique_ptr<const MipsTargetLowering> TLInfo;
- std::unique_ptr<const MipsInstrInfo> InstrInfo16;
- std::unique_ptr<const MipsFrameLowering> FrameLowering16;
- std::unique_ptr<const MipsTargetLowering> TLInfo16;
- std::unique_ptr<const MipsInstrInfo> InstrInfoSE;
- std::unique_ptr<const MipsFrameLowering> FrameLoweringSE;
- std::unique_ptr<const MipsTargetLowering> TLInfoSE;
- MipsSelectionDAGInfo TSInfo;
- const InstrItineraryData &InstrItins;
- MipsJITInfo JITInfo;
public:
- MipsTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS, const TargetOptions &Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool isLittle);
+ MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
virtual ~MipsTargetMachine() {}
void addAnalysisPasses(PassManagerBase &PM) override;
- const MipsInstrInfo *getInstrInfo() const override
- { return InstrInfo.get(); }
- const TargetFrameLowering *getFrameLowering() const override
- { return FrameLowering.get(); }
- const MipsSubtarget *getSubtargetImpl() const override
- { return &Subtarget; }
- const DataLayout *getDataLayout() const override
- { return &DL;}
-
+ const MipsInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const TargetFrameLowering *getFrameLowering() const override {
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const MipsSubtarget *getSubtargetImpl() const override { return &Subtarget; }
const InstrItineraryData *getInstrItineraryData() const override {
- return Subtarget.inMips16Mode() ? nullptr : &InstrItins;
+ return Subtarget.inMips16Mode()
+ ? nullptr
+ : &getSubtargetImpl()->getInstrItineraryData();
+ }
+ MipsJITInfo *getJITInfo() override {
+ return Subtarget.getJITInfo();
}
-
- MipsJITInfo *getJITInfo() override { return &JITInfo; }
-
const MipsRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
}
-
const MipsTargetLowering *getTargetLowering() const override {
- return TLInfo.get();
+ return getSubtargetImpl()->getTargetLowering();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
-
const MipsSelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
-
- // Set helper classes
- void setHelperClassesMips16();
-
- void setHelperClassesMipsSE();
-
-
};
/// MipsebTargetMachine - Mips32/64 big endian target machine.
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 4ad37ac..99f7d4c 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -12,46 +12,83 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "MCTargetDesc/MipsABIFlagsSection.h"
namespace llvm {
-class MipsTargetStreamer : public MCTargetStreamer {
- virtual void anchor();
+struct MipsABIFlagsSection;
+
+class MipsTargetStreamer : public MCTargetStreamer {
public:
MipsTargetStreamer(MCStreamer &S);
- virtual void emitDirectiveSetMicroMips() = 0;
- virtual void emitDirectiveSetNoMicroMips() = 0;
- virtual void emitDirectiveSetMips16() = 0;
- virtual void emitDirectiveSetNoMips16() = 0;
-
- virtual void emitDirectiveSetReorder() = 0;
- virtual void emitDirectiveSetNoReorder() = 0;
- virtual void emitDirectiveSetMacro() = 0;
- virtual void emitDirectiveSetNoMacro() = 0;
- virtual void emitDirectiveSetAt() = 0;
- virtual void emitDirectiveSetNoAt() = 0;
- virtual void emitDirectiveEnd(StringRef Name) = 0;
-
- virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0;
- virtual void emitDirectiveAbiCalls() = 0;
- virtual void emitDirectiveNaN2008() = 0;
- virtual void emitDirectiveNaNLegacy() = 0;
- virtual void emitDirectiveOptionPic0() = 0;
- virtual void emitDirectiveOptionPic2() = 0;
+ virtual void emitDirectiveSetMicroMips();
+ virtual void emitDirectiveSetNoMicroMips();
+ virtual void emitDirectiveSetMips16();
+ virtual void emitDirectiveSetNoMips16();
+
+ virtual void emitDirectiveSetReorder();
+ virtual void emitDirectiveSetNoReorder();
+ virtual void emitDirectiveSetMacro();
+ virtual void emitDirectiveSetNoMacro();
+ virtual void emitDirectiveSetAt();
+ virtual void emitDirectiveSetNoAt();
+ virtual void emitDirectiveEnd(StringRef Name);
+
+ virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+ virtual void emitDirectiveAbiCalls();
+ virtual void emitDirectiveNaN2008();
+ virtual void emitDirectiveNaNLegacy();
+ virtual void emitDirectiveOptionPic0();
+ virtual void emitDirectiveOptionPic2();
virtual void emitFrame(unsigned StackReg, unsigned StackSize,
- unsigned ReturnReg) = 0;
- virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) = 0;
- virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) = 0;
+ unsigned ReturnReg);
+ virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+ virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
- virtual void emitDirectiveSetMips32R2() = 0;
- virtual void emitDirectiveSetMips64() = 0;
- virtual void emitDirectiveSetMips64R2() = 0;
- virtual void emitDirectiveSetDsp() = 0;
+ virtual void emitDirectiveSetMips32R2();
+ virtual void emitDirectiveSetMips64();
+ virtual void emitDirectiveSetMips64R2();
+ virtual void emitDirectiveSetDsp();
// PIC support
- virtual void emitDirectiveCpload(unsigned RegNo) = 0;
+ virtual void emitDirectiveCpload(unsigned RegNo);
virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
- const MCSymbol &Sym, bool IsReg) = 0;
+ const MCSymbol &Sym, bool IsReg);
+
+ /// Emit a '.module fp=value' directive using the given values.
+ /// Updates the .MIPS.abiflags section
+ virtual void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value,
+ bool Is32BitABI) {
+ ABIFlagsSection.setFpABI(Value, Is32BitABI);
+ }
+
+ /// Emit a '.module fp=value' directive using the current values of the
+ /// .MIPS.abiflags section.
+ void emitDirectiveModuleFP() {
+ emitDirectiveModuleFP(ABIFlagsSection.getFpABI(),
+ ABIFlagsSection.Is32BitABI);
+ }
+
+ virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI);
+ virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) {}
+ virtual void emitMipsAbiFlags() {}
+ void setCanHaveModuleDir(bool Can) { canHaveModuleDirective = Can; }
+ bool getCanHaveModuleDir() { return canHaveModuleDirective; }
+
+ // This method enables template classes to set internal abi flags
+ // structure values.
+ template <class PredicateLibrary>
+ void updateABIInfo(const PredicateLibrary &P) {
+ ABIFlagsSection.setAllFromPredicates(P);
+ }
+
+ MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
+
+protected:
+ MipsABIFlagsSection ABIFlagsSection;
+
+private:
+ bool canHaveModuleDirective;
};
// This part is for ascii assembly output
@@ -93,6 +130,13 @@ public:
virtual void emitDirectiveCpload(unsigned RegNo);
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
+
+ // ABI Flags
+ void emitDirectiveModuleFP(MipsABIFlagsSection::FpABIKind Value,
+ bool Is32BitABI) override;
+ void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override;
+ void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
+ void emitMipsAbiFlags() override;
};
// This part is for ELF object output
@@ -144,6 +188,10 @@ public:
void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
const MCSymbol &Sym, bool IsReg) override;
+ // ABI Flags
+ void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI) override;
+ void emitMipsAbiFlags() override;
+
protected:
bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
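updateABIInfo() is a template so that any object exposing the expected predicate methods (the subtarget in the compiler, the parser in the assembler) can populate the flags section without sharing a base class. A small sketch of this duck-typed pattern; the method set shown is illustrative, not the real interface:

    struct FakePredicates {
      bool isABI_O32() const { return true; }
      bool isFP64bit() const { return false; }
    };

    template <class PredicateLibrary>
    void setAllFromPredicates(const PredicateLibrary &P) {
      // Compiles against any P that provides these members.
      bool IsO32 = P.isABI_O32();
      bool IsFP64 = P.isFP64bit();
      (void)IsO32; (void)IsFP64; // a real implementation would record them
    }

    // Usage: setAllFromPredicates(FakePredicates());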
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index d78b4e8..93fabf6 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -34,12 +34,18 @@ def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
"Target SM 3.0">;
def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
"Target SM 3.5">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+ "Target SM 5.0">;
// PTX Versions
def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
"Use PTX version 3.0">;
def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
"Use PTX version 3.1">;
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+ "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+ "Use PTX version 4.0">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -52,6 +58,7 @@ def : Proc<"sm_20", [SM20]>;
def : Proc<"sm_21", [SM21]>;
def : Proc<"sm_30", [SM30]>;
def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_50", [SM50]>;
def NVPTXInstrInfo : InstrInfo {
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 4ec575f..decf02a 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -734,23 +734,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
<< " func_retval0";
} else {
if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*TLI, Ty, vtparts);
- unsigned totalsz = 0;
- for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
- unsigned elems = 1;
- EVT elemtype = vtparts[i];
- if (vtparts[i].isVector()) {
- elems = vtparts[i].getVectorNumElements();
- elemtype = vtparts[i].getVectorElementType();
- }
- for (unsigned j = 0, je = elems; j != je; ++j) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- totalsz += sz / 8;
- }
- }
+ unsigned totalsz = TD->getTypeAllocSize(Ty);
unsigned retAlignment = 0;
if (!llvm::getAlign(*F, 0, retAlignment))
retAlignment = TD->getABITypeAlignment(Ty);
@@ -1321,6 +1305,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
// external global variable with init -> .visible
// external without init -> .extern
// appending -> not allowed, assert.
+// for any linkage other than
+// internal, private, linker_private,
+// linker_private_weak, linker_private_weak_def_auto,
+// we emit -> .weak.
void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
raw_ostream &O) {
@@ -1346,6 +1334,9 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
msg.append(V->getName().str());
msg.append("has unsupported appending linkage type");
llvm_unreachable(msg.c_str());
+ } else if (!V->hasInternalLinkage() &&
+ !V->hasPrivateLinkage()) {
+ O << ".weak ";
}
}
}
@@ -1356,10 +1347,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// Skip meta data
if (GVar->hasSection()) {
- if (GVar->getSection() == "llvm.metadata")
+ if (GVar->getSection() == StringRef("llvm.metadata"))
return;
}
+ // Skip LLVM intrinsic global variables
+ if (GVar->getName().startswith("llvm.") ||
+ GVar->getName().startswith("nvvm."))
+ return;
+
const DataLayout *TD = TM.getDataLayout();
// GlobalVariables are always constant pointers themselves.
@@ -1371,6 +1367,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ".visible ";
else
O << ".extern ";
+ } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+ GVar->hasAvailableExternallyLinkage() ||
+ GVar->hasCommonLinkage()) {
+ O << ".weak ";
}
if (llvm::isTexture(*GVar)) {
@@ -1438,7 +1438,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << "linear";
break;
case 2:
- assert(0 && "Anisotropic filtering is not supported");
+ llvm_unreachable("Anisotropic filtering is not supported");
default:
O << "nearest";
break;
@@ -1480,6 +1480,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ".";
emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+ if (isManaged(*GVar)) {
+ O << " .attribute(.managed)";
+ }
+
if (GVar->getAlignment() == 0)
O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
else
@@ -1497,13 +1502,24 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// PTX allows variable initialization only for constant and global state
// spaces.
- if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
- (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
- GVar->hasInitializer()) {
- const Constant *Initializer = GVar->getInitializer();
- if (!Initializer->isNullValue()) {
- O << " = ";
- printScalarConstant(Initializer, O);
+ if (GVar->hasInitializer()) {
+ if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+ const Constant *Initializer = GVar->getInitializer();
+ // 'undef' is treated as if no value were specified.
+ if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+ O << " = ";
+ printScalarConstant(Initializer, O);
+ }
+ } else {
+ // The frontend adds a zero-initializer to variables that don't have an
+ // initial value, so skip the warning in this case.
+ if (!GVar->getInitializer()->isNullValue()) {
+ std::string warnMsg = "initial value of '" + GVar->getName().str() +
+ "' is not allowed in addrspace(" +
+ llvm::utostr_32(PTy->getAddressSpace()) + ")";
+ report_fatal_error(warnMsg.c_str());
+ }
}
}
} else {
@@ -1562,7 +1578,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
break;
default:
- assert(0 && "type not supported yet");
+ llvm_unreachable("type not supported yet");
}
}
@@ -1682,7 +1698,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
O << "]";
break;
default:
- assert(0 && "type not supported yet");
+ llvm_unreachable("type not supported yet");
}
return;
}
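One note on the printReturnValStr change near the top of this file: the removed loop summed the bit-sizes of the value's parts, while DataLayout::getTypeAllocSize() returns the in-memory size in bytes, padding included. A hedged sketch of what the replacement computes:

    // Standard DataLayout query: allocation size in bytes, including any
    // padding between and after elements.
    const DataLayout *TD = TM.getDataLayout();
    unsigned totalsz = TD->getTypeAllocSize(Ty); // e.g. {i8, i32} -> 8 bytes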
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 9030584..8b088412 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -26,6 +26,10 @@
using namespace llvm;
+NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
+ is64bit(STI.is64Bit()) {}
+
bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
@@ -43,17 +47,21 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
// cvta.local %SP, %SPL;
if (is64bit) {
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
- MachineInstr *MI = BuildMI(
- MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
- NVPTX::VRFrame).addReg(LocalReg);
- BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
LocalReg).addImm(MF.getFunctionNumber());
} else {
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
- MachineInstr *MI = BuildMI(
- MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
- NVPTX::VRFrame).addReg(LocalReg);
- BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
LocalReg).addImm(MF.getFunctionNumber());
}
}
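With the cached tm member gone from NVPTXFrameLowering, the prologue reaches target hooks through the MachineFunction instead, which is the pattern the rewritten BuildMI calls above follow. A minimal sketch (SomeOpcode and DestReg are placeholders):

    // Fetch the instruction info from the function being compiled rather
    // than from state captured at construction time.
    const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
    BuildMI(MBB, MBBI, dl, TII->get(SomeOpcode), DestReg)
        .addImm(MF.getFunctionNumber());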
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 2ae6d72..56fb673 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -17,16 +17,12 @@
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
-class NVPTXTargetMachine;
-
+class NVPTXSubtarget;
class NVPTXFrameLowering : public TargetFrameLowering {
- NVPTXTargetMachine &tm;
bool is64bit;
public:
- explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm),
- is64bit(_is64bit) {}
+ explicit NVPTXFrameLowering(NVPTXSubtarget &STI);
bool hasFP(const MachineFunction &MF) const override;
void emitPrologue(MachineFunction &MF) const override;
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 023dd5e..faa9fdb 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -84,7 +84,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
GlobalVariable *GV = I++;
if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
!llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
- !GV->getName().startswith("llvm.")) {
+ !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
GlobalVariable *NewGV = new GlobalVariable(
M, GV->getType()->getElementType(), GV->isConstant(),
GV->getLinkage(),
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index cd30880..0dfbf10 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -24,11 +24,14 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-isel"
-static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::init(2));
+unsigned FMAContractLevel = 0;
+
+static cl::opt<unsigned, true>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::location(FMAContractLevel),
+ cl::init(2));
static cl::opt<int> UsePrecDivF32(
"nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
@@ -138,7 +141,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::LDGV4:
case NVPTXISD::LDUV2:
case NVPTXISD::LDUV4:
- ResNode = SelectLDGLDUVector(N);
+ ResNode = SelectLDGLDU(N);
break;
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -164,6 +167,9 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case ISD::INTRINSIC_WO_CHAIN:
ResNode = SelectIntrinsicNoChain(N);
break;
+ case ISD::INTRINSIC_W_CHAIN:
+ ResNode = SelectIntrinsicChain(N);
+ break;
case NVPTXISD::Tex1DFloatI32:
case NVPTXISD::Tex1DFloatFloat:
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -253,6 +259,12 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
case NVPTXISD::Suld3DV4I32Trap:
ResNode = SelectSurfaceIntrinsic(N);
break;
+ case ISD::AND:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Try to select BFE
+ ResNode = SelectBFE(N);
+ break;
case ISD::ADDRSPACECAST:
ResNode = SelectAddrSpaceCast(N);
break;
@@ -264,6 +276,21 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ return SelectLDGLDU(N);
+ }
+}
+
static unsigned int getCodeAddrSpace(MemSDNode *N,
const NVPTXSubtarget &Subtarget) {
const Value *Src = N->getMemOperand()->getValue();
@@ -981,22 +1008,101 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
return LD;
}
-SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
+SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
SDValue Chain = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
+ SDValue Op1;
+ MemSDNode *Mem;
+ bool IsLDG = true;
+
+ // If this is an LDG intrinsic, the address is the third operand. If it's an
+ // LDG/LDU SD node (from custom vector handling), then it's the second operand.
+ if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ Op1 = N->getOperand(2);
+ Mem = cast<MemIntrinsicSDNode>(N);
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ IsLDG = true;
+ break;
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ IsLDG = false;
+ break;
+ }
+ } else {
+ Op1 = N->getOperand(1);
+ Mem = cast<MemSDNode>(N);
+ }
+
unsigned Opcode;
SDLoc DL(N);
SDNode *LD;
- MemSDNode *Mem = cast<MemSDNode>(N);
SDValue Base, Offset, Addr;
- EVT EltVT = Mem->getMemoryVT().getVectorElementType();
+ EVT EltVT = Mem->getMemoryVT();
+ if (EltVT.isVector()) {
+ EltVT = EltVT.getVectorElementType();
+ }
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1092,6 +1198,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1181,6 +1336,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1276,6 +1480,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1365,6 +1618,55 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
switch (N->getOpcode()) {
default:
return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+ break;
+ }
+ }
+ break;
case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) {
default:
@@ -1457,7 +1759,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ MemRefs0[0] = Mem->getMemOperand();
cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
return LD;
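
The switches above pick one machine opcode per (access kind, addressing mode, element type) combination; the opcode names follow a single scheme, which makes the tables easier to audit. A rough illustration of that naming convention (plain string assembly, not an LLVM API; ldgLduOpcodeName is invented):

    #include <string>

    // Mirrors the INT_PTX_LDG/LDU_GLOBAL_* names selected above. Kind is
    // "LDG" or "LDU"; Ty is one of "i8".."i64", "f32", "f64"; Mode is "avar"
    // (direct address), "ari"/"ari64" (register+immediate), or
    // "areg"/"areg64" (register).
    std::string ldgLduOpcodeName(const std::string &Kind, const std::string &Ty,
                                 const std::string &Mode) {
      return "INT_PTX_" + Kind + "_GLOBAL_" + Ty + Mode;
    }
    // e.g. ldgLduOpcodeName("LDG", "f32", "ari64") == "INT_PTX_LDG_GLOBAL_f32ari64"
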
@@ -2959,6 +3261,214 @@ SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
return Ret;
}
+/// SelectBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction.
+SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Len;
+ SDValue Start;
+ SDValue Val;
+ bool IsSigned = false;
+
+ if (N->getOpcode() == ISD::AND) {
+ // Canonicalize the operands
+ // We want 'and %val, %mask'
+ if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+ std::swap(LHS, RHS);
+ }
+
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+ if (!Mask) {
+ // We need a constant mask on the RHS of the AND
+ return NULL;
+ }
+
+ // Extract the mask bits
+ uint64_t MaskVal = Mask->getZExtValue();
+ if (!isMask_64(MaskVal)) {
+ // We *could* handle shifted masks here, but doing so would require an
+ // 'and' operation to fix up the low-order bits, so we would trade
+ // shr+and for bfe+and, which has the same throughput
+ return NULL;
+ }
+
+ // How many bits are in our mask?
+ uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+ if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+ // We have a 'srl/and' pair, extract the effective start bit and length
+ Val = LHS.getNode()->getOperand(0);
+ Start = LHS.getNode()->getOperand(1);
+ ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+ if (StartConst) {
+ uint64_t StartVal = StartConst->getZExtValue();
+ // How many "good" bits do we have left? "good" is defined here as bits
+ // that exist in the original value, not shifted in.
+ uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+ if (NumBits > GoodBits) {
+ // Do not handle the case where bits have been shifted in. In theory
+ // we could handle this, but the cost is likely higher than just
+ // emitting the srl/and pair.
+ return NULL;
+ }
+ Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+ } else {
+ // Do not handle the case where the shift amount is not a constant. We
+ // could handle it, but that would require run-time logic that would be
+ // more expensive than just emitting the srl/and pair.
+ return NULL;
+ }
+ } else {
+ // Do not handle the case where the LHS of the and is not a shift. While
+ // it would be trivial to handle this case, it would just transform
+ // 'and' -> 'bfe', but 'and' has higher throughput.
+ return NULL;
+ }
+ } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+ if (LHS->getOpcode() == ISD::AND) {
+ ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShiftCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+
+ uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+ SDValue AndLHS = LHS->getOperand(0);
+ SDValue AndRHS = LHS->getOperand(1);
+
+ // Canonicalize the AND to have the mask on the RHS
+ if (isa<ConstantSDNode>(AndLHS)) {
+ std::swap(AndLHS, AndRHS);
+ }
+
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+ if (!MaskCnst) {
+ // Mask must be constant
+ return NULL;
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ uint64_t NumZeros;
+ uint64_t NumBits;
+ if (isMask_64(MaskVal)) {
+ NumZeros = 0;
+ // The number of bits in the result bitfield will be the number of
+ // trailing ones (the AND) minus the number of bits we shift off
+ NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+ } else if (isShiftedMask_64(MaskVal)) {
+ NumZeros = countTrailingZeros(MaskVal);
+ unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+ // The number of bits in the result bitfield will be the number of
+ // trailing zeros plus the number of set bits in the mask minus the
+ // number of bits we shift off
+ NumBits = NumZeros + NumOnes - ShiftAmt;
+ } else {
+ // This is not a mask we can handle
+ return NULL;
+ }
+
+ if (ShiftAmt < NumZeros) {
+ // Handling this case would require extra logic that would make this
+ // transformation unprofitable
+ return NULL;
+ }
+
+ Val = AndLHS;
+ Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+ } else if (LHS->getOpcode() == ISD::SHL) {
+ // Here, we have a pattern like:
+ //
+ // (sra (shl val, NN), MM)
+ // or
+ // (srl (shl val, NN), MM)
+ //
+ // If MM >= NN, we can efficiently optimize this with bfe
+ Val = LHS->getOperand(0);
+
+ SDValue ShlRHS = LHS->getOperand(1);
+ ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+ if (!ShlCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+ SDValue ShrRHS = RHS;
+ ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+ if (!ShrCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+ // To avoid extra codegen and be profitable, we need Outer >= Inner
+ if (OuterShiftAmt < InnerShiftAmt) {
+ return NULL;
+ }
+
+ // If the outer shift is more than the type size, we have no bitfield to
+ // extract (since we also check that the inner shift is <= the outer
+ // shift, this also implies that the inner shift is < the type size)
+ if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+ return NULL;
+ }
+
+ Start =
+ CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+ Len =
+ CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+ OuterShiftAmt, MVT::i32);
+
+ if (N->getOpcode() == ISD::SRA) {
+ // If we have an arithmetic right shift, we need to use the signed bfe
+ // variant
+ IsSigned = true;
+ }
+ } else {
+ // The inner operand is not a shift we can fold; bail out.
+ return NULL;
+ }
+ } else {
+ // Not an opcode we can handle; bail out.
+ return NULL;
+ }
+
+
+ unsigned Opc;
+ // For the BFE operations we form here from "and" and "srl", always use the
+ // unsigned variants; IsSigned is only set by the sra-of-shl pattern above.
+ if (Val.getValueType() == MVT::i32) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S32rii;
+ } else {
+ Opc = NVPTX::BFE_U32rii;
+ }
+ } else if (Val.getValueType() == MVT::i64) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S64rii;
+ } else {
+ Opc = NVPTX::BFE_U64rii;
+ }
+ } else {
+ // We cannot handle this type
+ return NULL;
+ }
+
+ SDValue Ops[] = {
+ Val, Start, Len
+ };
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+ return Ret;
+}
+
// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
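
As a sanity check on the (start, length) computations in SelectBFE above, here is a scalar model of the unsigned bfe operation together with the three patterns the selector rewrites; bfe_u is an invented helper, not an LLVM or PTX API:

    #include <cassert>
    #include <cstdint>

    // Scalar model of PTX "bfe.u32 d, a, start, len" (for len < 32).
    static uint32_t bfe_u(uint32_t a, uint32_t start, uint32_t len) {
      return (a >> start) & ((1u << len) - 1u);
    }

    int main() {
      uint32_t x = 0xDEADBEEF;
      // (and (srl x, 8), 0xffff) -> bfe x, start=8, len=16.
      assert(((x >> 8) & 0xFFFFu) == bfe_u(x, 8, 16));
      // (srl (and x, 0xff00), 8): shifted mask with NumZeros=8, NumOnes=8,
      // ShiftAmt=8 -> bfe x, start=8, len=8+8-8=8.
      assert(((x & 0xFF00u) >> 8) == bfe_u(x, 8, 8));
      // (srl (shl x, 24), 28) -> bfe x, start=28-24=4, len=32-28=4.
      assert(((x << 24) >> 28) == bfe_u(x, 4, 4));
      return 0;
    }
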
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 11f92e7..c44ccb2 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -59,10 +59,11 @@ private:
SDNode *Select(SDNode *N) override;
SDNode *SelectIntrinsicNoChain(SDNode *N);
+ SDNode *SelectIntrinsicChain(SDNode *N);
SDNode *SelectTexSurfHandle(SDNode *N);
SDNode *SelectLoad(SDNode *N);
SDNode *SelectLoadVector(SDNode *N);
- SDNode *SelectLDGLDUVector(SDNode *N);
+ SDNode *SelectLDGLDU(SDNode *N);
SDNode *SelectStore(SDNode *N);
SDNode *SelectStoreVector(SDNode *N);
SDNode *SelectLoadParam(SDNode *N);
@@ -71,6 +72,7 @@ private:
SDNode *SelectAddrSpaceCast(SDNode *N);
SDNode *SelectTextureIntrinsic(SDNode *N);
SDNode *SelectSurfaceIntrinsic(SDNode *N);
+ SDNode *SelectBFE(SDNode *N);
inline SDValue getI32Imm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b0943be..cb452ff 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>
@@ -111,6 +112,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Jump is Expensive. Don't create extra control flow for 'and', 'or'
// condition branches.
@@ -130,7 +132,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
// Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -146,6 +154,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
+
if (nvptxSubtarget.hasROT64()) {
setOperationAction(ISD::ROTL, MVT::i64, Legal);
setOperationAction(ISD::ROTR, MVT::i64, Legal);
@@ -237,6 +252,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ // We have some custom DAG combine patterns for these nodes
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
+
// Now deduce the information based on the above mentioned
// actions
computeRegisterProperties();
@@ -328,6 +350,16 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::StoreV2";
case NVPTXISD::StoreV4:
return "NVPTXISD::StoreV4";
+ case NVPTXISD::FUN_SHFL_CLAMP:
+ return "NVPTXISD::FUN_SHFL_CLAMP";
+ case NVPTXISD::FUN_SHFR_CLAMP:
+ return "NVPTXISD::FUN_SHFR_CLAMP";
+ case NVPTXISD::IMAD:
+ return "NVPTXISD::IMAD";
+ case NVPTXISD::MUL_WIDE_SIGNED:
+ return "NVPTXISD::MUL_WIDE_SIGNED";
+ case NVPTXISD::MUL_WIDE_UNSIGNED:
+ return "NVPTXISD::MUL_WIDE_UNSIGNED";
case NVPTXISD::Tex1DFloatI32: return "NVPTXISD::Tex1DFloatI32";
case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -441,8 +473,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-bool NVPTXTargetLowering::shouldSplitVectorType(EVT VT) const {
- return VT.getScalarType() == MVT::i1;
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+ return TypeSplitVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
}
SDValue
@@ -487,26 +523,12 @@ NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << getPointerTy().getSizeInBits() << " _";
} else {
- if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) {
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, retTy, vtparts);
- unsigned totalsz = 0;
- for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
- unsigned elems = 1;
- EVT elemtype = vtparts[i];
- if (vtparts[i].isVector()) {
- elems = vtparts[i].getVectorNumElements();
- elemtype = vtparts[i].getVectorElementType();
- }
- // TODO: no need to loop
- for (unsigned j = 0, je = elems; j != je; ++j) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- totalsz += sz / 8;
- }
- }
- O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
+ if ((retTy->getTypeID() == Type::StructTyID) ||
+ isa<VectorType>(retTy)) {
+ O << ".param .align "
+ << retAlignment
+ << " .b8 _["
+ << getDataLayout()->getTypeAllocSize(retTy) << "]";
} else {
assert(false && "Unknown return type");
}
@@ -675,7 +697,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Ty->isAggregateType()) {
// aggregate
SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, Ty, vtparts);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
// declare .param .align <align> .b8 .param<n>[<size>];
@@ -687,34 +710,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
- unsigned curOffset = 0;
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- unsigned elems = 1;
EVT elemtype = vtparts[j];
- if (vtparts[j].isVector()) {
- elems = vtparts[j].getVectorNumElements();
- elemtype = vtparts[j].getVectorElementType();
- }
- for (unsigned k = 0, ke = elems; k != ke; ++k) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue StVal = OutVals[OIdx];
- if (elemtype.getSizeInBits() < 16) {
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, MVT::i32),
- DAG.getConstant(curOffset, MVT::i32),
- StVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- elemtype, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += sz / 8;
- ++OIdx;
+ unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+ SDValue StVal = OutVals[OIdx];
+ if (elemtype.getSizeInBits() < 16) {
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
}
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(Offsets[j], MVT::i32),
+ StVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ elemtype, MachinePointerInfo(),
+ ArgAlign);
+ InFlag = Chain.getValue(1);
+ ++OIdx;
}
if (vtparts.size() > 0)
--OIdx;
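
The GreatestCommonDivisor64(align, Offsets[j]) calls above derive a safe per-element alignment: if the whole parameter is A-aligned and an element lives at byte offset O, its address is only guaranteed gcd(A, O)-aligned. A small self-contained check, using std::gcd as a stand-in for LLVM's GreatestCommonDivisor64 from MathExtras.h:

    #include <cassert>
    #include <cstdint>
    #include <numeric>  // std::gcd, C++17

    uint64_t partAlign(uint64_t BaseAlign, uint64_t Offset) {
      return std::gcd(BaseAlign, Offset);
    }

    int main() {
      assert(partAlign(8, 0) == 8);   // offset 0 keeps the full alignment
      assert(partAlign(8, 12) == 4);  // 8-aligned base + offset 12: 4-aligned
      assert(partAlign(8, 6) == 2);
      return 0;
    }
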
@@ -899,13 +914,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
// struct or vector
SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
- ComputeValueVTs(*this, PTy->getElementType(), vtparts);
+ ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
// so we don't need to worry about natural alignment or not.
// See TargetLowering::LowerCallTo().
@@ -917,38 +934,28 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
- unsigned curOffset = 0;
for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- unsigned elems = 1;
EVT elemtype = vtparts[j];
- if (vtparts[j].isVector()) {
- elems = vtparts[j].getVectorNumElements();
- elemtype = vtparts[j].getVectorElementType();
+ int curOffset = Offsets[j];
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ SDValue srcAddr =
+ DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
+ DAG.getConstant(curOffset, getPointerTy()));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), false, false, false,
+ PartAlign);
+ if (elemtype.getSizeInBits() < 16) {
+ theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
}
- for (unsigned k = 0, ke = elems; k != ke; ++k) {
- unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
- DAG.getConstant(curOffset, getPointerTy()));
- SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), false, false, false,
- 0);
- if (elemtype.getSizeInBits() < 16) {
- theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
- DAG.getConstant(curOffset, MVT::i32), theVal,
- InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
- CopyParamOps, elemtype,
- MachinePointerInfo());
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(curOffset, MVT::i32), theVal,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, elemtype,
+ MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += sz / 8;
- }
+ InFlag = Chain.getValue(1);
}
++paramCount;
}
@@ -1057,7 +1064,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Generate loads from param memory/moves from registers for result
if (Ins.size() > 0) {
- unsigned resoffset = 0;
if (retTy && retTy->isVectorTy()) {
EVT ObjectVT = getValueType(retTy);
unsigned NumElts = ObjectVT.getVectorNumElements();
@@ -1066,14 +1072,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ObjectVT) == NumElts &&
"Vector was not scalarized");
unsigned sz = EltVT.getSizeInBits();
- bool needTruncate = sz < 16 ? true : false;
+ bool needTruncate = sz < 8;
if (NumElts == 1) {
// Just a simple load
SmallVector<EVT, 4> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading an i1/i8 result, generate
+ // load.b8 i16
+ // trunc i16 to i1 (for i1 only)
LoadRetVTs.push_back(MVT::i16);
} else
@@ -1097,9 +1104,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (NumElts == 2) {
// LoadV2
SmallVector<EVT, 4> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading an i1/i8 result, generate
+ // load.b8 i16
+ // trunc i16 to i1 (for i1 only)
LoadRetVTs.push_back(MVT::i16);
LoadRetVTs.push_back(MVT::i16);
@@ -1142,9 +1150,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
for (unsigned i = 0; i < NumElts; i += VecSize) {
SmallVector<EVT, 8> LoadRetVTs;
- if (needTruncate) {
- // If loading i1 result, generate
- // load i16
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading an i1/i8 result, generate
+ // load.b8 i16
+ // trunc i16 to i1 (for i1 only)
for (unsigned j = 0; j < VecSize; ++j)
LoadRetVTs.push_back(MVT::i16);
@@ -1183,10 +1192,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
} else {
SmallVector<EVT, 16> VTs;
- ComputePTXValueVTs(*this, retTy, VTs);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
unsigned sz = VTs[i].getSizeInBits();
+ unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
bool needTruncate = sz < 8 ? true : false;
if (VTs[i].isInteger() && (sz < 8))
sz = 8;
@@ -1212,19 +1224,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 4> LoadRetOps;
LoadRetOps.push_back(Chain);
LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
- LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
LoadRetOps.push_back(InFlag);
SDValue retval = DAG.getMemIntrinsicNode(
NVPTXISD::LoadParam, dl,
DAG.getVTList(LoadRetVTs), LoadRetOps,
- TheLoadType, MachinePointerInfo());
+ TheLoadType, MachinePointerInfo(), AlignI);
Chain = retval.getValue(1);
InFlag = retval.getValue(2);
SDValue Ret0 = retval.getValue(0);
if (needTruncate)
Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
InVals.push_back(Ret0);
- resoffset += sz / 8;
}
}
}
@@ -1262,6 +1273,127 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
}
+/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
+/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32-bit shifts on sm_35 and later, we can use the 'shf' funnel shift.
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // dHi = aHi >> Amt
+ // dLo = shf.r.clamp aLo, aHi, Amt
+
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ } else {
+
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // - if (Amt>=size) then
+ // dLo = aHi >> (Amt-size)
+ // dHi = aHi >> Amt (this is either all 0 or all 1)
+ // else
+ // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+ // dHi = aHi >> Amt
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
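
A scalar model of the generic (non-shf) path above, checking the dLo/dHi formulas for a 64-bit logical right shift built from 32-bit halves. This is plain C++ for illustration; Amt == 0 is excluded so every C++ shift stays well-defined (the DAG nodes share that edge case, which the sm_35 shf.r.clamp path sidesteps):

    #include <cassert>
    #include <cstdint>

    // {dHi, dLo} = {aHi, aLo} >> Amt, from 32-bit operations (logical case).
    static void srl64parts(uint32_t aLo, uint32_t aHi, unsigned Amt,
                           uint32_t &dLo, uint32_t &dHi) {
      if (Amt >= 32) {                              // the TrueVal branch
        dLo = aHi >> (Amt - 32);
        dHi = 0;                                    // aHi >> Amt: all zero
      } else {                                      // the FalseVal branch
        dLo = (aLo >> Amt) | (aHi << (32 - Amt));
        dHi = aHi >> Amt;
      }
    }

    int main() {
      uint64_t a = 0x123456789ABCDEF0ULL;
      for (unsigned Amt = 1; Amt < 64; ++Amt) {
        uint32_t lo, hi;
        srl64parts((uint32_t)a, (uint32_t)(a >> 32), Amt, lo, hi);
        assert((((uint64_t)hi << 32) | lo) == (a >> Amt));
      }
      return 0;
    }
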
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32-bit shifts on sm_35 and later, we can use the 'shf' funnel shift.
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // dHi = shf.l.clamp aLo, aHi, Amt
+ // dLo = aLo << Amt
+
+ SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ } else {
+
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // - if (Amt>=size) then
+ // dLo = aLo << Amt (all 0)
+ // dHi = aLo << (Amt-size)
+ // else
+ // dLo = aLo << Amt
+ // dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
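
The mirror check for LowerShiftLeftParts, under the same caveats (Amt == 0 excluded; illustration only):

    #include <cassert>
    #include <cstdint>

    // {dHi, dLo} = {aHi, aLo} << Amt, from 32-bit operations.
    static void shl64parts(uint32_t aLo, uint32_t aHi, unsigned Amt,
                           uint32_t &dLo, uint32_t &dHi) {
      if (Amt >= 32) {                              // the TrueVal branch
        dLo = 0;                                    // aLo << Amt: all zero
        dHi = aLo << (Amt - 32);
      } else {                                      // the FalseVal branch
        dLo = aLo << Amt;
        dHi = (aHi << Amt) | (aLo >> (32 - Amt));
      }
    }

    int main() {
      uint64_t a = 0x123456789ABCDEF0ULL;
      for (unsigned Amt = 1; Amt < 64; ++Amt) {
        uint32_t lo, hi;
        shl64parts((uint32_t)a, (uint32_t)(a >> 32), Amt, lo, hi);
        assert((((uint64_t)hi << 32) | lo) == (a << Amt));
      }
      return 0;
    }
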
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -1282,6 +1414,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTORE(Op, DAG);
case ISD::LOAD:
return LowerLOAD(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ return LowerShiftRightParts(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
@@ -1495,7 +1632,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
const Function *F = MF.getFunction();
const AttributeSet &PAL = F->getAttributes();
- const TargetLowering *TLI = nvTM->getTargetLowering();
+ const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
SDValue Root = DAG.getRoot();
std::vector<SDValue> OutChains;
@@ -1549,8 +1686,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
assert(vtparts.size() > 0 && "empty aggregate type not expected");
for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
++parti) {
- EVT partVT = vtparts[parti];
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT));
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
++InsIdx;
}
if (vtparts.size() > 0)
@@ -1866,7 +2002,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Offset = 0;
EVT VecVT =
- EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize);
+ EVT::getVectorVT(F->getContext(), EltVT, VecSize);
unsigned PerStoreOffset =
TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
@@ -1925,12 +2061,10 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
} else {
SmallVector<EVT, 16> ValVTs;
- // const_cast is necessary since we are still using an LLVM version from
- // before the type system re-write.
- ComputePTXValueVTs(*this, RetTy, ValVTs);
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
- unsigned SizeSoFar = 0;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
SDValue theVal = OutVals[i];
EVT TheValType = theVal.getValueType();
@@ -1954,16 +2088,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
else if (TmpVal.getValueType().getSizeInBits() < 16)
TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
- SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
+ SDValue Ops[] = {
+ Chain,
+ DAG.getConstant(Offsets[i], MVT::i32),
+ TmpVal };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
DAG.getVTList(MVT::Other), Ops,
TheStoreType,
MachinePointerInfo());
- if(TheValType.isVector())
- SizeSoFar +=
- TheStoreType.getVectorElementType().getStoreSizeInBits() / 8;
- else
- SizeSoFar += TheStoreType.getStoreSizeInBits()/8;
}
}
}
@@ -2220,22 +2352,62 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
+ case Intrinsic::nvvm_ldu_global_p: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
Info.memVT = getValueType(I.getType());
- else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ Info.memVT = getPointerTy();
+ else
Info.memVT = getValueType(I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
+ return true;
+ }
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p: {
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+ Info.memVT = getValueType(I.getType());
+ else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
+ Info.memVT = getPointerTy();
else
- Info.memVT = MVT::f32;
+ Info.memVT = getValueType(I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.vol = 0;
Info.readMem = true;
Info.writeMem = false;
- Info.align = 0;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
return true;
+ }
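
Both intrinsic cases above now require the call site to carry an "align" metadata node and assert if it is missing. A sketch of how a producer could attach that metadata, using the metadata-as-Value API of this LLVM version; attachAlign is an invented helper and CI is assumed to be the relevant ldg/ldu call:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Attach !align <A> to an ldg/ldu intrinsic call so that
    // getTgtMemIntrinsic() can recover the pointer alignment.
    static void attachAlign(CallInst *CI, uint64_t Align) {
      LLVMContext &Ctx = CI->getContext();
      Value *AlignVal = ConstantInt::get(Type::getInt32Ty(Ctx), Align);
      CI->setMetadata("align", MDNode::get(Ctx, AlignVal));
    }
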
case Intrinsic::nvvm_tex_1d_v4f32_i32:
case Intrinsic::nvvm_tex_1d_v4f32_f32:
@@ -2427,6 +2599,7 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
switch (Constraint[0]) {
default:
break;
+ case 'b':
case 'r':
case 'h':
case 'c':
@@ -2446,6 +2619,8 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ case 'b':
+ return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
case 'c':
return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
case 'h':
@@ -2469,6 +2644,406 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
return 4;
}
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+extern unsigned FMAContractLevel;
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1. This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SelectionDAG &DAG = DCI.DAG;
+ // Skip the vector (non-scalar) case
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ // fold (add (mul a, b), c) -> (mad a, b, c)
+ //
+ if (N0.getOpcode() == ISD::MUL) {
+ assert(VT.isInteger());
+ // For integer:
+ // Since integer multiply-add costs the same as integer multiply
+ // but is more costly than integer add, do the fusion only when
+ // the mul is only used in the add.
+ if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
+ !N0.getNode()->hasOneUse())
+ return SDValue();
+
+ // Do the folding
+ return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ else if (N0.getOpcode() == ISD::FMUL) {
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ if (FMAContractLevel == 0)
+ return SDValue();
+
+ // For floating point:
+ // Do the fusion only when the mul has fewer than 5 uses, all of
+ // which are adds.
+ // The heuristic is that if a use is not an add, that use cannot be
+ // fused into an fma, so the mul is still needed anyway.
+ // If there are more than 4 uses, even if they are all adds, fusing
+ // them will increase register pressure.
+ //
+ int numUses = 0;
+ int nonAddCount = 0;
+ for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+ UE = N0.getNode()->use_end();
+ UI != UE; ++UI) {
+ numUses++;
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::FADD)
+ ++nonAddCount;
+ }
+ if (numUses >= 5)
+ return SDValue();
+ if (nonAddCount) {
+ int orderNo = N->getIROrder();
+ int orderNo2 = N0.getNode()->getIROrder();
+ // Simple heuristic for estimating potential register pressure: the
+ // IR-order difference approximates the distance between def and use,
+ // and a longer distance is more likely to cause register pressure.
+ if (orderNo - orderNo2 < 500)
+ return SDValue();
+
+ // Now, check if at least one of the FMUL's operands is live beyond
+ // node N, which guarantees that the FMA will not increase register
+ // pressure at node N.
+ bool opIsLive = false;
+ const SDNode *left = N0.getOperand(0).getNode();
+ const SDNode *right = N0.getOperand(1).getNode();
+
+ if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
+ opIsLive = true;
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ }
+
+ return SDValue();
+}
+
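
In source terms, the combine above targets expressions like these; the PTX mnemonics in the comments are the expected selections, subject to the checks in the code (single-use mul for the integer case, FMAContractLevel and the register-pressure heuristic for the float case):

    // Integer: (add (mul a, b), c) -> NVPTXISD::IMAD, i.e. mad.lo.s32.
    // Only fires for i32 when the mul feeds nothing but this add.
    int imad(int a, int b, int c) { return a * b + c; }

    // Float: (fadd (fmul a, b), c) -> ISD::FMA, i.e. fma.rn.f32.
    float ffma(float a, float b, float c) { return a * b + c; }
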
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // First try with the default operand order.
+ SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
+ OptLevel);
+ if (Result.getNode())
+ return Result;
+
+ // If that didn't work, try again with the operands commuted.
+ return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // The type legalizer turns a vector load of i8 values into a zextload to i16
+ // registers, optionally ANY_EXTENDs it (if target type is integer),
+ // and ANDs off the high 8 bits. Since we turn this load into a
+ // target-specific DAG node, the DAG combiner fails to eliminate these AND
+ // nodes. Do that here.
+ SDValue Val = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ if (isa<ConstantSDNode>(Val)) {
+ std::swap(Val, Mask);
+ }
+
+ SDValue AExt;
+ // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ AExt = Val;
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+ Val->getOpcode() == NVPTXISD::LoadV4) {
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+ if (!MaskCnst) {
+ // Not an AND with a constant
+ return SDValue();
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ if (MaskVal != 0xff) {
+ // Not an AND that chops off top 8 bits
+ return SDValue();
+ }
+
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+ if (!Mem) {
+ // Not a MemSDNode?!?
+ return SDValue();
+ }
+
+ EVT MemVT = Mem->getMemoryVT();
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+ // We only handle the i8 case
+ return SDValue();
+ }
+
+ unsigned ExtType =
+ cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+ getZExtValue();
+ if (ExtType == ISD::SEXTLOAD) {
+ // If for some reason the load is a sextload, the and is needed to zero
+ // out the high 8 bits
+ return SDValue();
+ }
+
+ bool AddTo = false;
+ if (AExt.getNode() != nullptr) {
+ // Re-insert the ext as a zext.
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+ AExt.getValueType(), Val);
+ AddTo = true;
+ }
+
+ // If we get here, the AND is unnecessary. Just replace it with the load
+ DCI.CombineTo(N, Val, AddTo);
+ }
+
+ return SDValue();
+}
+
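
A scalar picture of why the AND can be dropped: an i8 element produced by the target LoadV2/LoadV4 nodes is zero-extended into its i16 register, so a trailing "and reg, 0xff" never clears anything. Illustration only:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t mem = 0xAB;
      uint16_t reg = mem;           // zextload i8 -> i16: bits 15..8 are 0
      assert((reg & 0xFFu) == reg); // the AND is a no-op and can be removed
      return 0;
    }
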
+enum OperandSignedness {
+ Signed = 0,
+ Unsigned,
+ Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+ unsigned OptSize,
+ OperandSignedness &S) {
+ S = Unknown;
+
+ if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+ Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Signed;
+ return true;
+ }
+ } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Unsigned;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+ unsigned OptSize,
+ bool &IsSigned) {
+
+ OperandSignedness LHSSign;
+
+ // The LHS operand must be a demotable op
+ if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+ return false;
+
+ // We should have been able to determine the signedness from the LHS
+ if (LHSSign == Unknown)
+ return false;
+
+ IsSigned = (LHSSign == Signed);
+
+ // The RHS can be a demotable op or a constant
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+ APInt Val = CI->getAPIntValue();
+ if (LHSSign == Unsigned) {
+ if (Val.isIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ } else {
+ if (Val.isSignedIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ }
+ } else {
+ OperandSignedness RHSSign;
+ if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+ return false;
+
+ if (LHSSign != RHSSign)
+ return false;
+
+ return true;
+ }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT MulType = N->getValueType(0);
+ if (MulType != MVT::i32 && MulType != MVT::i64) {
+ return SDValue();
+ }
+
+ unsigned OptSize = MulType.getSizeInBits() >> 1;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Canonicalize the multiply so the constant (if any) is on the right
+ if (N->getOpcode() == ISD::MUL) {
+ if (isa<ConstantSDNode>(LHS)) {
+ std::swap(LHS, RHS);
+ }
+ }
+
+ // If we have a SHL, determine the actual multiply amount
+ if (N->getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShlRHS) {
+ return SDValue();
+ }
+
+ APInt ShiftAmt = ShlRHS->getAPIntValue();
+ unsigned BitWidth = MulType.getSizeInBits();
+ if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+ APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, MulType);
+ } else {
+ return SDValue();
+ }
+ }
+
+ bool Signed;
+ // Verify that our operands are demotable
+ if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+ return SDValue();
+ }
+
+ EVT DemotedVT;
+ if (MulType == MVT::i32) {
+ DemotedVT = MVT::i16;
+ } else {
+ DemotedVT = MVT::i32;
+ }
+
+ // Truncate the operands to the correct size. Note that these are just for
+ // type consistency and will (likely) be eliminated in later phases.
+ SDValue TruncLHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
+ SDValue TruncRHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
+
+ unsigned Opc;
+ if (Signed) {
+ Opc = NVPTXISD::MUL_WIDE_SIGNED;
+ } else {
+ Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+ }
+
+ return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
+}
+
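
The mul.wide combine in source terms: a full-width multiply whose operands are provably narrow can use the half-width multiplier. A sketch; the PTX mnemonics in the comments are the expected selections, and the shl case relies on the rewrite above that turns a constant shift back into a multiply:

    #include <cstdint>

    // (mul (sext a), (sext b)) at i32, demotable to 16 bits
    // -> NVPTXISD::MUL_WIDE_SIGNED, i.e. mul.wide.s16.
    int32_t wide_s16(int16_t a, int16_t b) { return (int32_t)a * b; }

    // (mul (zext a), (zext b)) at i64 -> mul.wide.u32.
    uint64_t wide_u32(uint32_t a, uint32_t b) { return (uint64_t)a * b; }

    // (shl (sext a), 4) is treated as a multiply by 16 (a signed 16-bit
    // constant), so it can also become mul.wide.s16.
    int32_t wide_shl(int16_t a) { return (int32_t)a << 4; }
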
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // FIXME: Get this from the DAG somehow
+ CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::FADD:
+ return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ }
+ return SDValue();
+}
+
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 7bad8a2..7b4026d 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -16,7 +16,6 @@
#define NVPTXISELLOWERING_H
#include "NVPTX.h"
-#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
@@ -50,6 +49,11 @@ enum NodeType {
CallSeqBegin,
CallSeqEnd,
CallPrototype,
+ FUN_SHFL_CLAMP,
+ FUN_SHFR_CLAMP,
+ MUL_WIDE_SIGNED,
+ MUL_WIDE_UNSIGNED,
+ IMAD,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -167,6 +171,8 @@ enum NodeType {
};
}
+class NVPTXSubtarget;
+
//===--------------------------------------------------------------------===//
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
@@ -196,9 +202,9 @@ public:
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned getFunctionAlignment(const Function *F) const;
- EVT getSetCCResultType(LLVMContext &, EVT VT) const override {
+ EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
if (VT.isVector())
- return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
return MVT::i1;
}
@@ -236,7 +242,8 @@ public:
// PTX always uses 32-bit shift amounts
MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
- bool shouldSplitVectorType(EVT VT) const override;
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
private:
const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
@@ -255,8 +262,12 @@ private:
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
Type *Ty, unsigned Idx) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 397f4bc..a98fb37 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -146,7 +146,7 @@ bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
// We implement "poor man's DCE" here to make sure any code that is no longer
// live is actually unreachable and can be trivially eliminated by the
- // unreachable block elimiation pass.
+ // unreachable block elimination pass.
for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
UI != UE; ++UI) {
if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index cdc8088..b5b4fbe 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -29,8 +29,8 @@ using namespace llvm;
void NVPTXInstrInfo::anchor() {}
// FIXME: Add the subtarget support on this constructor.
-NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
- : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {}
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI)
+ : NVPTXGenInstrInfo(), RegInfo(STI) {}
void NVPTXInstrInfo::copyPhysReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 88a9e45..2ac2974 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -24,11 +24,10 @@
namespace llvm {
class NVPTXInstrInfo : public NVPTXGenInstrInfo {
- NVPTXTargetMachine &TM;
const NVPTXRegisterInfo RegInfo;
virtual void anchor();
public:
- explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
+ explicit NVPTXInstrInfo(NVPTXSubtarget &STI);
const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index fbcd0e4..d2c0373 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -158,9 +158,12 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
def true : Predicate<"1">;
+def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">;
+
//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
@@ -461,33 +464,45 @@ def SHL2MUL16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
}]>;
-def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDES64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
"mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, i64imm:$b),
+def MULWIDES64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
"mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b),
+def MULWIDEU64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
"mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
- (ins Int32Regs:$a, i64imm:$b),
+def MULWIDEU64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
"mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
+def MULWIDES32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
"mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, i32imm:$b),
+def MULWIDES32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
"mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$a, i32imm:$b),
+def MULWIDEU32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
"mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
(MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
@@ -507,25 +522,63 @@ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
(MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>,
+ (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>,
+ (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>,
+ (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>,
+ (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+
+def SDTMulWide
+ : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed
+ : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned
+ : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
Requires<[doMulWide]>;
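These target-specific nodes let the C++ lowering code request a wide multiply directly, rather than hoping a (mul (sext ...)) tree survives until pattern matching. A minimal sketch of how such a node could be created — the helper name is an assumption; only the NVPTXISD opcode and the SelectionDAG call come from this tree:

    // Hypothetical lowering fragment: build the i64-producing node that
    // the MULWIDES64 pattern above will select.
    static SDValue buildWideMul(SelectionDAG &DAG, SDLoc DL, SDValue A,
                                SDValue B) {
      // A and B are i32 operands; the node yields their full 64-bit product.
      return DAG.getNode(NVPTXISD::MUL_WIDE_SIGNED, DL, MVT::i64, A, B);
    }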
defm MULT : I3<"mul.lo.s", mul>;
@@ -541,69 +594,75 @@ defm SREM : I3<"rem.s", srem>;
defm UREM : I3<"rem.u", urem>;
// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+def SDTIMAD
+ : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+ SDTCisInt<2>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def imad
+ : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add
- (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, i16imm:$b, i16imm:$c),
"mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b),
- imm:$c))]>;
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, imm:$c))]>;
def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, i32imm:$b, i32imm:$c),
"mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (add
- (mul Int32Regs:$a, imm:$b), imm:$c))]>;
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, imm:$c))]>;
def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>;
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
(ins Int64Regs:$a, i64imm:$b, i64imm:$c),
"mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (add
- (mul Int64Regs:$a, imm:$b), imm:$c))]>;
-
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, imm:$c))]>;
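The MAD patterns above now match a single target node instead of an (add (mul a, b), c) tree, which keeps the mul-add contraction decision in one place. A sketch of the kind of DAG combine that would form the node — the function name and structure are assumptions, not part of this diff:

    // Hypothetical combine: fold add(mul(a, b), c) into NVPTXISD::IMAD.
    static SDValue combineAddToIMAD(SDNode *N, SelectionDAG &DAG) {
      SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
      if (N0.getOpcode() != ISD::MUL)
        std::swap(N0, N1);                  // add is commutative
      if (N0.getOpcode() != ISD::MUL)
        return SDValue();                   // no multiply feeding this add
      return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), N->getValueType(0),
                         N0.getOperand(0), N0.getOperand(1), N1);
    }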
def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"neg.s16 \t$dst, $src;",
@@ -809,36 +868,26 @@ multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
def rrr : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, Float32Regs:$b),
- Float32Regs:$c))]>, Requires<[Pred]>;
- // This is to WAR a weird bug in Tablegen that does not automatically
- // generate the following permutated rule rrr2 from the above rrr.
- // So we explicitly add it here. This happens to FMA32 only.
- // See the comments at FMAD32 and FMA32 for more information.
- def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd Float32Regs:$c,
- (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
Requires<[Pred]>;
def rri : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
Requires<[Pred]>;
def rir : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
Requires<[Pred]>;
def rii : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b, f32imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float32Regs:$dst, (fadd
- (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
Requires<[Pred]>;
}
@@ -846,73 +895,32 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
def rrr : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, Float64Regs:$b),
- Float64Regs:$c))]>, Requires<[Pred]>;
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
+ Requires<[Pred]>;
def rri : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
- Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
def rir : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
Requires<[Pred]>;
def rii : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b, f64imm:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set Float64Regs:$dst, (fadd
- (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
Requires<[Pred]>;
}
-// Due to a unknown reason (most likely a bug in tablegen), tablegen does not
-// automatically generate the rrr2 rule from
-// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
-// If we reverse the order of the following two lines, then rrr2 rule will be
-// generated for FMA32, but not for rrr.
-// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
-defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
-
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
- (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
- Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c
-// b*c-a => fmad(b, c, -a)
-// - legal because b*c-a <=> b*c+(-a)
-multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)),
- (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>,
- Requires<[Pred]>;
- def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
- (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
- Requires<[Pred]>;
-}
-
-// a-b*c => fmad(-b,c, a)
-// b*c-a => fmad(b, c, -a)
-multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
- def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)),
- (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>,
- Requires<[Pred]>;
-
- def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a),
- (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>,
- Requires<[Pred]>;
-}
-
-defm FMAF32ext_ftz : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
-defm FMAF32ext : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
-defm FMAF64ext : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
+defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
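Matching the target-independent fma node carries single-rounding semantics, so the old fadd(fmul) patterns — and the rrr2/SUB_PAT workarounds — can go away; whether to contract is decided before instruction selection. A small host-side illustration of why fused and separate evaluation differ:

    #include <cmath>
    #include <cstdio>

    int main() {
      float a = 1e8f, b = 1e-8f, c = -1.0f;
      // Two roundings: a*b rounds to exactly 1.0f, so the sum is 0.
      float separate = a * b + c;
      // One rounding, as in fma.rn.f32: a small nonzero residual survives.
      float fused = std::fma(a, b, c);
      std::printf("separate=%g fused=%g\n", separate, fused);
      return 0;
    }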
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
@@ -1083,6 +1091,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+//
+// Rotate: use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
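The shf.l/shf.r forms encode a rotate as a funnel shift whose two source operands are the same register. The equivalence, in plain C++:

    #include <cstdint>

    // shf.l.wrap.b32 d, x, x, n: a left funnel shift of {x, x} selects the
    // same bits as a 32-bit left rotate; .wrap takes n modulo 32.
    uint32_t rotl32(uint32_t X, uint32_t N) {
      N &= 31;
      return N == 0 ? X : (X << N) | (X >> (32 - N));
    }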
+
+//
+// Rotate: if the ptx shf instruction is not available, use shift+add.
+//
// 32bit
def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
@@ -1100,9 +1145,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{
}]>;
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
Int32Regs:$amt),
@@ -1115,7 +1162,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
!strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
!strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
Int32Regs:$amt),
@@ -1128,7 +1176,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
!strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
!strconcat("}}", ""))))))))),
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
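The software expansion shifts left by the amount, shifts right by its 32-complement (SUB_FRM_32 evidently supplies 32 - amt for the immediate forms), and adds; the two shifted values occupy disjoint bit ranges, so add and or coincide. In C++ terms, for 0 < Amt < 32:

    #include <cstdint>

    // What ROT32imm_sw computes: shl + shr + add, valid for 0 < Amt < 32.
    uint32_t rotl32_sw(uint32_t X, unsigned Amt) {
      return (X << Amt) + (X >> (32 - Amt)); // disjoint bits: '+' acts as '|'
    }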
// 64bit
def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
@@ -1177,6 +1226,29 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
!strconcat("}}", ""))))))))),
[(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+// BFE - bit-field extract
+
+multiclass BFE<string TyStr, RegisterClass RC> {
+ // BFE supports both 32-bit and 64-bit values, but the start and length
+ // operands are always 32-bit
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+defm BFE_S32 : BFE<"s32", Int32Regs>;
+defm BFE_U32 : BFE<"u32", Int32Regs>;
+defm BFE_S64 : BFE<"s64", Int64Regs>;
+defm BFE_U64 : BFE<"u64", Int64Regs>;
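No patterns are attached, so these are presumably selected manually. For reference, the unsigned extract for in-range operands behaves like the sketch below (the .s32/.s64 forms sign-extend instead, and the hardware clamps out-of-range positions and lengths):

    #include <cstdint>

    // Reference semantics of bfe.u32 d, a, b, c, assuming b + c <= 32:
    // take c bits of a starting at bit b, zero-filling the rest.
    uint32_t bfe_u32(uint32_t A, uint32_t B, uint32_t C) {
      if (C == 0)
        return 0;
      return (A >> B) & (0xffffffffu >> (32 - C));
    }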
//-----------------------------------
// General Comparison
@@ -1292,6 +1364,32 @@ def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
(ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
(ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+//
+// Funnel shift in clamp mode
+//
+// - SDNodes are created so they can be used in the DAG code,
+// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+//
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
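For the shift-by-parts lowering the comment mentions, the clamped funnel shift produces the high word of a 64-bit left shift in one instruction when the amount is below 32. A sketch with illustrative names (see NVPTXISelLowering for the real code, which also has to handle Amt >= 32):

    // Hypothetical fragment: 64-bit shl split across two i32 registers.
    static SDValue lowerShlParts(SelectionDAG &DAG, SDLoc DL, SDValue Lo,
                                 SDValue Hi, SDValue Amt) {
      SDValue NewHi =
          DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, DL, MVT::i32, Lo, Hi, Amt);
      SDValue NewLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Amt);
      return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, NewLo, NewHi);
    }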
//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5e228fc..0ad3dfa 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1057,12 +1057,24 @@ def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_max_32 node:$a, node:$b)>;
def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1072,6 +1084,14 @@ defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
+ atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
@@ -1080,6 +1100,14 @@ defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
+ atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_min
@@ -1089,12 +1117,24 @@ def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_min_32 node:$a, node:$b)>;
def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1104,6 +1144,14 @@ defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
+ atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
@@ -1112,6 +1160,14 @@ defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
+ atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_inc atom_dec
@@ -1153,6 +1209,12 @@ def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_and_32 node:$a, node:$b)>;
def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1162,6 +1224,14 @@ defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
+ atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
+ atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
+ atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_or
@@ -1171,6 +1241,12 @@ def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_or_32 node:$a, node:$b)>;
def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1180,6 +1256,14 @@ defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
+ atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
+ atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
+ atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
// atom_xor
@@ -1189,6 +1273,12 @@ def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
(atomic_load_xor_32 node:$a, node:$b)>;
def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
(atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1198,6 +1288,14 @@ defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
+ atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
+ atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
+ atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
// atom_cas
@@ -1276,67 +1374,33 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
// Support for ldu on sm_20 or later
//-----------------------------------
-def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{
- MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
- return M->getMemoryVT() == MVT::i8;
-}]>;
-
// Scalar
-// @TODO: Revisit this, Changed imemAny to imem
-multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
+ []>, Requires<[hasLDU]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
+ []>, Requires<[hasLDU]>;
}
-multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
- def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
- def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDU]>;
- def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
- def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
-}
-
-defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs,
- ldu_i8>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
-int_nvvm_ldu_global_i>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs,
-int_nvvm_ldu_global_f>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs,
-int_nvvm_ldu_global_f>;
-defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
-int_nvvm_ldu_global_p>;
-defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
-int_nvvm_ldu_global_p>;
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
// vector
@@ -1406,65 +1470,40 @@ defm INT_PTX_LDU_G_v4f32_ELE
// Support for ldg on sm_35 or later
//-----------------------------------
-def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{
- MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
- return M->getMemoryVT() == MVT::i8;
-}]>;
-
-multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
+ []>, Requires<[hasLDG]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
!strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
-}
-
-multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
- def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
- def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
- def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
- Requires<[hasLDG]>;
- def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
- def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr),
- [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
+ []>, Requires<[hasLDG]>;
}
defm INT_PTX_LDG_GLOBAL_i8
- : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>;
+ : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i16
- : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDG_GLOBAL_i32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>;
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
defm INT_PTX_LDG_GLOBAL_f32
- : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>;
+ : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64
- : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>;
+ : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDG_GLOBAL_p32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>;
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_p64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>;
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
// vector
@@ -1689,6 +1728,207 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
[(int_nvvm_compiler_error Int64Regs:$a)]>;
+// isspacep
+
+def ISSPACEP_CONST_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+ (ins SpecialRegs:$r),
+ "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
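Each pattern pins one envreg intrinsic to a mov from the corresponding new register. A front end would reach them like this (a sketch; error handling omitted):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    // Emit a read of %envreg0; the MOV_SPECIAL pattern above selects it.
    llvm::Value *readEnvReg0(llvm::Module &M, llvm::IRBuilder<> &B) {
      llvm::Function *F = llvm::Intrinsic::getDeclaration(
          &M, llvm::Intrinsic::nvvm_read_ptx_sreg_envreg0);
      return B.CreateCall(F);
    }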
+
+
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+ (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]> ;
+
+def GET_LO_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def GET_HI_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+ (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src))> ;
+
+// funnel shift, requires >= sm_32
+def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
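For amounts below 32, the hardware path above builds each half of the rotated value with one wrapped funnel shift of the two original halves; packing the results reassembles the 64-bit rotate. In C++ terms:

    #include <cstdint>

    // What the PACK_TWO_INT32/SHF_L_WRAP_B32 combination computes for
    // 0 < N < 32 (N is taken modulo 32 by .wrap).
    uint64_t rotl64_via_shf(uint32_t Lo, uint32_t Hi, unsigned N) {
      uint32_t NewLo = (Lo << N) | (Hi >> (32 - N)); // shf.l.wrap(hi, lo, n)
      uint32_t NewHi = (Hi << N) | (Lo >> (32 - N)); // shf.l.wrap(lo, hi, n)
      return ((uint64_t)NewHi << 32) | NewLo;
    }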
+
+
//-----------------------------------
// Texture Intrinsics
//-----------------------------------
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 0ee018c..5547649 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -66,7 +66,7 @@ public:
const MCAsmLayout *Layout) const override {
return false;
}
- void AddValueSymbols(MCAssembler *) const override {};
+ void visitUsedExpr(MCStreamer &Streamer) const override {};
const MCSection *FindAssociatedSection() const override {
return nullptr;
}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 7a38a66..3482248 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -46,6 +46,10 @@ foreach i = 0-4 in {
def da#i : NVPTXReg<"%da"#i>;
}
+foreach i = 0-31 in {
+ def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
@@ -61,4 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+ (sequence "ENVREG%u", 0, 31))>;
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 8c7df52..d5cded2 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -25,10 +25,41 @@ using namespace llvm;
// Pin the vtable to this file.
void NVPTXSubtarget::anchor() {}
+static std::string computeDataLayout(bool is64Bit) {
+ std::string Ret = "e";
+
+ if (!is64Bit)
+ Ret += "-p:32:32";
+
+ Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+ return Ret;
+}
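Concretely, the helper can only produce two strings; the 64-bit variant has no pointer entry and so takes the 64-bit default. If the (file-static) function were visible to a test, this would hold:

    #include <cassert>

    int main() {
      assert(computeDataLayout(false) ==
             "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64");
      assert(computeDataLayout(true) == "e-i64:64-v16:16-v32:32-n16:32:64");
      return 0;
    }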
+
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ // Provide the default CPU if we don't have one.
+ if (CPU.empty() && FS.size())
+ llvm_unreachable("we are not using FeatureStr");
+ TargetName = CPU.empty() ? "sm_20" : CPU;
+
+ ParseSubtargetFeatures(TargetName, FS);
+
+ // Set default to PTX 3.2 (CUDA 5.5)
+ if (PTXVersion == 0) {
+ PTXVersion = 32;
+ }
+
+ return *this;
+}
+
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit)
+ const std::string &FS, const TargetMachine &TM,
+ bool is64Bit)
: NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
- SmVersion(20) {
+ SmVersion(20), DL(computeDataLayout(is64Bit)),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) {
Triple T(TT);
@@ -36,26 +67,4 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
drvInterface = NVPTX::NVCL;
else
drvInterface = NVPTX::CUDA;
-
- // Provide the default CPU if none
- std::string defCPU = "sm_20";
-
- ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
-
- // Get the TargetName from the FS if available
- if (FS.empty() && CPU.empty())
- TargetName = defCPU;
- else if (!CPU.empty())
- TargetName = CPU;
- else
- llvm_unreachable("we are not using FeatureStr");
-
- // We default to PTX 3.1, but we cannot just default to it in the initializer
- // since the attribute parser checks if the given option is >= the default.
- // So if we set ptx31 as the default, the ptx30 attribute would never match.
- // Instead, we use 0 as the default and manually set 31 if the default is
- // used.
- if (PTXVersion == 0) {
- PTXVersion = 31;
- }
}
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 581e5ed..3ed5747 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -15,6 +15,12 @@
#define NVPTXSUBTARGET_H
#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -35,12 +41,30 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
unsigned int SmVersion;
+ const DataLayout DL; // Calculates type size & alignment
+ NVPTXInstrInfo InstrInfo;
+ NVPTXTargetLowering TLInfo;
+ TargetSelectionDAGInfo TSInfo;
+
+ // NVPTX does not have any call stack frame, but it needs an NVPTX-specific
+ // FrameLowering class because TargetFrameLowering is abstract.
+ NVPTXFrameLowering FrameLowering;
+
public:
/// This constructor initializes the data members to match that
/// of the specified module.
///
NVPTXSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit);
+ const std::string &FS, const TargetMachine &TM, bool is64Bit);
+
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const NVPTXRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
bool hasBrkPt() const { return SmVersion >= 11; }
bool hasAtomRedG32() const { return SmVersion >= 11; }
@@ -57,10 +81,12 @@ public:
bool hasFMAF32() const { return SmVersion >= 20; }
bool hasFMAF64() const { return SmVersion >= 13; }
bool hasLDG() const { return SmVersion >= 32; }
- bool hasLDU() const { return SmVersion >= 20; }
+ bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
bool hasGenericLdSt() const { return SmVersion >= 20; }
- inline bool hasHWROT32() const { return false; }
- inline bool hasSWROT32() const { return true; }
+ inline bool hasHWROT32() const { return SmVersion >= 32; }
+ inline bool hasSWROT32() const {
+ return ((SmVersion >= 20) && (SmVersion < 32));
+ }
inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
inline bool hasROT64() const { return SmVersion >= 20; }
@@ -76,6 +102,7 @@ public:
unsigned getPTXVersion() const { return PTXVersion; }
+ NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
};
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 26a4f84..069a1b9 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -66,26 +66,13 @@ extern "C" void LLVMInitializeNVPTXTarget() {
*PassRegistry::getPassRegistry());
}
-static std::string computeDataLayout(const NVPTXSubtarget &ST) {
- std::string Ret = "e";
-
- if (!ST.is64Bit())
- Ret += "-p:32:32";
-
- Ret += "-i64:64-v16:16-v32:32-n16:32:64";
-
- return Ret;
-}
-
-NVPTXTargetMachine::NVPTXTargetMachine(
- const Target &T, StringRef TT, StringRef CPU, StringRef FS,
- const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool is64bit)
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64bit), DL(computeDataLayout(Subtarget)),
- InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(
- *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
+ Subtarget(TT, CPU, FS, *this, is64bit) {
initAsmInfo();
}
@@ -119,6 +106,7 @@ public:
bool addInstSelector() override;
bool addPreRegAlloc() override;
bool addPostRegAlloc() override;
+ void addMachineSSAOptimization() override;
FunctionPass *createTargetRegisterAllocator(bool) override;
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -220,3 +208,43 @@ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
printAndVerify("After StackSlotColoring");
}
+
+void NVPTXPassConfig::addMachineSSAOptimization() {
+ // Pre-ra tail duplication.
+ if (addPass(&EarlyTailDuplicateID))
+ printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+ printAndVerify("After codegen DCE pass");
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ if (addILPOpts())
+ printAndVerify("After ILP optimizations");
+
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+
+ addPass(&MachineSinkingID);
+ printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+ addPass(&PeepholeOptimizerID);
+ printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 2db7c18..a7a1c8f 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -14,13 +14,8 @@
#ifndef NVPTX_TARGETMACHINE_H
#define NVPTX_TARGETMACHINE_H
-#include "ManagedStringPool.h"
-#include "NVPTXFrameLowering.h"
-#include "NVPTXISelLowering.h"
-#include "NVPTXInstrInfo.h"
-#include "NVPTXRegisterInfo.h"
#include "NVPTXSubtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "ManagedStringPool.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
@@ -31,50 +26,37 @@ namespace llvm {
///
class NVPTXTargetMachine : public LLVMTargetMachine {
NVPTXSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- NVPTXInstrInfo InstrInfo;
- NVPTXTargetLowering TLInfo;
- TargetSelectionDAGInfo TSInfo;
-
- // NVPTX does not have any call stack frame, but need a NVPTX specific
- // FrameLowering class because TargetFrameLowering is abstract.
- NVPTXFrameLowering FrameLowering;
// Hold Strings that can be free'd all together with NVPTXTargetMachine
ManagedStringPool ManagedStrPool;
- //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
- // bool DisableVerify, MCContext *&OutCtx);
-
public:
NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
const TargetOptions &Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const NVPTXInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
- const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const DataLayout *getDataLayout() const override { return &DL; }
const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-
const NVPTXRegisterInfo *getRegisterInfo() const override {
- return &(InstrInfo.getRegisterInfo());
+ return getSubtargetImpl()->getRegisterInfo();
}
- NVPTXTargetLowering *getTargetLowering() const override {
- return const_cast<NVPTXTargetLowering *>(&TLInfo);
+ const NVPTXTargetLowering *getTargetLowering() const override {
+ return getSubtargetImpl()->getTargetLowering();
}
const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
- //virtual bool addInstSelector(PassManagerBase &PM,
- // CodeGenOpt::Level OptLevel);
-
- //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level);
-
ManagedStringPool *getManagedStrPool() const {
return const_cast<ManagedStringPool *>(&ManagedStrPool);
}
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index cb8bd72..a8d6b95 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
@@ -47,17 +48,16 @@ class NVVMReflect : public ModulePass {
private:
StringMap<int> VarMap;
typedef DenseMap<std::string, int>::iterator VarMapIter;
- Function *ReflectFunction;
public:
static char ID;
- NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) {
+ NVVMReflect() : ModulePass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
VarMap.clear();
}
NVVMReflect(const StringMap<int> &Mapping)
- : ModulePass(ID), ReflectFunction(nullptr) {
+ : ModulePass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
I != E; ++I) {
@@ -70,6 +70,8 @@ public:
}
bool runOnModule(Module &) override;
+private:
+ bool handleFunction(Function *ReflectFunction);
void setVarMap();
};
}
@@ -120,19 +122,7 @@ void NVVMReflect::setVarMap() {
}
}
-bool NVVMReflect::runOnModule(Module &M) {
- if (!NVVMReflectEnabled)
- return false;
-
- setVarMap();
-
- ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
-
- // If reflect function is not used, then there will be
- // no entry in the module.
- if (!ReflectFunction)
- return false;
-
+bool NVVMReflect::handleFunction(Function *ReflectFunction) {
// Validate _reflect function
assert(ReflectFunction->isDeclaration() &&
"_reflect function should not have a body");
@@ -155,13 +145,15 @@ bool NVVMReflect::runOnModule(Module &M) {
"Only one operand expect for _reflect function");
// In CUDA, we will have an extra constant-to-generic conversion of
// the string.
- const Value *conv = Reflect->getArgOperand(0);
- assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion");
- const CallInst *ConvCall = cast<CallInst>(conv);
- const Value *str = ConvCall->getArgOperand(0);
- assert(isa<ConstantExpr>(str) &&
+ const Value *Str = Reflect->getArgOperand(0);
+ if (isa<CallInst>(Str)) {
+ // CUDA path
+ const CallInst *ConvCall = cast<CallInst>(Str);
+ Str = ConvCall->getArgOperand(0);
+ }
+ assert(isa<ConstantExpr>(Str) &&
"Format of _reflect function not recognized");
- const ConstantExpr *GEP = cast<ConstantExpr>(str);
+ const ConstantExpr *GEP = cast<ConstantExpr>(Str);
const Value *Sym = GEP->getOperand(0);
assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
@@ -195,3 +187,36 @@ bool NVVMReflect::runOnModule(Module &M) {
ToRemove[i]->eraseFromParent();
return true;
}
+
+bool NVVMReflect::runOnModule(Module &M) {
+ if (!NVVMReflectEnabled)
+ return false;
+
+ setVarMap();
+
+ bool Res = false;
+ std::string Name;
+ Type *Tys[1];
+ Type *I8Ty = Type::getInt8Ty(M.getContext());
+ Function *ReflectFunction;
+
+ // Check for standard overloaded versions of llvm.nvvm.reflect
+
+ for (unsigned i = 0; i != 5; ++i) {
+ Tys[0] = PointerType::get(I8Ty, i);
+ Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
+ ReflectFunction = M.getFunction(Name);
+    if (ReflectFunction) {
+ Res |= handleFunction(ReflectFunction);
+ }
+ }
+
+ ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
+  // If the reflect function is not used, then there will be
+  // no entry in the module.
+  if (ReflectFunction)
+    Res |= handleFunction(ReflectFunction);
+
+ return Res;
+}
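
runOnModule now probes every overloaded variant of llvm.nvvm.reflect, one i8 pointer per address space 0 through 4, before falling back to the legacy NVVM_REFLECT_FUNCTION entry point. A standalone sketch of the names the loop above asks the module for, assuming the usual pointer-type mangling of p<AS>i8 for this release (an assumption; Intrinsic::getName does the real work):

#include <iostream>

int main() {
  // Overloaded intrinsic names probed above, one per address space 0..4,
  // plus the legacy NVVM_REFLECT_FUNCTION fallback (name not shown here).
  for (unsigned AS = 0; AS != 5; ++AS)
    std::cout << "llvm.nvvm.reflect.p" << AS << "i8\n";
}
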
diff --git a/lib/Target/NVPTX/cl_common_defines.h b/lib/Target/NVPTX/cl_common_defines.h
index 45cc0b8..02c5a94 100644
--- a/lib/Target/NVPTX/cl_common_defines.h
+++ b/lib/Target/NVPTX/cl_common_defines.h
@@ -1,5 +1,5 @@
-#ifndef __CL_COMMON_DEFINES_H__
-#define __CL_COMMON_DEFINES_H__
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
// This file includes defines that are common to both kernel code and
// the NVPTX back-end.
@@ -119,4 +119,4 @@ typedef enum clk_sampler_type {
#define CLK_LOCAL_MEM_FENCE (1 << 0)
#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-#endif // __CL_COMMON_DEFINES_H__
+#endif // CL_COMMON_DEFINES_H
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 3ac037d..2f562ca 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -238,7 +238,7 @@ class PPCAsmParser : public MCTargetAsmParser {
bool ParseExpression(const MCExpr *&EVal);
bool ParseDarwinExpression(const MCExpr *&EVal);
- bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool ParseOperand(OperandVector &Operands);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveTC(unsigned Size, SMLoc L);
@@ -246,12 +246,11 @@ class PPCAsmParser : public MCTargetAsmParser {
bool ParseDarwinDirectiveMachine(SMLoc L);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
- void ProcessInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
/// @name Auto-generated Match Functions
/// {
@@ -276,13 +275,12 @@ public:
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
- bool ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
const MCExpr *applyModifierToExpr(const MCExpr *E,
@@ -548,8 +546,9 @@ public:
void print(raw_ostream &OS) const override;
- static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) {
- PPCOperand *Op = new PPCOperand(Token);
+ static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
+ bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -558,22 +557,27 @@ public:
return Op;
}
- static PPCOperand *CreateTokenWithStringCopy(StringRef Str, SMLoc S,
- bool IsPPC64) {
+ static std::unique_ptr<PPCOperand>
+ CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) {
// Allocate extra memory for the string and copy it.
+ // FIXME: This is incorrect, Operands are owned by unique_ptr with a default
+ // deleter which will destroy them by simply using "delete", not correctly
+ // calling operator delete on this extra memory after calling the dtor
+ // explicitly.
void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
- PPCOperand *Op = new (Mem) PPCOperand(Token);
- Op->Tok.Data = (const char *)(Op + 1);
+ std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
+ Op->Tok.Data = (const char *)(Op.get() + 1);
Op->Tok.Length = Str.size();
- std::memcpy((char *)(Op + 1), Str.data(), Str.size());
+ std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size());
Op->StartLoc = S;
Op->EndLoc = S;
Op->IsPPC64 = IsPPC64;
return Op;
}
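
The FIXME records a genuine allocator/deleter mismatch: the token operand is placement-constructed in storage from ::operator new with extra trailing bytes for the string, yet the returned unique_ptr will tear it down with plain delete. A hedged sketch of the conventional repair, a custom deleter that runs the destructor and frees the raw block (hypothetical names, not the PPC code):

#include <cstddef>
#include <memory>
#include <new>

struct Operand {};

// Deleter matching the ::operator new allocation used for the tail buffer.
struct RawStorageDelete {
  void operator()(Operand *Op) const {
    Op->~Operand();        // destroy the object explicitly
    ::operator delete(Op); // release via the matching deallocation function
  }
};

std::unique_ptr<Operand, RawStorageDelete> makeWithTail(std::size_t Extra) {
  void *Mem = ::operator new(sizeof(Operand) + Extra);
  return std::unique_ptr<Operand, RawStorageDelete>(new (Mem) Operand);
}

int main() { auto Op = makeWithTail(16); }
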
- static PPCOperand *CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
- PPCOperand *Op = new PPCOperand(Immediate);
+ static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
+ bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -581,9 +585,9 @@ public:
return Op;
}
- static PPCOperand *CreateExpr(const MCExpr *Val,
- SMLoc S, SMLoc E, bool IsPPC64) {
- PPCOperand *Op = new PPCOperand(Expression);
+ static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
+ SMLoc E, bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(Expression);
Op->Expr.Val = Val;
Op->Expr.CRVal = EvaluateCRExpr(Val);
Op->StartLoc = S;
@@ -592,9 +596,9 @@ public:
return Op;
}
- static PPCOperand *CreateTLSReg(const MCSymbolRefExpr *Sym,
- SMLoc S, SMLoc E, bool IsPPC64) {
- PPCOperand *Op = new PPCOperand(TLSRegister);
+ static std::unique_ptr<PPCOperand>
+ CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
+ auto Op = make_unique<PPCOperand>(TLSRegister);
Op->TLSReg.Sym = Sym;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -602,8 +606,8 @@ public:
return Op;
}
- static PPCOperand *CreateFromMCExpr(const MCExpr *Val,
- SMLoc S, SMLoc E, bool IsPPC64) {
+ static std::unique_ptr<PPCOperand>
+ CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
return CreateImm(CE->getValue(), S, E, IsPPC64);
@@ -634,10 +638,8 @@ void PPCOperand::print(raw_ostream &OS) const {
}
}
-
-void PPCAsmParser::
-ProcessInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+void PPCAsmParser::ProcessInstruction(MCInst &Inst,
+ const OperandVector &Operands) {
int Opcode = Inst.getOpcode();
switch (Opcode) {
case PPC::LAx: {
@@ -917,11 +919,10 @@ ProcessInstruction(MCInst &Inst,
}
}
-bool PPCAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
@@ -942,7 +943,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((PPCOperand*)Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
}
@@ -1216,12 +1217,10 @@ ParseDarwinExpression(const MCExpr *&EVal) {
/// ParseOperand
/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
/// rNN for MachO.
-bool PPCAsmParser::
-ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
const MCExpr *EVal;
- PPCOperand *Op;
// Attempt to parse the next token as an immediate
switch (getLexer().getKind()) {
@@ -1233,8 +1232,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
int64_t IntVal;
if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
Parser.Lex(); // Eat the identifier token.
- Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
- Operands.push_back(Op);
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
return false;
}
return Error(S, "invalid register name");
@@ -1249,8 +1247,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
int64_t IntVal;
if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) {
Parser.Lex(); // Eat the identifier token.
- Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
- Operands.push_back(Op);
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
return false;
}
}
@@ -1272,8 +1269,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
// Push the parsed operand into the list of operands
- Op = PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64());
- Operands.push_back(Op);
+ Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()));
// Check whether this is a TLS call expression
bool TLSCall = false;
@@ -1292,8 +1288,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
E = Parser.getTok().getLoc();
Parser.Lex(); // Eat the ')'.
- Op = PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64());
- Operands.push_back(Op);
+ Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()));
}
// Otherwise, check for D-form memory operands
@@ -1340,17 +1335,15 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
E = Parser.getTok().getLoc();
Parser.Lex(); // Eat the ')'.
- Op = PPCOperand::CreateImm(IntVal, S, E, isPPC64());
- Operands.push_back(Op);
+ Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
}
return false;
}
/// Parse an instruction mnemonic followed by its operands.
-bool PPCAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
// The first operand is the token for the instruction name.
// If the next character is a '+' or '-', we need to add it to the
// instruction name, to match what TableGen is doing.
@@ -1554,7 +1547,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() {
// Define this matcher function after the auto-generated include so we
// have the match class enum definitions.
-unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
unsigned Kind) {
// If the kind is a token for a literal immediate, check if our asm
// operand matches. This is for InstAliases which have a fixed-value
@@ -1568,8 +1561,8 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
default: return Match_InvalidOperand;
}
- PPCOperand *Op = static_cast<PPCOperand*>(AsmOp);
- if (Op->isImm() && Op->getImm() == ImmVal)
+ PPCOperand &Op = static_cast<PPCOperand &>(AsmOp);
+ if (Op.isImm() && Op.getImm() == ImmVal)
return Match_Success;
return Match_InvalidOperand;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index a4983ad..435a93f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -102,17 +102,45 @@ public:
// Output the constant in big/little endian byte order.
unsigned Size = Desc.getSize();
- if (IsLittleEndian) {
- for (unsigned i = 0; i != Size; ++i) {
- OS << (char)Bits;
- Bits >>= 8;
+ switch (Size) {
+ case 4:
+ if (IsLittleEndian) {
+ OS << (char)(Bits);
+ OS << (char)(Bits >> 8);
+ OS << (char)(Bits >> 16);
+ OS << (char)(Bits >> 24);
+ } else {
+ OS << (char)(Bits >> 24);
+ OS << (char)(Bits >> 16);
+ OS << (char)(Bits >> 8);
+ OS << (char)(Bits);
}
- } else {
- int ShiftValue = (Size * 8) - 8;
- for (unsigned i = 0; i != Size; ++i) {
- OS << (char)(Bits >> ShiftValue);
- Bits <<= 8;
+ break;
+ case 8:
+ // If we emit a pair of instructions, the first one is
+ // always in the top 32 bits, even on little-endian.
+ if (IsLittleEndian) {
+ OS << (char)(Bits >> 32);
+ OS << (char)(Bits >> 40);
+ OS << (char)(Bits >> 48);
+ OS << (char)(Bits >> 56);
+ OS << (char)(Bits);
+ OS << (char)(Bits >> 8);
+ OS << (char)(Bits >> 16);
+ OS << (char)(Bits >> 24);
+ } else {
+ OS << (char)(Bits >> 56);
+ OS << (char)(Bits >> 48);
+ OS << (char)(Bits >> 40);
+ OS << (char)(Bits >> 32);
+ OS << (char)(Bits >> 24);
+ OS << (char)(Bits >> 16);
+ OS << (char)(Bits >> 8);
+ OS << (char)(Bits);
}
+ break;
+ default:
+    llvm_unreachable("Invalid instruction size");
}
++MCNumEmitted; // Keep track of the # of mi's emitted.
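
The 8-byte case exists because some PPC pseudo-instructions expand to a pair of 4-byte words: the pair order is fixed (first instruction in the high 32 bits), so on little-endian only the bytes within each word are swapped. A standalone sketch of that ordering (not the MC streamer API):

#include <cstdint>
#include <cstdio>

// Emit one 4-byte word least-significant byte first, as the LE path does.
static void emitWordLE(uint32_t W) {
  for (int i = 0; i != 4; ++i)
    std::printf("%02x ", (W >> (8 * i)) & 0xff);
}

int main() {
  uint64_t Bits = 0x1122334455667788ULL; // first insn of the pair: 0x11223344
  emitWordLE(uint32_t(Bits >> 32));      // high word (first instruction) first
  emitWordLE(uint32_t(Bits));            // then the low word
  std::printf("\n");                     // prints: 44 33 22 11 88 77 66 55
}
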
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 10d068d..3ac0aca 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -11,6 +11,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
using namespace llvm;
@@ -127,33 +128,6 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
return true;
}
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expr!");
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbols_(BE->getLHS(), Asm);
- AddValueSymbols_(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef:
- Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
- break;
- }
-}
-
-void PPCMCExpr::AddValueSymbols(MCAssembler *Asm) const {
- AddValueSymbols_(getSubExpr(), Asm);
+void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 3421b91..bca4085 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -79,7 +79,7 @@ public:
void PrintImpl(raw_ostream &OS) const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const override;
- void AddValueSymbols(MCAssembler *) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
const MCSection *FindAssociatedSection() const override {
return getSubExpr()->FindAssociatedSection();
}
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index bd58539..a9842b2 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -46,6 +46,7 @@ def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", ""
def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
+def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true",
"Enable 64-bit instructions">;
@@ -285,6 +286,15 @@ def : ProcessorModel<"pwr7", P7Model,
FeaturePOPCNTD, FeatureLDBRX,
Feature64Bit /*, Feature64BitRegs */,
DeprecatedMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */,
+ [DirectivePwr8, FeatureAltivec,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+ FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+ FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+ FeatureFPRND, FeatureFPCVT, FeatureISEL,
+ FeaturePOPCNTD, FeatureLDBRX,
+ Feature64Bit /*, Feature64BitRegs */,
+ DeprecatedMFTB, DeprecatedDST]>;
def : Processor<"ppc", G3Itineraries, [Directive32]>;
def : ProcessorModel<"ppc64", G5Model,
[Directive64, FeatureAltivec,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index e89fb2d..fd044d9 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -365,8 +365,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Transform %Xd = ADDIStocHA %X2, <ga:@sym>
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
- // Change the opcode to ADDIS8. If the global address is external,
- // has common linkage, is a function address, or is a jump table
+ // Change the opcode to ADDIS8. If the global address is external, has
+ // common linkage, is a non-local function address, or is a jump table
// address, then generate a TOC entry and reference that. Otherwise
// reference the symbol directly.
TmpInst.setOpcode(PPC::ADDIS8);
@@ -375,7 +375,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"Invalid operand for ADDIStocHA!");
MCSymbol *MOSymbol = nullptr;
bool IsExternal = false;
- bool IsFunction = false;
+ bool IsNonLocalFunction = false;
bool IsCommon = false;
bool IsAvailExt = false;
@@ -384,15 +384,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MOSymbol = getSymbol(GV);
IsExternal = GV->isDeclaration();
IsCommon = GV->hasCommonLinkage();
- IsFunction = GV->getType()->getElementType()->isFunctionTy();
+ IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
+ (GV->isDeclaration() || GV->isWeakForLinker());
IsAvailExt = GV->hasAvailableExternallyLinkage();
} else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
else if (MO.isJTI())
MOSymbol = GetJTISymbol(MO.getIndex());
- if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() ||
- TM.getCodeModel() == CodeModel::Large)
+ if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt ||
+ MO.isJTI() || TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
const MCExpr *Exp =
@@ -425,7 +426,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isGlobal()) {
const GlobalValue *GValue = MO.getGlobal();
MOSymbol = getSymbol(GValue);
- if (GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+ if (GValue->getType()->getElementType()->isFunctionTy() ||
+ GValue->isDeclaration() || GValue->hasCommonLinkage() ||
GValue->hasAvailableExternallyLinkage() ||
TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
@@ -450,17 +452,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
MCSymbol *MOSymbol = nullptr;
bool IsExternal = false;
- bool IsFunction = false;
+ bool IsNonLocalFunction = false;
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
MOSymbol = getSymbol(GV);
IsExternal = GV->isDeclaration();
- IsFunction = GV->getType()->getElementType()->isFunctionTy();
+ IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() &&
+ (GV->isDeclaration() || GV->isWeakForLinker());
} else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
- if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large)
+ if (IsNonLocalFunction || IsExternal ||
+ TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
const MCExpr *Exp =
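
ADDIStocHA and ADDItocL now apply one shared rule: reference the symbol through a TOC entry whenever its definition may resolve outside the current object. A condensed sketch of that predicate (hypothetical helper; the real code inlines the checks per opcode):

// Symbol facts gathered from the MachineOperand above.
struct SymFlags {
  bool IsDeclaration, IsCommon, IsAvailExt;
  bool IsFunction, IsWeakForLinker;
  bool IsJumpTable, IsLargeCodeModel;
};

static bool needsTOCEntry(const SymFlags &S) {
  bool IsNonLocalFunction =
      S.IsFunction && (S.IsDeclaration || S.IsWeakForLinker);
  return S.IsDeclaration || S.IsCommon || S.IsAvailExt ||
         IsNonLocalFunction || S.IsJumpTable || S.IsLargeCodeModel;
}

int main() {
  SymFlags S{true, false, false, false, false, false, false};
  return needsTOCEntry(S) ? 0 : 1; // a declaration takes the TOC path
}
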
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index ed3cb4d..92a0ec1 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1030,6 +1030,10 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (DstVT != MVT::i32 && DstVT != MVT::i64)
return false;
+ // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
+ if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+ return false;
+
Value *Src = I->getOperand(0);
Type *SrcTy = Src->getType();
if (!isTypeLegal(SrcTy, SrcVT))
@@ -1197,6 +1201,11 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
bool IsVarArg) {
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+
+ // Reserve space for the linkage area on the stack.
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+ CCInfo.AllocateStack(LinkageSize, 8);
+
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
// Bail out if we can't handle any of the arguments.
@@ -1218,6 +1227,13 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Get a count of how many bytes are to be pushed onto the stack.
NumBytes = CCInfo.getNextStackOffset();
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 64);
+
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
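
Together with the AllocateStack call above, the std::max clamp guarantees the outgoing frame always covers the linkage area plus home space for all 8 GPR argument registers. A worked example, assuming the 48-byte 64-bit SVR4 (ELFv1) linkage area of six doublewords; that size is an assumption here, since getLinkageSize() is outside this patch:

#include <algorithm>
#include <cassert>

int main() {
  unsigned LinkageSize = 48; // assumed ELFv1 linkage area (6 doublewords)
  unsigned NumBytes = 40;    // bytes computed for the actual arguments
  NumBytes = std::max(NumBytes, LinkageSize + 64); // 64 = 8 GPRs * 8 bytes
  assert(NumBytes == 112);   // conservative floor for every call frame
  return 0;
}
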
@@ -1858,16 +1874,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// FIXME: Jump tables are not yet required because fast-isel doesn't
// handle switches; if that changes, we need them as well. For now,
// what follows assumes everything's a generic (or TLS) global address.
- const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
- if (!GVar) {
- // If GV is an alias, use the aliasee for determining thread-locality.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasee());
- }
// FIXME: We don't yet handle the complexity of TLS.
- bool IsTLS = GVar && GVar->isThreadLocal();
- if (IsTLS)
+ if (GV->isThreadLocal())
return 0;
// For small code model, generate a simple TOC load.
@@ -1877,8 +1886,8 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
.addGlobalAddress(GV)
.addReg(PPC::X2);
else {
- // If the address is an externally defined symbol, a symbol with
- // common or externally available linkage, a function address, or a
+ // If the address is an externally defined symbol, a symbol with common
+ // or externally available linkage, a non-local function address, or a
// jump table address (not yet needed), or if we are generating code
// for large code model, we generate:
// LDtocL(GV, ADDIStocHA(%X2, GV))
@@ -1889,12 +1898,13 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
- // !GVar implies a function address. An external variable is one
- // without an initializer.
// If/when switches are implemented, jump tables should be handled
// on the "if" path here.
- if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
- GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
+ if (CModel == CodeModel::Large ||
+ (GV->getType()->getElementType()->isFunctionTy() &&
+ (GV->isDeclaration() || GV->isWeakForLinker())) ||
+ GV->isDeclaration() || GV->hasCommonLinkage() ||
+ GV->hasAvailableExternallyLinkage())
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
else
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index e294156..65e9cf2 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -15,6 +15,7 @@
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -35,6 +36,167 @@ static const uint16_t VRRegNo[] = {
PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
};
+PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+ (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0),
+ Subtarget(STI) {}
+
+// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
+const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
+ unsigned &NumEntries) const {
+ if (Subtarget.isDarwinABI()) {
+ NumEntries = 1;
+ if (Subtarget.isPPC64()) {
+ static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+ return &darwin64Offsets;
+ } else {
+ static const SpillSlot darwinOffsets = {PPC::R31, -4};
+ return &darwinOffsets;
+ }
+ }
+
+ // Early exit if not using the SVR4 ABI.
+ if (!Subtarget.isSVR4ABI()) {
+ NumEntries = 0;
+ return nullptr;
+ }
+
+ // Note that the offsets here overlap, but this is fixed up in
+ // processFunctionBeforeFrameFinalized.
+
+ static const SpillSlot Offsets[] = {
+ // Floating-point register save area offsets.
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
+ // General register save area offsets.
+ {PPC::R31, -4},
+ {PPC::R30, -8},
+ {PPC::R29, -12},
+ {PPC::R28, -16},
+ {PPC::R27, -20},
+ {PPC::R26, -24},
+ {PPC::R25, -28},
+ {PPC::R24, -32},
+ {PPC::R23, -36},
+ {PPC::R22, -40},
+ {PPC::R21, -44},
+ {PPC::R20, -48},
+ {PPC::R19, -52},
+ {PPC::R18, -56},
+ {PPC::R17, -60},
+ {PPC::R16, -64},
+ {PPC::R15, -68},
+ {PPC::R14, -72},
+
+ // CR save area offset. We map each of the nonvolatile CR fields
+ // to the slot for CR2, which is the first of the nonvolatile CR
+ // fields to be assigned, so that we only allocate one save slot.
+ // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+ {PPC::CR2, -4},
+
+ // VRSAVE save area offset.
+ {PPC::VRSAVE, -4},
+
+ // Vector register save area
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}};
+
+ static const SpillSlot Offsets64[] = {
+ // Floating-point register save area offsets.
+ {PPC::F31, -8},
+ {PPC::F30, -16},
+ {PPC::F29, -24},
+ {PPC::F28, -32},
+ {PPC::F27, -40},
+ {PPC::F26, -48},
+ {PPC::F25, -56},
+ {PPC::F24, -64},
+ {PPC::F23, -72},
+ {PPC::F22, -80},
+ {PPC::F21, -88},
+ {PPC::F20, -96},
+ {PPC::F19, -104},
+ {PPC::F18, -112},
+ {PPC::F17, -120},
+ {PPC::F16, -128},
+ {PPC::F15, -136},
+ {PPC::F14, -144},
+
+ // General register save area offsets.
+ {PPC::X31, -8},
+ {PPC::X30, -16},
+ {PPC::X29, -24},
+ {PPC::X28, -32},
+ {PPC::X27, -40},
+ {PPC::X26, -48},
+ {PPC::X25, -56},
+ {PPC::X24, -64},
+ {PPC::X23, -72},
+ {PPC::X22, -80},
+ {PPC::X21, -88},
+ {PPC::X20, -96},
+ {PPC::X19, -104},
+ {PPC::X18, -112},
+ {PPC::X17, -120},
+ {PPC::X16, -128},
+ {PPC::X15, -136},
+ {PPC::X14, -144},
+
+ // VRSAVE save area offset.
+ {PPC::VRSAVE, -4},
+
+ // Vector register save area
+ {PPC::V31, -16},
+ {PPC::V30, -32},
+ {PPC::V29, -48},
+ {PPC::V28, -64},
+ {PPC::V27, -80},
+ {PPC::V26, -96},
+ {PPC::V25, -112},
+ {PPC::V24, -128},
+ {PPC::V23, -144},
+ {PPC::V22, -160},
+ {PPC::V21, -176},
+ {PPC::V20, -192}};
+
+ if (Subtarget.isPPC64()) {
+ NumEntries = array_lengthof(Offsets64);
+
+ return Offsets64;
+ } else {
+ NumEntries = array_lengthof(Offsets);
+
+ return Offsets;
+ }
+}
+
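
getCalleeSavedSpillSlots hands back a static table plus a count through the out-parameter, the usual (pointer, length) idiom for constant tables. A minimal sketch of how such a table is produced and consumed (hypothetical types, mirroring the convention above):

#include <cstdio>

struct SpillSlot { unsigned Reg; int Offset; };

static const SpillSlot *getSlots(unsigned &NumEntries) {
  static const SpillSlot Slots[] = {{31, -8}, {30, -16}};
  NumEntries = sizeof(Slots) / sizeof(Slots[0]); // cf. array_lengthof
  return Slots;
}

int main() {
  unsigned N;
  const SpillSlot *S = getSlots(N);
  for (unsigned i = 0; i != N; ++i)
    std::printf("reg %u at offset %d\n", S[i].Reg, S[i].Offset);
}
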
/// RemoveVRSaveCode - We have found that this function does not need any code
/// to manipulate the VRSAVE register, even though it uses vector registers.
/// This can happen when the only registers used are known to be live in or out
@@ -236,9 +398,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
// Get the maximum call frame size of all the calls.
unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
- // Maximum call frame needs to be at least big enough for linkage and 8 args.
- unsigned minCallFrameSize = getMinCallFrameSize(Subtarget.isPPC64(),
- Subtarget.isDarwinABI());
+ // Maximum call frame needs to be at least big enough for linkage area.
+ unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(),
+ Subtarget.isDarwinABI());
maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
// If we have dynamic alloca then maxCallFrameSize needs to be aligned so
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 94e9b67..7a226f7 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -14,23 +14,18 @@
#define POWERPC_FRAMEINFO_H
#include "PPC.h"
-#include "PPCSubtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
- class PPCSubtarget;
+class PPCSubtarget;
class PPCFrameLowering: public TargetFrameLowering {
const PPCSubtarget &Subtarget;
public:
- PPCFrameLowering(const PPCSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
- (sti.hasQPX() || sti.isBGQ()) ? 32 : 16, 0),
- Subtarget(sti) {
- }
+ PPCFrameLowering(const PPCSubtarget &STI);
unsigned determineFrameLayout(MachineFunction &MF,
bool UpdateMF = true,
@@ -79,6 +74,12 @@ public:
return isPPC64 ? 16 : 4;
}
+ /// getTOCSaveOffset - Return the previous frame offset to save the
+ /// TOC register -- 64-bit SVR4 ABI only.
+  static unsigned getTOCSaveOffset() {
+ return 40;
+ }
+
/// getFramePointerSaveOffset - Return the previous frame offset to save the
/// frame pointer.
static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
@@ -114,190 +115,9 @@ public:
return 8;
}
- /// getMinCallArgumentsSize - Return the size of the minium PowerPC ABI
- /// argument area.
- static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) {
- // For the Darwin ABI / 64-bit SVR4 ABI:
- // The prolog code of the callee may store up to 8 GPR argument registers to
- // the stack, allowing va_start to index over them in memory if its varargs.
- // Because we cannot tell if this is needed on the caller side, we have to
- // conservatively assume that it is needed. As such, make sure we have at
- // least enough stack space for the caller to store the 8 GPRs.
- if (isDarwinABI || isPPC64)
- return 8 * (isPPC64 ? 8 : 4);
-
- // 32-bit SVR4 ABI:
- // There is no default stack allocated for the 8 first GPR arguments.
- return 0;
- }
-
- /// getMinCallFrameSize - Return the minimum size a call frame can be using
- /// the PowerPC ABI.
- static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) {
- // The call frame needs to be at least big enough for linkage and 8 args.
- return getLinkageSize(isPPC64, isDarwinABI) +
- getMinCallArgumentsSize(isPPC64, isDarwinABI);
- }
-
- // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
const SpillSlot *
- getCalleeSavedSpillSlots(unsigned &NumEntries) const override {
- if (Subtarget.isDarwinABI()) {
- NumEntries = 1;
- if (Subtarget.isPPC64()) {
- static const SpillSlot darwin64Offsets = {PPC::X31, -8};
- return &darwin64Offsets;
- } else {
- static const SpillSlot darwinOffsets = {PPC::R31, -4};
- return &darwinOffsets;
- }
- }
-
- // Early exit if not using the SVR4 ABI.
- if (!Subtarget.isSVR4ABI()) {
- NumEntries = 0;
- return nullptr;
- }
-
- // Note that the offsets here overlap, but this is fixed up in
- // processFunctionBeforeFrameFinalized.
-
- static const SpillSlot Offsets[] = {
- // Floating-point register save area offsets.
- {PPC::F31, -8},
- {PPC::F30, -16},
- {PPC::F29, -24},
- {PPC::F28, -32},
- {PPC::F27, -40},
- {PPC::F26, -48},
- {PPC::F25, -56},
- {PPC::F24, -64},
- {PPC::F23, -72},
- {PPC::F22, -80},
- {PPC::F21, -88},
- {PPC::F20, -96},
- {PPC::F19, -104},
- {PPC::F18, -112},
- {PPC::F17, -120},
- {PPC::F16, -128},
- {PPC::F15, -136},
- {PPC::F14, -144},
-
- // General register save area offsets.
- {PPC::R31, -4},
- {PPC::R30, -8},
- {PPC::R29, -12},
- {PPC::R28, -16},
- {PPC::R27, -20},
- {PPC::R26, -24},
- {PPC::R25, -28},
- {PPC::R24, -32},
- {PPC::R23, -36},
- {PPC::R22, -40},
- {PPC::R21, -44},
- {PPC::R20, -48},
- {PPC::R19, -52},
- {PPC::R18, -56},
- {PPC::R17, -60},
- {PPC::R16, -64},
- {PPC::R15, -68},
- {PPC::R14, -72},
-
- // CR save area offset. We map each of the nonvolatile CR fields
- // to the slot for CR2, which is the first of the nonvolatile CR
- // fields to be assigned, so that we only allocate one save slot.
- // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
- {PPC::CR2, -4},
-
- // VRSAVE save area offset.
- {PPC::VRSAVE, -4},
-
- // Vector register save area
- {PPC::V31, -16},
- {PPC::V30, -32},
- {PPC::V29, -48},
- {PPC::V28, -64},
- {PPC::V27, -80},
- {PPC::V26, -96},
- {PPC::V25, -112},
- {PPC::V24, -128},
- {PPC::V23, -144},
- {PPC::V22, -160},
- {PPC::V21, -176},
- {PPC::V20, -192}
- };
-
- static const SpillSlot Offsets64[] = {
- // Floating-point register save area offsets.
- {PPC::F31, -8},
- {PPC::F30, -16},
- {PPC::F29, -24},
- {PPC::F28, -32},
- {PPC::F27, -40},
- {PPC::F26, -48},
- {PPC::F25, -56},
- {PPC::F24, -64},
- {PPC::F23, -72},
- {PPC::F22, -80},
- {PPC::F21, -88},
- {PPC::F20, -96},
- {PPC::F19, -104},
- {PPC::F18, -112},
- {PPC::F17, -120},
- {PPC::F16, -128},
- {PPC::F15, -136},
- {PPC::F14, -144},
-
- // General register save area offsets.
- {PPC::X31, -8},
- {PPC::X30, -16},
- {PPC::X29, -24},
- {PPC::X28, -32},
- {PPC::X27, -40},
- {PPC::X26, -48},
- {PPC::X25, -56},
- {PPC::X24, -64},
- {PPC::X23, -72},
- {PPC::X22, -80},
- {PPC::X21, -88},
- {PPC::X20, -96},
- {PPC::X19, -104},
- {PPC::X18, -112},
- {PPC::X17, -120},
- {PPC::X16, -128},
- {PPC::X15, -136},
- {PPC::X14, -144},
-
- // VRSAVE save area offset.
- {PPC::VRSAVE, -4},
-
- // Vector register save area
- {PPC::V31, -16},
- {PPC::V30, -32},
- {PPC::V29, -48},
- {PPC::V28, -64},
- {PPC::V27, -80},
- {PPC::V26, -96},
- {PPC::V25, -112},
- {PPC::V24, -128},
- {PPC::V23, -144},
- {PPC::V22, -160},
- {PPC::V21, -176},
- {PPC::V20, -192}
- };
-
- if (Subtarget.isPPC64()) {
- NumEntries = array_lengthof(Offsets64);
-
- return Offsets64;
- } else {
- NumEntries = array_lengthof(Offsets);
-
- return Offsets;
- }
- }
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
};
-
} // End llvm namespace
#endif
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 7ca706b..d9b242c 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -162,7 +162,8 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
unsigned Directive =
DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
// If we're using a special group-terminating nop, then we need only one.
- if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7)
+ if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8)
return 1;
return 5 - CurSlots;
@@ -223,7 +224,7 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
// If the group has now filled all of its slots, or if we're using a special
// group-terminating nop, the group is complete.
if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
- CurSlots == 6) {
+ Directive == PPC::DIR_PWR8 || CurSlots == 6) {
CurGroup.clear();
CurSlots = CurBranches = 0;
} else {
@@ -258,8 +259,8 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
//
-PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetMachine &TM)
- : TM(TM) {
+PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
+ : DAG(DAG) {
EndDispatchGroup();
}
@@ -278,7 +279,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
bool &isFirst, bool &isSingle,
bool &isCracked,
bool &isLoad, bool &isStore) {
- const MCInstrDesc &MCID = TM.getInstrInfo()->get(Opcode);
+ const MCInstrDesc &MCID = DAG.TII->get(Opcode);
isLoad = MCID.mayLoad();
isStore = MCID.mayStore();
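
Constructing the 970 recognizer from the ScheduleDAG rather than the TargetMachine means instruction descriptions come from DAG.TII, the same table the scheduler itself uses. A small sketch of the dependency shift (hypothetical types, not the LLVM classes):

// Before: the recognizer held a TargetMachine and asked it for instr info.
// After: it borrows the instruction-info table from the DAG it serves.
struct InstrInfoTable { bool mayLoad(unsigned Opc) const { return Opc == 1; } };
struct ScheduleDAGSketch { const InstrInfoTable *TII; };

struct Recognizer970Sketch {
  const ScheduleDAGSketch &DAG;
  explicit Recognizer970Sketch(const ScheduleDAGSketch &D) : DAG(D) {}
  bool isLoad(unsigned Opc) const { return DAG.TII->mayLoad(Opc); }
};

int main() {
  InstrInfoTable TII;
  ScheduleDAGSketch DAG{&TII};
  return Recognizer970Sketch(DAG).isLoad(1) ? 0 : 1;
}
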
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index cf4332c..23f76c1 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -54,7 +54,7 @@ public:
/// setting the CTR register then branching through it within a dispatch group),
/// or storing then loading from the same address within a dispatch group.
class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
- const TargetMachine &TM;
+ const ScheduleDAG &DAG;
unsigned NumIssued; // Number of insts issued, including advanced cycles.
@@ -75,7 +75,7 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
unsigned NumStores;
public:
- PPCHazardRecognizer970(const TargetMachine &TM);
+ PPCHazardRecognizer970(const ScheduleDAG &DAG);
virtual HazardType getHazardType(SUnit *SU, int Stalls) override;
virtual void EmitInstruction(SUnit *SU) override;
virtual void AdvanceCycle() override;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 251e8b6..4881b3f 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1454,10 +1454,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
break;
- // The first source operand is a TargetGlobalAddress or a
- // TargetJumpTable. If it is an externally defined symbol, a symbol
- // with common linkage, a function address, or a jump table address,
- // or if we are generating code for large code model, we generate:
+ // The first source operand is a TargetGlobalAddress or a TargetJumpTable.
+ // If it is an externally defined symbol, a symbol with common linkage,
+ // a non-local function address, or a jump table address, or if we are
+ // generating code for large code model, we generate:
// LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
// Otherwise we generate:
// ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
@@ -1472,8 +1472,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
const GlobalValue *GValue = G->getGlobal();
- if (GValue->isDeclaration() || GValue->hasCommonLinkage() ||
- GValue->hasAvailableExternallyLinkage())
+ if ((GValue->getType()->getElementType()->isFunctionTy() &&
+ (GValue->isDeclaration() || GValue->isWeakForLinker())) ||
+ GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+ GValue->hasAvailableExternallyLinkage())
return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
SDValue(Tmp, 0));
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index cf4c9e6..bc057bf 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19,6 +19,7 @@
#include "PPCTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -50,20 +51,18 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
-static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
- if (TM.getSubtargetImpl()->isDarwin())
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file, then it's going to be a Linux ELF
+ // object file.
+ if (TT.isOSDarwin())
return new TargetLoweringObjectFileMachO();
- if (TM.getSubtargetImpl()->isSVR4ABI())
- return new PPC64LinuxTargetObjectFile();
-
- return new TargetLoweringObjectFileELF();
+ return new PPC64LinuxTargetObjectFile();
}
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
- : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
- const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
-
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+ Subtarget(*TM.getSubtargetImpl()) {
setPow2DivIsCheap();
// Use _setjmp/_longjmp instead of setjmp/longjmp.
@@ -72,7 +71,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
- bool isPPC64 = Subtarget->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
setMinStackArgumentAlignment(isPPC64 ? 8:4);
// Set up the register classes.
@@ -98,10 +97,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
- if (Subtarget->useCRBits()) {
+ if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- if (isPPC64 || Subtarget->hasFPCVT()) {
+ if (isPPC64 || Subtarget.hasFPCVT()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
@@ -176,17 +175,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
- if (!Subtarget->hasFSQRT() &&
+ if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath &&
- Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
+ Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
- if (!Subtarget->hasFSQRT() &&
+ if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath &&
- Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
+ Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
- if (Subtarget->hasFCPSGN()) {
+ if (Subtarget.hasFCPSGN()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
} else {
@@ -194,7 +193,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
}
- if (Subtarget->hasFPRND()) {
+ if (Subtarget.hasFPRND()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -216,7 +215,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
- if (Subtarget->hasPOPCNTD()) {
+ if (Subtarget.hasPOPCNTD()) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {
@@ -228,7 +227,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
- if (!Subtarget->useCRBits()) {
+ if (!Subtarget.useCRBits()) {
// PowerPC does not have Select
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
@@ -241,11 +240,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
// PowerPC wants to optimize integer setcc a bit
- if (!Subtarget->useCRBits())
+ if (!Subtarget.useCRBits())
setOperationAction(ISD::SETCC, MVT::i32, Custom);
// PowerPC does not have BRCOND which requires SetCC
- if (!Subtarget->useCRBits())
+ if (!Subtarget.useCRBits())
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
@@ -297,7 +296,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
- if (Subtarget->isSVR4ABI()) {
+ if (Subtarget.isSVR4ABI()) {
if (isPPC64) {
// VAARG always uses double-word chunks, so promote anything smaller.
setOperationAction(ISD::VAARG, MVT::i1, Promote);
@@ -317,7 +316,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
} else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
- if (Subtarget->isSVR4ABI() && !isPPC64)
+ if (Subtarget.isSVR4ABI() && !isPPC64)
// VACOPY is custom lowered with the 32-bit SVR4 ABI.
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
else
@@ -350,7 +349,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
- if (Subtarget->has64BitSupport()) {
+ if (Subtarget.has64BitSupport()) {
// They also have instructions for converting between i64 and fp.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
@@ -360,7 +359,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// We cannot do this with Promote because i64 is not a legal type.
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
- if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64())
+ if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
@@ -368,8 +367,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
}
// With the instructions enabled under FPCVT, we can do everything.
- if (PPCSubTarget.hasFPCVT()) {
- if (Subtarget->has64BitSupport()) {
+ if (Subtarget.hasFPCVT()) {
+ if (Subtarget.has64BitSupport()) {
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
@@ -382,7 +381,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
}
- if (Subtarget->use64BitRegs()) {
+ if (Subtarget.use64BitRegs()) {
// 64-bit PowerPC implementations can support i64 types directly
addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
@@ -398,7 +397,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
- if (Subtarget->hasAltivec()) {
+ if (Subtarget.hasAltivec()) {
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -488,7 +487,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
setOperationAction(ISD::SELECT, MVT::v4i32,
- Subtarget->useCRBits() ? Legal : Expand);
+ Subtarget.useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
@@ -507,7 +506,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
- if (TM.Options.UnsafeFPMath || Subtarget->hasVSX()) {
+ if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
}
@@ -535,7 +534,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
- if (Subtarget->hasVSX()) {
+ if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
@@ -613,7 +612,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
}
}
- if (Subtarget->has64BitSupport()) {
+ if (Subtarget.has64BitSupport()) {
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
}
@@ -642,7 +641,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::BR_CC);
- if (Subtarget->useCRBits())
+ if (Subtarget.useCRBits())
setTargetDAGCombine(ISD::BRCOND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -651,7 +650,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
- if (Subtarget->useCRBits()) {
+ if (Subtarget.useCRBits()) {
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -664,7 +663,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
}
// Darwin long double math library functions have $LDBL128 appended.
- if (Subtarget->isDarwin()) {
+ if (Subtarget.isDarwin()) {
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
@@ -679,21 +678,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
- if (Subtarget->useCRBits())
+ if (Subtarget.useCRBits())
setHasMultipleConditionRegisters();
setMinFunctionAlignment(2);
- if (PPCSubTarget.isDarwin())
+ if (Subtarget.isDarwin())
setPrefFunctionAlignment(4);
- if (isPPC64 && Subtarget->isJITCodeModel())
+ if (isPPC64 && Subtarget.isJITCodeModel())
// Temporary workaround for the inability of PPC64 JIT to handle jump
// tables.
setSupportJumpTables(false);
setInsertFencesForAtomic(true);
- if (Subtarget->enableMachineScheduler())
+ if (Subtarget.enableMachineScheduler())
setSchedulingPreference(Sched::Source);
else
setSchedulingPreference(Sched::Hybrid);
@@ -702,8 +701,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
- if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
- Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+ if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
+ Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemsetOptSize = 16;
MaxStoresPerMemcpy = 32;
@@ -747,14 +746,14 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
// Darwin passes everything on 4 byte boundary.
- if (PPCSubTarget.isDarwin())
+ if (Subtarget.isDarwin())
return 4;
// 16byte and wider vectors are passed on 16byte boundary.
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
- unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4;
- if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX())
- getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16);
+ unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+ if (Subtarget.hasAltivec() || Subtarget.hasQPX())
+ getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
return Align;
}
@@ -774,7 +773,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
- case PPCISD::TOC_RESTORE: return "PPCISD::TOC_RESTORE";
case PPCISD::LOAD: return "PPCISD::LOAD";
case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
@@ -826,7 +824,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
- return PPCSubTarget.useCRBits() ? MVT::i1 : MVT::i32;
+ return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
@@ -855,15 +853,17 @@ static bool isConstantOrUndef(int Op, int Val) {
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
-bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+ SelectionDAG &DAG) {
+ unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
if (!isUnary) {
for (unsigned i = 0; i != 16; ++i)
- if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2+j))
return false;
} else {
for (unsigned i = 0; i != 8; ++i)
- if (!isConstantOrUndef(N->getMaskElt(i), i*2+1) ||
- !isConstantOrUndef(N->getMaskElt(i+8), i*2+1))
+ if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
return false;
}
return true;
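
As a quick cross-check of the j selection above (my sketch, not part of the patch): vpkuhum keeps one byte of each halfword, and which byte survives flips with endianness in shuffle-mask numbering.

// Hypothetical reference for the mask element expected at position i;
// mirrors the loops above (assumes the usual 16-byte Altivec vectors).
unsigned expectedVPKUHUMElt(unsigned i, bool IsLittleEndian) {
  unsigned j = IsLittleEndian ? 0 : 1; // LE keeps even bytes, BE the odd ones
  return i * 2 + j;                    // e.g. i == 3: 6 on LE, 7 on BE
}
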
@@ -871,18 +871,27 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
-bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+ SelectionDAG &DAG) {
+ unsigned j, k;
+ if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ j = 0;
+ k = 1;
+ } else {
+ j = 2;
+ k = 3;
+ }
if (!isUnary) {
for (unsigned i = 0; i != 16; i += 2)
- if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
- !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+k))
return false;
} else {
for (unsigned i = 0; i != 8; i += 2)
- if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
- !isConstantOrUndef(N->getMaskElt(i+1), i*2+3) ||
- !isConstantOrUndef(N->getMaskElt(i+8), i*2+2) ||
- !isConstantOrUndef(N->getMaskElt(i+9), i*2+3))
+ if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+1), i*2+k) ||
+ !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
+ !isConstantOrUndef(N->getMaskElt(i+9), i*2+k))
return false;
}
return true;
@@ -909,27 +918,39 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
}
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
-/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary) {
- if (!isUnary)
- return isVMerge(N, UnitSize, 8, 24);
- return isVMerge(N, UnitSize, 8, 8);
+ bool isUnary, SelectionDAG &DAG) {
+ if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 0, 16);
+ return isVMerge(N, UnitSize, 0, 0);
+ } else {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 8, 24);
+ return isVMerge(N, UnitSize, 8, 8);
+ }
}
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
-/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary) {
- if (!isUnary)
- return isVMerge(N, UnitSize, 0, 16);
- return isVMerge(N, UnitSize, 0, 0);
+ bool isUnary, SelectionDAG &DAG) {
+ if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 8, 24);
+ return isVMerge(N, UnitSize, 8, 8);
+ } else {
+ if (!isUnary)
+ return isVMerge(N, UnitSize, 0, 16);
+ return isVMerge(N, UnitSize, 0, 0);
+ }
}
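
Reading aid for the merge offsets above (a summary derived from the calls, not new code in the patch): the last two arguments to isVMerge are the expected starting elements for the left and right inputs, and on little endian the same hardware instruction realizes the opposite merge because element numbering is reversed.

// Offsets passed to isVMerge, per case:
//                 binary (LHS, RHS)   unary (LHS, RHS)
//   VMRGL on BE:      (8, 24)             (8, 8)
//   VMRGL on LE:      (0, 16)             (0, 0)
//   VMRGH on BE:      (0, 16)             (0, 0)
//   VMRGH on LE:      (8, 24)             (8, 8)
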
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
-int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
+int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
if (N->getValueType(0) != MVT::v16i8)
return -1;
@@ -946,18 +967,38 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
// numbered from this value.
unsigned ShiftAmt = SVOp->getMaskElt(i);
if (ShiftAmt < i) return -1;
- ShiftAmt -= i;
- if (!isUnary) {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
- return -1;
- } else {
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != 16; ++i)
- if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
- return -1;
+ if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+
+ ShiftAmt += i;
+
+ if (!isUnary) {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i))
+ return -1;
+ } else {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15))
+ return -1;
+ }
+
+ } else { // Big Endian
+
+ ShiftAmt -= i;
+
+ if (!isUnary) {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+ return -1;
+ } else {
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != 16; ++i)
+ if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+ return -1;
+ }
}
return ShiftAmt;
}
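
One worked instance of the two branches above (illustrative only): a big-endian vsldoi by 3 corresponds to the ascending mask {3, 4, ..., 18}, while on little endian the same shift is expressed by a descending run, which is why the code flips between ShiftAmt + i and ShiftAmt - i.

// Hypothetical reference model of the accepted element patterns:
int expectedVSLDOIElt(int ShiftAmt, int i, bool IsLittleEndian) {
  return IsLittleEndian ? ShiftAmt - i  // descending run on LE
                        : ShiftAmt + i; // ascending run on BE
}
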
@@ -1010,10 +1051,14 @@ bool PPC::isAllNegativeZeroVector(SDNode *N) {
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
assert(isSplatShuffleMask(SVOp, EltSize));
- return SVOp->getMaskElt(0) / EltSize;
+ if (DAG.getTarget().getDataLayout()->isLittleEndian())
+ return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
+ else
+ return SVOp->getMaskElt(0) / EltSize;
}
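
The arithmetic above in isolation (a sketch under the same 16-byte-vector assumption): the splat lane is mirrored across the register on little endian.

// Hypothetical mirror of getVSPLTImmediate's computation:
unsigned vspltImmediate(unsigned FirstMaskElt, unsigned EltSize,
                        bool IsLittleEndian) {
  unsigned BEImm = FirstMaskElt / EltSize;           // e.g. 8 / 4 == 2
  unsigned Lanes = 16 / EltSize;                     // e.g. 4 lanes
  return IsLittleEndian ? Lanes - 1 - BEImm : BEImm; // LE: 4 - 1 - 2 == 1
}
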
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
@@ -1299,7 +1344,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
short Imm;
if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
- Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
return true;
}
@@ -1350,7 +1395,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
}
// Otherwise, do it the hard way, using R0 as the base register.
- Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
N.getValueType());
Index = N;
return true;
@@ -1497,7 +1542,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
DAG.getRegister(PPC::X2, MVT::i64));
@@ -1518,7 +1563,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// 64-bit SVR4 ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
DAG.getRegister(PPC::X2, MVT::i64));
@@ -1555,7 +1600,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy();
- bool is64bit = PPCSubTarget.isPPC64();
+ bool is64bit = Subtarget.isPPC64();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
@@ -1646,7 +1691,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
// 64-bit SVR4 ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
- if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
DAG.getRegister(PPC::X2, MVT::i64));
@@ -1891,7 +1936,8 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0);
+ DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
@@ -2086,6 +2132,43 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
return ArgSize;
}
+/// CalculateStackSlotAlignment - Calculates the alignment of this argument
+/// on the stack.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize) {
+ unsigned Align = PtrByteSize;
+
+ // Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+ Align = 16;
+
+ // ByVal parameters are aligned as requested.
+ if (Flags.isByVal()) {
+ unsigned BVAlign = Flags.getByValAlign();
+ if (BVAlign > PtrByteSize) {
+ if (BVAlign % PtrByteSize != 0)
+ llvm_unreachable(
+ "ByVal alignment is not a multiple of the pointer size");
+
+ Align = BVAlign;
+ }
+ }
+
+ return Align;
+}
+
+/// EnsureStackAlignment - Round the stack frame size up from NumBytes to
+/// ensure the minimum alignment required for the target.
+static unsigned EnsureStackAlignment(const TargetMachine &Target,
+ unsigned NumBytes) {
+ unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+ unsigned AlignMask = TargetAlign - 1;
+ NumBytes = (NumBytes + AlignMask) & ~AlignMask;
+ return NumBytes;
+}
+
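
For reference, the two round-up idioms this patch uses interchangeably, as a standalone sketch; both forms agree whenever Align is a power of two, which stack and slot alignments here always are:

unsigned roundUpDiv(unsigned N, unsigned Align) {
  return ((N + Align - 1) / Align) * Align; // the CalculateStackSlot* form
}
unsigned roundUpMask(unsigned N, unsigned Align) {
  return (N + (Align - 1)) & ~(Align - 1);  // the EnsureStackAlignment form,
}                                           // e.g. 52 -> 64 for Align == 16
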
SDValue
PPCTargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -2094,8 +2177,8 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
- if (PPCSubTarget.isSVR4ABI()) {
- if (PPCSubTarget.isPPC64())
+ if (Subtarget.isSVR4ABI()) {
+ if (Subtarget.isPPC64())
return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
dl, DAG, InVals);
else
@@ -2161,7 +2244,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
getTargetMachine(), ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
- CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false);
+ CCInfo.AllocateStack(LinkageSize, PtrByteSize);
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -2184,7 +2268,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
RC = &PPC::F4RCRegClass;
break;
case MVT::f64:
- if (PPCSubTarget.hasVSX())
+ if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else
RC = &PPC::F8RCRegClass;
@@ -2240,23 +2324,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+ MinReservedArea = std::max(MinReservedArea, LinkageSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
- PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
-
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(false, false));
-
- unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
- getStackAlignment();
- unsigned AlignMask = TargetAlign-1;
- MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
-
- FI->setMinReservedArea(MinReservedArea);
+ MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
SmallVector<SDValue, 8> MemOps;
@@ -2352,32 +2427,6 @@ PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
-// Set the size that is at least reserved in caller of this function. Tail
-// call optimized functions' reserved stack space needs to be aligned so that
-// taking the difference between two stack areas will result in an aligned
-// stack.
-void
-PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
- unsigned nAltivecParamsAtEnd,
- unsigned MinReservedArea,
- bool isPPC64) const {
- PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
- // Add the Altivec parameters at the end, if needed.
- if (nAltivecParamsAtEnd) {
- MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += 16*nAltivecParamsAtEnd;
- }
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
- unsigned TargetAlign
- = DAG.getMachineFunction().getTarget().getFrameLowering()->
- getStackAlignment();
- unsigned AlignMask = TargetAlign-1;
- MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
- FI->setMinReservedArea(MinReservedArea);
-}
-
SDValue
PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Chain,
@@ -2388,6 +2437,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
//
+ bool isLittleEndian = Subtarget.isLittleEndian();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -2398,9 +2448,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 8;
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
- // Area that is at least reserved in caller of this function.
- unsigned MinReservedArea = ArgOffset;
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+ unsigned ArgOffset = LinkageSize;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -2422,14 +2471,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
const unsigned Num_FPR_Regs = 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
SmallVector<SDValue, 8> MemOps;
- unsigned nAltivecParamsAtEnd = 0;
Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
@@ -2442,24 +2490,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[ArgNo].OrigArgIndex;
+ // Respect alignment of argument on the stack.
+ unsigned Align =
+ CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
unsigned CurArgOffset = ArgOffset;
- // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
- if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
- ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 ||
- ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64) {
- if (isVarArg) {
- MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += CalculateStackSlotSize(ObjectVT,
- Flags,
- PtrByteSize);
- } else
- nAltivecParamsAtEnd++;
- } else
- // Calculate min reserved area.
- MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
- Flags,
- PtrByteSize);
+ // Compute GPR index associated with argument offset.
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
@@ -2481,14 +2520,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
continue;
}
- unsigned BVAlign = Flags.getByValAlign();
- if (BVAlign > 8) {
- ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
- CurArgOffset = ArgOffset;
- }
-
// All aggregates smaller than 8 bytes must be passed right-justified.
- if (ObjSize < PtrByteSize)
+ if (ObjSize < PtrByteSize && !isLittleEndian)
CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
// The value of the object is its address.
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
@@ -2522,7 +2555,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
}
MemOps.push_back(Store);
- ++GPR_idx;
}
// Whether we copied from a register or not, advance the offset
// into the parameter save area by a full doubleword.
@@ -2567,8 +2599,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
-
- ++GPR_idx;
} else {
needsLoad = true;
ArgSize = PtrByteSize;
@@ -2578,18 +2608,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::f32:
case MVT::f64:
- // Every 8 bytes of argument space consumes one of the GPRs available for
- // argument passing.
- if (GPR_idx != Num_GPR_Regs) {
- ++GPR_idx;
- }
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
else
- VReg = MF.addLiveIn(FPR[FPR_idx], PPCSubTarget.hasVSX() ?
+ VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ?
&PPC::VSFRCRegClass :
&PPC::F8RCRegClass);
@@ -2608,39 +2633,25 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
- // Note that vector arguments in registers don't reserve stack space,
- // except in varargs functions.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
- if (isVarArg) {
- while ((ArgOffset % 16) != 0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != Num_GPR_Regs)
- GPR_idx++;
- }
- ArgOffset += 16;
- GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
- }
++VR_idx;
} else {
- // Vectors are aligned.
- ArgOffset = ((ArgOffset+15)/16)*16;
- CurArgOffset = ArgOffset;
- ArgOffset += 16;
needsLoad = true;
}
+ ArgOffset += 16;
break;
}
// We need to load the argument to a virtual register if we determined
// above that we ran out of physical registers of the appropriate type.
if (needsLoad) {
- int FI = MFI->CreateFixedObject(ObjSize,
- CurArgOffset + (ArgSize - ObjSize),
- isImmutable);
+ if (ObjSize < ArgSize && !isLittleEndian)
+ CurArgOffset += ArgSize - ObjSize;
+ int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
false, false, false, 0);
@@ -2649,11 +2660,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
InVals.push_back(ArgVal);
}
+ // Area that is at least reserved in the caller of this function.
+ unsigned MinReservedArea;
+ MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
- setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true);
+ MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
@@ -2667,7 +2683,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing the
// result of va_next.
- for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+ for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2706,7 +2723,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = isPPC64 ? 8 : 4;
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned ArgOffset = LinkageSize;
// Area that is at least reserved in caller of this function.
unsigned MinReservedArea = ArgOffset;
@@ -2997,11 +3015,21 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
InVals.push_back(ArgVal);
}
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += 16*nAltivecParamsAtEnd;
+ }
+
+ // Area that is at least reserved in the caller of this function.
+ MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
- setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64);
+ MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
+ FuncInfo->setMinReservedArea(MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
@@ -3040,75 +3068,6 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
return Chain;
}
-/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
-/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
-static unsigned
-CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
- bool isPPC64,
- bool isVarArg,
- unsigned CC,
- const SmallVectorImpl<ISD::OutputArg>
- &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- unsigned &nAltivecParamsAtEnd) {
- // Count how many bytes are to be pushed on the stack, including the linkage
- // area, and parameter passing area. We start with 24/48 bytes, which is
- // prereserved space for [SP][CR][LR][3 x unused].
- unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true);
- unsigned NumOps = Outs.size();
- unsigned PtrByteSize = isPPC64 ? 8 : 4;
-
- // Add up all the space actually used.
- // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
- // they all go in registers, but we must reserve stack space for them for
- // possible use by the caller. In varargs or 64-bit calls, parameters are
- // assigned stack space in order, with padding so Altivec parameters are
- // 16-byte aligned.
- nAltivecParamsAtEnd = 0;
- for (unsigned i = 0; i != NumOps; ++i) {
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
- EVT ArgVT = Outs[i].VT;
- // Varargs Altivec parameters are padded to a 16 byte boundary.
- if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
- ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8 ||
- ArgVT==MVT::v2f64 || ArgVT==MVT::v2i64) {
- if (!isVarArg && !isPPC64) {
- // Non-varargs Altivec parameters go after all the non-Altivec
- // parameters; handle those later so we know how much padding we need.
- nAltivecParamsAtEnd++;
- continue;
- }
- // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
- NumBytes = ((NumBytes+15)/16)*16;
- }
- NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
- }
-
- // Allow for Altivec parameters at the end, if needed.
- if (nAltivecParamsAtEnd) {
- NumBytes = ((NumBytes+15)/16)*16;
- NumBytes += 16*nAltivecParamsAtEnd;
- }
-
- // The prolog code of the callee may store up to 8 GPR argument registers to
- // the stack, allowing va_start to index over them in memory if its varargs.
- // Because we cannot tell if this is needed on the caller side, we have to
- // conservatively assume that it is needed. As such, make sure we have at
- // least enough stack space for the caller to store the 8 GPRs.
- NumBytes = std::max(NumBytes,
- PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
-
- // Tail call needs the stack to be aligned.
- if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){
- unsigned TargetAlign = DAG.getMachineFunction().getTarget().
- getFrameLowering()->getStackAlignment();
- unsigned AlignMask = TargetAlign-1;
- NumBytes = (NumBytes + AlignMask) & ~AlignMask;
- }
-
- return NumBytes;
-}
-
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
@@ -3280,7 +3239,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
SDLoc dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
- EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+ EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
false, false, false, 0);
@@ -3373,10 +3332,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
- const PPCSubtarget &PPCSubTarget) {
+ const PPCSubtarget &Subtarget) {
- bool isPPC64 = PPCSubTarget.isPPC64();
- bool isSVR4ABI = PPCSubTarget.isSVR4ABI();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
NodeTys.push_back(MVT::Other); // Returns a chain
@@ -3385,11 +3344,12 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
unsigned CallOpc = PPCISD::CALL;
bool needIndirectCall = true;
- if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
- // If this is an absolute destination address, use the munged value.
- Callee = SDValue(Dest, 0);
- needIndirectCall = false;
- }
+ if (!isSVR4ABI || !isPPC64)
+ if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
+ // If this is an absolute destination address, use the munged value.
+ Callee = SDValue(Dest, 0);
+ needIndirectCall = false;
+ }
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
// XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
@@ -3398,8 +3358,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
unsigned OpFlags = 0;
if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (PPCSubTarget.getTargetTriple().isMacOSX() &&
- PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+ (Subtarget.getTargetTriple().isMacOSX() &&
+ Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
(G->getGlobal()->isDeclaration() ||
G->getGlobal()->isWeakForLinker())) {
// PC-relative references to external symbols should go through $stub,
@@ -3422,8 +3382,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
unsigned char OpFlags = 0;
if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (PPCSubTarget.getTargetTriple().isMacOSX() &&
- PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
+ (Subtarget.getTargetTriple().isMacOSX() &&
+ Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
@@ -3497,8 +3457,10 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
// additional register being allocated and an unnecessary move instruction
// being generated.
VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue TOCOff = DAG.getIntPtrConstant(8);
+ SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
- Callee, InFlag);
+ AddTOC, InFlag);
Chain = LoadTOCPtr.getValue(0);
InFlag = LoadTOCPtr.getValue(1);
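
The Callee + 8 above is the TOC slot of the callee's function descriptor. For orientation, a sketch of the 64-bit SVR4 (ELFv1) descriptor layout — an assumption drawn from the ABI, not code in this patch:

#include <cstdint>
// A function pointer under this ABI addresses a descriptor, not code.
struct FunctionDescriptor {
  uint64_t EntryPoint;  // +0:  address of the first instruction
  uint64_t TOCBase;     // +8:  what the LOAD_TOC node above reads
  uint64_t Environment; // +16: language-specific; 0 for C
};
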
@@ -3613,10 +3575,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
isTailCall, RegsToPass, Ops, NodeTys,
- PPCSubTarget);
+ Subtarget);
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
- if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+ if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// When performing tail call optimization the callee pops its arguments off
@@ -3657,7 +3619,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
// same TOC), the NOP will remain unchanged.
bool needsTOCRestore = false;
- if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) {
+ if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
if (CallOpc == PPCISD::BCTRL) {
// This is a call through a function pointer.
// Restore the caller TOC from the save area into R2.
@@ -3682,7 +3644,12 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
if (needsTOCRestore) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag);
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+ SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
+ SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+ Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
InFlag = Chain.getValue(1);
}
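
In effect the dedicated TOC_RESTORE node is replaced by an ordinary LOAD_TOC from the TOC save slot in the caller's frame; if the lowering is read right, the emitted instruction is the familiar reload after an indirect call:

//   ld 2, TOCSaveOffset(1)   ; e.g. ld 2, 40(1) on 64-bit SVR4, 40 being
//                            ; the constant replaced later in this patch
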
@@ -3718,8 +3685,8 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
- if (PPCSubTarget.isSVR4ABI()) {
- if (PPCSubTarget.isPPC64())
+ if (Subtarget.isSVR4ABI()) {
+ if (Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
isTailCall, Outs, OutVals, Ins,
dl, DAG, InVals);
@@ -3981,6 +3948,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
+ bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -3997,16 +3965,37 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
- unsigned nAltivecParamsAtEnd = 0;
-
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with at least 48 bytes, which
// is reserved space for [SP][CR][LR][3 x unused].
- // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
- // of this call.
- unsigned NumBytes =
- CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
- Outs, OutVals, nAltivecParamsAtEnd);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+
+ // Respect alignment of argument on the stack.
+ unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+ NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ }
+
+ unsigned NumBytesActuallyUsed = NumBytes;
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if the
+ // function is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
@@ -4038,8 +4027,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
- unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -4068,6 +4057,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Respect alignment of argument on the stack.
+ unsigned Align =
+ CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+ // Compute GPR index associated with argument offset.
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, NumGPRs);
+
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
@@ -4099,15 +4097,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
if (Size == 0)
continue;
- unsigned BVAlign = Flags.getByValAlign();
- if (BVAlign > 8) {
- if (BVAlign % PtrByteSize != 0)
- llvm_unreachable(
- "ByVal alignment is not a multiple of the pointer size");
-
- ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
- }
-
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -4116,7 +4105,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
MachinePointerInfo(), VT,
false, false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
ArgOffset += PtrByteSize;
continue;
@@ -4124,9 +4113,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
}
if (GPR_idx == NumGPRs && Size < 8) {
- SDValue Const = DAG.getConstant(PtrByteSize - Size,
- PtrOff.getValueType());
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ SDValue AddPtr = PtrOff;
+ if (!isLittleEndian) {
+ SDValue Const = DAG.getConstant(PtrByteSize - Size,
+ PtrOff.getValueType());
+ AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ }
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
@@ -4161,8 +4153,11 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// small aggregates, particularly for packed ones.
// FIXME: It would be preferable to use the slot in the
// parameter save area instead of a new local variable.
- SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ SDValue AddPtr = PtrOff;
+ if (!isLittleEndian) {
+ SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+ AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ }
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
CallSeqStart,
Flags, DAG, dl);
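
A worked placement for the right-justification logic above (illustration only): a 3-byte byval argument still consumes one full 8-byte slot, but its bytes land at opposite ends depending on endianness.

// Byte layout of the 8-byte parameter slot for a 3-byte aggregate:
//   BE: [pad pad pad pad pad b0 b1 b2]  -> memcpy to PtrOff + (8 - 3)
//   LE: [b0 b1 b2 pad pad pad pad pad]  -> memcpy to PtrOff + 0
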
@@ -4172,7 +4167,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
MachinePointerInfo(),
false, false, false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
// Done with this argument.
ArgOffset += PtrByteSize;
@@ -4205,7 +4200,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
@@ -4223,7 +4218,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// must be passed right-justified in the stack doubleword, and
// in the GPR, if one is available.
SDValue StoreOff;
- if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
+ if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
+ !isLittleEndian) {
SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
} else
@@ -4239,15 +4235,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
MachinePointerInfo(), false, false,
false, 0);
MemOpChains.push_back(Load.getValue(1));
- RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
}
- } else if (GPR_idx != NumGPRs)
- // If we have any FPRs remaining, we may also have GPRs remaining.
- ++GPR_idx;
+ }
} else {
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
- if (Arg.getValueType() == MVT::f32) {
+ if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
@@ -4264,21 +4258,13 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
+ // For a varargs call, named arguments go into VRs or on the stack as
+ // usual; unnamed arguments always go to the stack or the corresponding
+ // GPRs when within range. For now, we always put the value in both
+ // locations (or even all three).
if (isVarArg) {
- // These go aligned on the stack, or in the corresponding R registers
- // when within range. The Darwin PPC ABI doc claims they also go in
- // V registers; in fact gcc does this only for arguments that are
- // prototyped, not for those that match the ... We do it for all
- // arguments, seems to work.
- while (ArgOffset % 16 !=0) {
- ArgOffset += PtrByteSize;
- if (GPR_idx != NumGPRs)
- GPR_idx++;
- }
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
- DAG.getConstant(ArgOffset, PtrVT));
SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
MachinePointerInfo(), false, false, 0);
MemOpChains.push_back(Store);
@@ -4309,10 +4295,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
break;
}
- // Non-varargs Altivec params generally go in registers, but have
- // stack space allocated at the end.
+ // Non-varargs Altivec params go into VRs or on the stack.
if (VR_idx != NumVRs) {
- // Doesn't have GPR space allocated.
unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
Arg.getSimpleValueType() == MVT::v2i64) ?
VSRH[VR_idx] : VR[VR_idx];
@@ -4323,12 +4307,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
- ArgOffset += 16;
}
+ ArgOffset += 16;
break;
}
}
+ assert(NumBytesActuallyUsed == ArgOffset);
+ (void)NumBytesActuallyUsed;
+
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
@@ -4337,19 +4324,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// pointers in the 64-bit SVR4 ABI.
if (!isTailCall &&
!dyn_cast<GlobalAddressSDNode>(Callee) &&
- !dyn_cast<ExternalSymbolSDNode>(Callee) &&
- !isBLACompatibleAddress(Callee, DAG)) {
+ !dyn_cast<ExternalSymbolSDNode>(Callee)) {
// Load r2 into a virtual register and store it to the TOC save area.
SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
// TOC save area offset.
- SDValue PtrOff = DAG.getIntPtrConstant(40);
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
false, false, 0);
- // R12 must contain the address of an indirect callee. This does not
- // mean the MTCTR instruction must use R12; it's easier to model this
- // as an extra parameter, so do that.
- RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -4397,15 +4380,55 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
- unsigned nAltivecParamsAtEnd = 0;
-
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
- unsigned NumBytes =
- CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv,
- Outs, OutVals,
- nAltivecParamsAtEnd);
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+ // they all go in registers, but we must reserve stack space for them for
+ // possible use by the caller. In varargs or 64-bit calls, parameters are
+ // assigned stack space in order, with padding so Altivec parameters are
+ // 16-byte aligned.
+ unsigned nAltivecParamsAtEnd = 0;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ // Varargs Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+ if (!isVarArg && !isPPC64) {
+ // Non-varargs Altivec parameters go after all the non-Altivec
+ // parameters; handle those later so we know how much padding we need.
+ nAltivecParamsAtEnd++;
+ continue;
+ }
+ // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+ NumBytes = ((NumBytes+15)/16)*16;
+ }
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ NumBytes = ((NumBytes+15)/16)*16;
+ NumBytes += 16*nAltivecParamsAtEnd;
+ }
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if the
+ // function is varargs.
+ // Because we cannot tell if this is needed on the caller side, we have to
+ // conservatively assume that it is needed. As such, make sure we have at
+ // least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail call needs the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
@@ -4441,7 +4464,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
- unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+ unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
@@ -4818,8 +4841,8 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
SDValue
PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- bool isPPC64 = PPCSubTarget.isPPC64();
- bool isDarwinABI = PPCSubTarget.isDarwinABI();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Get current frame pointer save index. The users of this index will be
@@ -4842,8 +4865,8 @@ PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- bool isPPC64 = PPCSubTarget.isPPC64();
- bool isDarwinABI = PPCSubTarget.isDarwinABI();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
// Get current frame pointer save index. The users of this index will be
@@ -5063,12 +5086,12 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
- (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ :
+ (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ :
PPCISD::FCTIDZ),
dl, MVT::f64, Src);
break;
case MVT::i64:
- assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) &&
+ assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
PPCISD::FCTIDUZ,
@@ -5077,8 +5100,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
}
// Convert the FP value to an int value through memory.
- bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() &&
- (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT());
+ bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
+ (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
@@ -5120,17 +5143,17 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
DAG.getConstantFP(1.0, Op.getValueType()),
DAG.getConstantFP(0.0, Op.getValueType()));
- assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) &&
+ assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
- unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+ unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
(Op.getOpcode() == ISD::UINT_TO_FP ?
PPCISD::FCFIDUS : PPCISD::FCFIDS) :
(Op.getOpcode() == ISD::UINT_TO_FP ?
PPCISD::FCFIDU : PPCISD::FCFID);
- MVT FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+ MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
MVT::f32 : MVT::f64;
if (Op.getOperand(0).getValueType() == MVT::i64) {
@@ -5146,7 +5169,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// However, if -enable-unsafe-fp-math is in effect, accept double
// rounding to avoid the extra overhead.
if (Op.getValueType() == MVT::f32 &&
- !PPCSubTarget.hasFPCVT() &&
+ !Subtarget.hasFPCVT() &&
!DAG.getTarget().Options.UnsafeFPMath) {
// Twiddle input to make sure the low 11 bits are zero. (If this
@@ -5184,7 +5207,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
- if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT())
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0));
return FP;
@@ -5201,7 +5224,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
SDValue Ld;
- if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) {
+ if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -5220,7 +5243,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
} else {
- assert(PPCSubTarget.isPPC64() &&
+ assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
int FrameIdx = FrameInfo->CreateStackObject(8, 8, false);
@@ -5242,7 +5265,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
// FCFID it and return it.
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
- if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT())
+ if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
return FP;
}
@@ -5557,6 +5580,22 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
}
+ // The remaining cases assume either big endian element order or
+ // a splat-size that equates to the element size of the vector
+ // to be built. An example that doesn't work for little endian is
+ // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits
+ // and a vector element size of 16 bits. The code below will
+ // produce the vector in big endian element order, which for little
+ // endian is {-1, 0, -1, 0, -1, 0, -1, 0}.
+
+ // For now, just avoid these optimizations in that case.
+ // FIXME: Develop correct optimizations for LE with mismatched
+ // splat and element sizes.
+
+ if (Subtarget.isLittleEndian() &&
+ SplatSize != Op.getValueType().getVectorElementType().getSizeInBits())
+ return SDValue();
+
// Check to see if this is a wide variety of vsplti*, binop self cases.
static const signed char SplatCsts[] = {
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
@@ -5725,6 +5764,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
EVT VT = Op.getValueType();
+ bool isLittleEndian = Subtarget.isLittleEndian();
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
@@ -5733,15 +5773,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
- PPC::isVPKUWUMShuffleMask(SVOp, true) ||
- PPC::isVPKUHUMShuffleMask(SVOp, true) ||
- PPC::isVSLDOIShuffleMask(SVOp, true) != -1 ||
- PPC::isVMRGLShuffleMask(SVOp, 1, true) ||
- PPC::isVMRGLShuffleMask(SVOp, 2, true) ||
- PPC::isVMRGLShuffleMask(SVOp, 4, true) ||
- PPC::isVMRGHShuffleMask(SVOp, 1, true) ||
- PPC::isVMRGHShuffleMask(SVOp, 2, true) ||
- PPC::isVMRGHShuffleMask(SVOp, 4, true)) {
+ PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) {
return Op;
}
}
@@ -5749,15 +5789,15 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Altivec has a variety of "shuffle immediates" that take two vector inputs
// and produce a fixed permutation. If any of these match, do not lower to
// VPERM.
- if (PPC::isVPKUWUMShuffleMask(SVOp, false) ||
- PPC::isVPKUHUMShuffleMask(SVOp, false) ||
- PPC::isVSLDOIShuffleMask(SVOp, false) != -1 ||
- PPC::isVMRGLShuffleMask(SVOp, 1, false) ||
- PPC::isVMRGLShuffleMask(SVOp, 2, false) ||
- PPC::isVMRGLShuffleMask(SVOp, 4, false) ||
- PPC::isVMRGHShuffleMask(SVOp, 1, false) ||
- PPC::isVMRGHShuffleMask(SVOp, 2, false) ||
- PPC::isVMRGHShuffleMask(SVOp, 4, false))
+ if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) ||
+ PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) ||
+ PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 ||
+ PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) ||
+ PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) ||
+ PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG))
return Op;
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
@@ -5791,7 +5831,9 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
// perfect shuffle vector to determine if it is cost effective to do this as
// discrete instructions, or whether we should use a vperm.
- if (isFourElementShuffle) {
+ // For now, we skip this for little endian until such time as we have a
+ // little-endian perfect shuffle table.
+ if (isFourElementShuffle && !isLittleEndian) {
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
@@ -5820,6 +5862,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
+
+ // For little endian, the order of the input vectors is reversed, and
+ // the permutation mask is complemented with respect to 31. This is
+ // necessary to produce proper semantics with the big-endian-biased vperm
+ // instruction.
EVT EltVT = V1.getValueType().getVectorElementType();
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
@@ -5828,13 +5875,22 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
for (unsigned j = 0; j != BytesPerElement; ++j)
- ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
- MVT::i32));
+ if (isLittleEndian)
+ ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j),
+ MVT::i32));
+ else
+ ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+ MVT::i32));
}
SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
ResultMask);
- return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask);
+ if (isLittleEndian)
+ return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+ V2, V1, VPermMask);
+ else
+ return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+ V1, V2, VPermMask);
}
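
To see why the LE path needs both the 31 - x complement and the V2/V1 operand swap (a sketch, not part of the patch): vperm picks each result byte out of the 32-byte concatenation of its two inputs using big-endian byte numbering.

// Hypothetical helper mirroring the mask rewrite above:
unsigned vpermMaskByte(unsigned SrcElt, unsigned BytesPerElement, unsigned j,
                       bool IsLittleEndian) {
  unsigned BEIndex = SrcElt * BytesPerElement + j;
  // On LE the 32 input bytes are numbered from the other end (hence
  // 31 - BEIndex), and swapping the operands keeps each index inside
  // the half of the concatenation it was meant to address.
  return IsLittleEndian ? 31 - BEIndex : BEIndex;
}
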
/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
@@ -6027,6 +6083,7 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
LHS, RHS, Zero, DAG, dl);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+ bool isLittleEndian = Subtarget.isLittleEndian();
// Multiply the even 8-bit parts, producing 16-bit sums.
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
@@ -6038,13 +6095,24 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
LHS, RHS, DAG, dl, MVT::v8i16);
OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
- // Merge the results together.
+ // Merge the results together. Because vmuleub and vmuloub are
+ // instructions with a big-endian bias, we must reverse the
+ // element numbering and reverse the meaning of "odd" and "even"
+ // when generating little endian code.
int Ops[16];
for (unsigned i = 0; i != 8; ++i) {
- Ops[i*2 ] = 2*i+1;
- Ops[i*2+1] = 2*i+1+16;
+ if (isLittleEndian) {
+ Ops[i*2 ] = 2*i;
+ Ops[i*2+1] = 2*i+16;
+ } else {
+ Ops[i*2 ] = 2*i+1;
+ Ops[i*2+1] = 2*i+1+16;
+ }
}
- return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+ if (isLittleEndian)
+ return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
+ else
+ return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
} else {
llvm_unreachable("Unknown mul to lower!");
}
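
Tracing one byte pair through the interleave above (a reading aid derived from the two Ops tables, not new logic):

//   BE: Ops = {1,17, 3,19, ...} over (EvenParts, OddParts), so
//       result[2i]   = EvenParts[2i+1], result[2i+1] = OddParts[2i+1]
//   LE: Ops = {0,16, 2,18, ...} over (OddParts, EvenParts), so
//       result[2i]   = OddParts[2i],    result[2i+1] = EvenParts[2i]
// i.e. lane numbering flips and "even"/"odd" swap roles, as the comment says.
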
@@ -6064,17 +6132,17 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::VASTART:
- return LowerVASTART(Op, DAG, PPCSubTarget);
+ return LowerVASTART(Op, DAG, Subtarget);
case ISD::VAARG:
- return LowerVAARG(Op, DAG, PPCSubTarget);
+ return LowerVAARG(Op, DAG, Subtarget);
case ISD::VACOPY:
- return LowerVACOPY(Op, DAG, PPCSubTarget);
+ return LowerVACOPY(Op, DAG, Subtarget);
- case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
+ case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget);
case ISD::DYNAMIC_STACKALLOC:
- return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
+ return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
@@ -6144,7 +6212,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
EVT VT = N->getValueType(0);
if (VT == MVT::i64) {
- SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, PPCSubTarget);
+ SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget);
Results.push_back(NewNode);
Results.push_back(NewNode.getValue(1));
@@ -6255,7 +6323,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
// registers without caring whether they're 32 or 64, but here we're
// doing actual arithmetic on the addresses.
- bool is64bit = PPCSubTarget.isPPC64();
+ bool is64bit = Subtarget.isPPC64();
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6450,7 +6518,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
unsigned BufReg = MI->getOperand(1).getReg();
- if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) {
+ if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
.addReg(PPC::X2)
.addImm(TOCOffset)
@@ -6463,12 +6531,12 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned BaseReg;
if (MF->getFunction()->getAttributes().hasAttribute(
AttributeSet::FunctionIndex, Attribute::Naked))
- BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1;
+ BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
else
- BaseReg = PPCSubTarget.isPPC64() ? PPC::BP8 : PPC::BP;
+ BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
MIB = BuildMI(*thisMBB, MI, DL,
- TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW))
+ TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
.addReg(BaseReg)
.addImm(BPOffset)
.addReg(BufReg);
@@ -6492,10 +6560,10 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// mainMBB:
// mainDstReg = 0
MIB = BuildMI(mainMBB, DL,
- TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+ TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
// Store IP
- if (PPCSubTarget.isPPC64()) {
+ if (Subtarget.isPPC64()) {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
.addReg(LabelReg)
.addImm(LabelOffset)
@@ -6607,7 +6675,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload TOC
- if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) {
+ if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
.addImm(TOCOffset)
.addReg(BufReg);
@@ -6645,7 +6713,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineFunction *F = BB->getParent();
- if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+ if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
MI->getOpcode() == PPC::SELECT_CC_I8 ||
MI->getOpcode() == PPC::SELECT_I4 ||
MI->getOpcode() == PPC::SELECT_I8)) {
@@ -6765,13 +6833,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
- BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC);
+ BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
- BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC);
+ BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
- BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC);
+ BB = EmitAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
- BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8);
+ BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8);
else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
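The NAND substitutions above are a semantic fix: atomicrmw nand stores
~(old & val), whereas the previously emitted andc computes old & ~val. The
two differ whenever the operands share a set bit:

    // old = val = 1 (32-bit):
    //   nand: ~(1 & 1) = 0xFFFFFFFE
    //   andc:   1 & ~1 = 0x00000000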
@@ -6862,7 +6930,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// We must use 64-bit registers for addresses when targeting 64-bit,
// since we're actually doing arithmetic on them. Other registers
// can be 32-bit.
- bool is64bit = PPCSubTarget.isPPC64();
+ bool is64bit = Subtarget.isPPC64();
bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
unsigned dest = MI->getOperand(0).getReg();
@@ -7070,10 +7138,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
EVT VT = Op.getValueType();
- if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) ||
- (VT == MVT::f64 && PPCSubTarget.hasFRE()) ||
- (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) ||
- (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) {
+ if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRE()) ||
+ (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+ (VT == MVT::v2f64 && Subtarget.hasVSX())) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal, we need to find the zero of the function:
@@ -7086,7 +7154,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
// correct after every iteration. The minimum architected relative
// accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
// 23 digits and double has 52 digits.
- int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+ int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
++Iterations;
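Spelling out the closed form for reference: with F(X) = 1/X - A and
F'(X) = -1/X^2,

    X_{i+1} = X_i - F(X_i)/F'(X_i) = X_i + X_i^2*(1/X_i - A) = X_i*(2 - A*X_i)

Each step squares the relative error, so starting from 2^-5 three steps give
2^-10, 2^-20, 2^-40 (past float's 23 fraction bits), and the extra f64
iteration gives 2^-80 (past double's 52); with hasRecipPrec() and 2^-14, one
step (2^-28) suffices for float and two (2^-56) for double.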
@@ -7133,10 +7201,10 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
EVT VT = Op.getValueType();
- if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
- (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE()) ||
- (VT == MVT::v4f32 && PPCSubTarget.hasAltivec()) ||
- (VT == MVT::v2f64 && PPCSubTarget.hasVSX())) {
+ if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+ (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
+ (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+ (VT == MVT::v2f64 && Subtarget.hasVSX())) {
// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
// For the reciprocal sqrt, we need to find the zero of the function:
@@ -7149,7 +7217,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
// correct after every iteration. The minimum architected relative
// accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
// 23 digits and double has 52 digits.
- int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+ int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
++Iterations;
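The corresponding closed form here: with F(X) = 1/X^2 - A and
F'(X) = -2/X^3,

    X_{i+1} = X_i - F(X_i)/F'(X_i) = X_i + (X_i/2)*(1 - A*X_i^2)
            = 0.5*X_i*(3 - A*X_i^2)

with the same error-squaring behavior and iteration counts as the reciprocal
case above.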
@@ -7266,10 +7334,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
if (!Visited.count(ChainLD->getChain().getNode()))
Queue.push_back(ChainLD->getChain().getNode());
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
- for (SDNode::op_iterator O = ChainNext->op_begin(),
- OE = ChainNext->op_end(); O != OE; ++O)
- if (!Visited.count(O->getNode()))
- Queue.push_back(O->getNode());
+ for (const SDUse &O : ChainNext->ops())
+ if (!Visited.count(O.getNode()))
+ Queue.push_back(O.getNode());
} else
LoadRoots.insert(ChainNext);
}
@@ -7312,7 +7379,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
- assert(PPCSubTarget.useCRBits() &&
+ assert(Subtarget.useCRBits() &&
"Expecting to be tracking CR bits");
// If we're tracking CR bits, we need to be careful that we don't have:
// trunc(binary-ops(zext(x), zext(y)))
@@ -7610,9 +7677,9 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
return SDValue();
if (!((N->getOperand(0).getValueType() == MVT::i1 &&
- PPCSubTarget.useCRBits()) ||
+ Subtarget.useCRBits()) ||
(N->getOperand(0).getValueType() == MVT::i32 &&
- PPCSubTarget.isPPC64())))
+ Subtarget.isPPC64())))
return SDValue();
if (N->getOperand(0).getOpcode() != ISD::AND &&
@@ -7930,8 +7997,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DCI.AddToWorklist(RV.getNode());
RV = DAGCombineFastRecip(RV, DCI);
if (RV.getNode()) {
- // Unfortunately, RV is now NaN if the input was exactly 0. Select out
- // this case and force the answer to 0.
+ // Unfortunately, RV is now NaN if the input was exactly 0. Select out
+ // this case and force the answer to 0.
EVT VT = RV.getValueType();
@@ -8051,6 +8118,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
+ bool isLittleEndian = Subtarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
@@ -8058,25 +8126,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
//
// The general idea is to expand a sequence of one or more unaligned
- // loads into a alignment-based permutation-control instruction (lvsl),
- // a series of regular vector loads (which always truncate their
- // input address to an aligned address), and a series of permutations.
- // The results of these permutations are the requested loaded values.
- // The trick is that the last "extra" load is not taken from the address
- // you might suspect (sizeof(vector) bytes after the last requested
- // load), but rather sizeof(vector) - 1 bytes after the last
- // requested vector. The point of this is to avoid a page fault if the
- // base address happened to be aligned. This works because if the base
- // address is aligned, then adding less than a full vector length will
- // cause the last vector in the sequence to be (re)loaded. Otherwise,
- // the next vector will be fetched as you might suspect was necessary.
+ // loads into an alignment-based permutation-control instruction (lvsl
+ // or lvsr), a series of regular vector loads (which always truncate
+ // their input address to an aligned address), and a series of
+ // permutations. The results of these permutations are the requested
+ // loaded values. The trick is that the last "extra" load is not taken
+ // from the address you might suspect (sizeof(vector) bytes after the
+ // last requested load), but rather sizeof(vector) - 1 bytes after the
+ // last requested vector. The point of this is to avoid a page fault if
+ // the base address happened to be aligned. This works because if the
+ // base address is aligned, then adding less than a full vector length
+ // will cause the last vector in the sequence to be (re)loaded.
+ // Otherwise, the next vector will be fetched as you might suspect was
+ // necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
- SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
- DAG, dl, MVT::v16i8);
+ Intrinsic::ID Intr = (isLittleEndian ?
+ Intrinsic::ppc_altivec_lvsr :
+ Intrinsic::ppc_altivec_lvsl);
+ SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
// Refine the alignment of the original load (a "new" load created here
// which was identical to the first except for the alignment would be
@@ -8125,8 +8196,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (ExtraLoad.getValueType() != MVT::v4i32)
ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
- SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
- BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+ // Because vperm has a big-endian bias, we must reverse the order
+ // of the input vectors and complement the permute control vector
+ // when generating little endian code. We have already handled the
+ // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+ // and ExtraLoad here.
+ SDValue Perm;
+ if (isLittleEndian)
+ Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+ ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+ else
+ Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+ BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != MVT::v4i32)
Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
@@ -8151,12 +8232,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
++UI;
SmallVector<SDValue, 8> Ops;
- for (SDNode::op_iterator O = User->op_begin(),
- OE = User->op_end(); O != OE; ++O) {
- if (*O == Use)
+ for (const SDUse &O : User->ops()) {
+ if (O == Use)
Ops.push_back(To);
else
- Ops.push_back(*O);
+ Ops.push_back(O);
}
DAG.UpdateNodeOperands(User, Ops);
@@ -8166,9 +8246,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
- case ISD::INTRINSIC_WO_CHAIN:
- if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
- Intrinsic::ppc_altivec_lvsl &&
+ case ISD::INTRINSIC_WO_CHAIN: {
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ Intrinsic::ID Intr = (isLittleEndian ?
+ Intrinsic::ppc_altivec_lvsr :
+ Intrinsic::ppc_altivec_lvsl);
+ if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
@@ -8180,8 +8263,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
- Intrinsic::ppc_altivec_lvsl) {
- // We've found another LVSL, and this address if an aligned
+ Intr) {
+ // We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
@@ -8190,6 +8273,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
+ }
break;
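This reuse is sound because lvsl/lvsr depend only on the low four bits of the
effective address, so adding any multiple of 16 leaves the permute control
vector unchanged.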
case ISD::BSWAP:
@@ -8537,11 +8621,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
case 'b': // R1-R31
- if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+ if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
case 'r': // R0-R31
- if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+ if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
case 'f':
@@ -8573,7 +8657,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// register.
// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
- if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() &&
+ if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
PPC::GPRCRegClass.contains(R.first)) {
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
return std::make_pair(TRI->getMatchingSuperReg(R.first,
@@ -8707,8 +8791,8 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
// the stack.
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setLRStoreRequired();
- bool isPPC64 = PPCSubTarget.isPPC64();
- bool isDarwinABI = PPCSubTarget.isDarwinABI();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -8762,8 +8846,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
// this table could be generated automatically from RegInfo.
unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
EVT VT) const {
- bool isPPC64 = PPCSubTarget.isPPC64();
- bool isDarwinABI = PPCSubTarget.isDarwinABI();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
(!isPPC64 && VT != MVT::i32))
@@ -8804,7 +8888,7 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
- if (this->PPCSubTarget.isPPC64()) {
+ if (Subtarget.isPPC64()) {
return MVT::i64;
} else {
return MVT::i32;
@@ -8863,7 +8947,7 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
return false;
if (VT.getSimpleVT().isVector()) {
- if (PPCSubTarget.hasVSX()) {
+ if (Subtarget.hasVSX()) {
if (VT != MVT::v2f64 && VT != MVT::v2i64)
return false;
} else {
@@ -8907,7 +8991,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
}
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
- if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
+ if (DisableILPPref || Subtarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
return Sched::ILP;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 080ef5d..df05aa5 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -18,7 +18,6 @@
#include "PPC.h"
#include "PPCInstrInfo.h"
#include "PPCRegisterInfo.h"
-#include "PPCSubtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
@@ -71,19 +70,14 @@ namespace llvm {
TOC_ENTRY,
- /// The following three target-specific nodes are used for calls through
+ /// The following two target-specific nodes are used for calls through
/// function pointers in the 64-bit SVR4 ABI.
- /// Restore the TOC from the TOC save area of the current stack frame.
- /// This is basically a hard coded load instruction which additionally
- /// takes/produces a flag.
- TOC_RESTORE,
-
/// Like a regular LOAD but additionally taking/producing a flag.
LOAD,
- /// LOAD into r2 (also taking/producing a flag). Like TOC_RESTORE, this is
- /// a hard coded load instruction.
+ /// Like LOAD (taking/producing a flag), but using r2 as hard-coded
+ /// destination.
LOAD_TOC,
/// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
@@ -303,25 +297,27 @@ namespace llvm {
namespace PPC {
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
- bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+ bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+ SelectionDAG &DAG);
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
- bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+ bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+ SelectionDAG &DAG);
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1, 2 or 4 bytes).
bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary);
+ bool isUnary, SelectionDAG &DAG);
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1, 2 or 4 bytes).
bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
- bool isUnary);
+ bool isUnary, SelectionDAG &DAG);
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
- int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
+ int isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG);
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
@@ -334,7 +330,7 @@ namespace llvm {
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
- unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
+ unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
/// get_VSPLTI_elt - If this is a build_vector of constants which can be
/// formed by using a vspltis[bhw] instruction of the specified element
@@ -343,8 +339,9 @@ namespace llvm {
SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
}
+ class PPCSubtarget;
class PPCTargetLowering : public TargetLowering {
- const PPCSubtarget &PPCSubTarget;
+ const PPCSubtarget &Subtarget;
public:
explicit PPCTargetLowering(PPCTargetMachine &TM);
@@ -613,11 +610,6 @@ namespace llvm {
extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
SDValue ArgVal, SDLoc dl) const;
- void
- setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
- unsigned nAltivecParamsAtEnd,
- unsigned MinReservedArea, bool isPPC64) const;
-
SDValue
LowerFormalArguments_Darwin(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index b71c09e..9318f70 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -802,17 +802,11 @@ def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
[(set i64:$rD,
(PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
-let hasSideEffects = 1, isCodeGenOnly = 1 in {
-let RST = 2, DS = 2 in
-def LDinto_toc: DSForm_1a<58, 0, (outs), (ins g8rc:$reg),
- "ld 2, 8($reg)", IIC_LdStLD,
- [(PPCload_toc i64:$reg)]>, isPPC64;
-
-let RST = 2, DS = 10, RA = 1 in
-def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
- "ld 2, 40(1)", IIC_LdStLD,
- [(PPCtoc_restore)]>, isPPC64;
-}
+let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2 in
+def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src),
+ "ld 2, $src", IIC_LdStLD,
+ [(PPCload_toc ixaddr:$src)]>, isPPC64;
+
def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src),
"ldx $rD, $src", IIC_LdStLD,
[(set i64:$rD, (load xaddr:$src))]>, isPPC64;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index f3c2eab..dce46d8 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -22,111 +22,127 @@ def vnot_ppc : PatFrag<(ops node:$in),
def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false,
+ *CurDAG);
}]>;
def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false,
+ *CurDAG);
}]>;
def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+ return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true,
+ *CurDAG);
}]>;
def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+ return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true,
+ *CurDAG);
}]>;
def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false,
+ *CurDAG);
}]>;
def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false,
+ *CurDAG);
}]>;
def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false,
+ *CurDAG);
}]>;
def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false,
+ *CurDAG);
}]>;
def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false,
+ *CurDAG);
}]>;
def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false,
+ *CurDAG);
}]>;
def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true,
+ *CurDAG);
}]>;
def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true,
+ *CurDAG);
}]>;
def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+ return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true,
+ *CurDAG);
}]>;
def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true,
+ *CurDAG);
}]>;
def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true,
+ *CurDAG);
}]>;
def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+ return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true,
+ *CurDAG);
}]>;
def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::isVSLDOIShuffleMask(N, false));
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, false, *CurDAG));
}]>;
def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVSLDOIShuffleMask(N, false) != -1;
+ return PPC::isVSLDOIShuffleMask(N, false, *CurDAG) != -1;
}], VSLDOI_get_imm>;
/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
/// vector_shuffle(X,undef,mask) by the dag combiner.
def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::isVSLDOIShuffleMask(N, true));
+ return getI32Imm(PPC::isVSLDOIShuffleMask(N, true, *CurDAG));
}]>;
def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
- return PPC::isVSLDOIShuffleMask(N, true) != -1;
+ return PPC::isVSLDOIShuffleMask(N, true, *CurDAG) != -1;
}], VSLDOI_unary_get_imm>;
// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 1));
+ return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG));
}]>;
def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
}], VSPLTB_get_imm>;
def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 2));
+ return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG));
}]>;
def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
}], VSPLTH_get_imm>;
def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
- return getI32Imm(PPC::getVSPLTImmediate(N, 4));
+ return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG));
}]>;
def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 7fed2c6..1e4396c 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -360,20 +360,6 @@ class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
let Inst{30-31} = xo;
}
-class DSForm_1a<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
- InstrItinClass itin, list<dag> pattern>
- : I<opcode, OOL, IOL, asmstr, itin> {
- bits<5> RST;
- bits<14> DS;
- bits<5> RA;
-
- let Pattern = pattern;
-
- let Inst{6-10} = RST;
- let Inst{11-15} = RA;
- let Inst{16-29} = DS;
- let Inst{30-31} = xo;
-}
// 1.7.6 X-Form
class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index fd72384..9bac91d 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
@@ -60,23 +61,25 @@ cl::Hidden);
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
-PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
- : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
- TM(tm), RI(*TM.getSubtargetImpl()) {}
+PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
+ : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+ Subtarget(STI), RI(STI) {}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
/// this target when scheduling the DAG.
-ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
- const TargetMachine *TM,
- const ScheduleDAG *DAG) const {
- unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
+ unsigned Directive =
+ static_cast<const PPCSubtarget *>(STI)->getDarwinDirective();
if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
- const InstrItineraryData *II = TM->getInstrItineraryData();
+ const InstrItineraryData *II =
+ &static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
return new ScoreboardHazardRecognizer(II, DAG);
}
- return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG);
+ return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
}
/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
@@ -84,17 +87,18 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
const InstrItineraryData *II,
const ScheduleDAG *DAG) const {
- unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ unsigned Directive =
+ DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
- if (Directive == PPC::DIR_PWR7)
+ if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
// Most subtargets use a PPC970 recognizer.
if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
- assert(TM.getInstrInfo() && "No InstrInfo?");
+ assert(DAG->TII && "No InstrInfo?");
- return new PPCHazardRecognizer970(TM);
+ return new PPCHazardRecognizer970(*DAG);
}
return new ScoreboardHazardRecognizer(II, DAG);
@@ -129,7 +133,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// On some cores, there is an additional delay between writing to a condition
// register, and using it from a branch.
- unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ unsigned Directive = Subtarget.getDarwinDirective();
switch (Directive) {
default: break;
case PPC::DIR_7400:
@@ -142,6 +146,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8:
Latency += 2;
break;
}
@@ -313,12 +318,13 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
// This function is used for scheduling, and the nop wanted here is the type
// that terminates dispatch groups on the POWER cores.
- unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+ unsigned Directive = Subtarget.getDarwinDirective();
unsigned Opcode;
switch (Directive) {
default: Opcode = PPC::NOP; break;
case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
+ case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */
}
DebugLoc DL;
@@ -332,7 +338,7 @@ bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
@@ -538,7 +544,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
assert((Cond.size() == 2 || Cond.size() == 0) &&
"PPC branch conditions have two components!");
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
// One-way branch.
if (!FBB) {
@@ -579,7 +585,7 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
const SmallVectorImpl<MachineOperand> &Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
- if (!TM.getSubtargetImpl()->hasISEL())
+ if (!Subtarget.hasISEL())
return false;
if (Cond.size() != 2)
@@ -623,7 +629,7 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
- assert(TM.getSubtargetImpl()->hasISEL() &&
+ assert(Subtarget.hasISEL() &&
"Cannot insert select on target without ISEL support");
// Get the register classes.
@@ -826,7 +832,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
FrameIdx));
NonRI = true;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(TM.getSubtargetImpl()->isDarwin() &&
+ assert(Subtarget.isDarwin() &&
"VRSAVE only needs spill/restore on Darwin");
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
.addReg(SrcReg,
@@ -921,7 +927,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
FrameIdx));
NonRI = true;
} else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- assert(TM.getSubtargetImpl()->isDarwin() &&
+ assert(Subtarget.isDarwin() &&
"VRSAVE only needs spill/restore on Darwin");
NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
get(PPC::RESTORE_VRSAVE),
@@ -1035,7 +1041,7 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
unsigned ZeroReg;
if (UseInfo->isLookupPtrRegClass()) {
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
} else {
ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ?
@@ -1102,7 +1108,7 @@ bool PPCInstrInfo::PredicateInstruction(
unsigned OpC = MI->getOpcode();
if (OpC == PPC::BLR) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
MI->setDesc(get(Pred[0].getImm() ?
(isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) :
(isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
@@ -1124,7 +1130,7 @@ bool PPCInstrInfo::PredicateInstruction(
return true;
} else if (OpC == PPC::B) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
MI->setDesc(get(Pred[0].getImm() ?
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
@@ -1162,7 +1168,7 @@ bool PPCInstrInfo::PredicateInstruction(
llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) :
@@ -1323,7 +1329,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
// for equality checks (as those don't depend on the sign). On PPC64,
// we are restricted to equality for unsigned 64-bit comparisons and for
// signed 32-bit comparisons the applicability is more restricted.
- bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+ bool isPPC64 = Subtarget.isPPC64();
bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW;
bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index d9db3e1..83f14c6 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -65,7 +65,7 @@ enum PPC970_Unit {
class PPCInstrInfo : public PPCGenInstrInfo {
- PPCTargetMachine &TM;
+ PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
bool StoreRegToStackSlot(MachineFunction &MF,
@@ -80,7 +80,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
bool &NonRI, bool &SpillsVRS) const;
virtual void anchor();
public:
- explicit PPCInstrInfo(PPCTargetMachine &TM);
+ explicit PPCInstrInfo(PPCSubtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@@ -89,7 +89,7 @@ public:
const PPCRegisterInfo &getRegisterInfo() const { return RI; }
ScheduleHazardRecognizer *
- CreateTargetHazardRecognizer(const TargetMachine *TM,
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index e421f8e..c2e3382 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -141,9 +141,6 @@ def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
[SDNPHasChain, SDNPSideEffect,
SDNPInGlue, SDNPOutGlue]>;
-def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
- [SDNPHasChain, SDNPSideEffect,
- SDNPInGlue, SDNPOutGlue]>;
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index 7bbc71b..e5f113a 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -13,7 +13,7 @@
#include "PPCJITInfo.h"
#include "PPCRelocations.h"
-#include "PPCTargetMachine.h"
+#include "PPCSubtarget.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -25,6 +25,11 @@ using namespace llvm;
static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+PPCJITInfo::PPCJITInfo(PPCSubtarget &STI)
+ : Subtarget(STI), is64Bit(STI.isPPC64()) {
+ useGOT = 0;
+}
+
#define BUILD_ADDIS(RD,RS,IMM16) \
((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
#define BUILD_ORI(RD,RS,UIMM16) \
@@ -393,7 +398,7 @@ void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
JCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1)
JCE.emitWordBE(0x7d6802a6); // mflr r11
JCE.emitWordBE(0xf9610060); // std r11, 96(r1)
- } else if (TM.getSubtargetImpl()->isDarwinABI()){
+ } else if (Subtarget.isDarwinABI()){
JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1)
JCE.emitWordBE(0x7d6802a6); // mflr r11
JCE.emitWordBE(0x91610028); // stw r11, 40(r1)
diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
index 0693e3e..b6b37ff 100644
--- a/lib/Target/PowerPC/PPCJITInfo.h
+++ b/lib/Target/PowerPC/PPCJITInfo.h
@@ -18,32 +18,29 @@
#include "llvm/Target/TargetJITInfo.h"
namespace llvm {
- class PPCTargetMachine;
+class PPCSubtarget;
+class PPCJITInfo : public TargetJITInfo {
+protected:
+ PPCSubtarget &Subtarget;
+ bool is64Bit;
- class PPCJITInfo : public TargetJITInfo {
- protected:
- PPCTargetMachine &TM;
- bool is64Bit;
- public:
- PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) {
- useGOT = 0;
- is64Bit = tmIs64Bit;
- }
+public:
+ PPCJITInfo(PPCSubtarget &STI);
- StubLayout getStubLayout() override;
- void *emitFunctionStub(const Function* F, void *Fn,
- JITCodeEmitter &JCE) override;
- LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
- void relocate(void *Function, MachineRelocation *MR,
- unsigned NumRelocs, unsigned char* GOTBase) override;
+ StubLayout getStubLayout() override;
+ void *emitFunctionStub(const Function *F, void *Fn,
+ JITCodeEmitter &JCE) override;
+ LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+ void relocate(void *Function, MachineRelocation *MR, unsigned NumRelocs,
+ unsigned char *GOTBase) override;
- /// replaceMachineCodeForFunction - Make it so that calling the function
- /// whose machine code is at OLD turns into a call to NEW, perhaps by
- /// overwriting OLD with a branch to NEW. This is used for self-modifying
- /// code.
- ///
- void replaceMachineCodeForFunction(void *Old, void *New) override;
- };
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ void replaceMachineCodeForFunction(void *Old, void *New) override;
+};
}
#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index e333b51..eca774e 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -973,6 +973,14 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
Offset += MI.getOperand(OffsetOperandNo).getImm();
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const MCInstrDesc &MCID = MI.getDesc();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.constrainRegClass(BaseReg,
+ TII.getRegClass(MCID, FIOperandNum, this, MF));
}
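A plausible reading of why the added constraint is needed: the base register
may end up in a memory access whose base-operand class excludes r0/x0
(register 0 reads as a literal zero in D-form addressing), so the class
computed by getRegClass() must be reapplied after the operand is rewritten.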
bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
index f742f72..dc16742 100644
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
+++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
@@ -16,9 +16,7 @@ using namespace llvm;
#define DEBUG_TYPE "powerpc-selectiondag-info"
-PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL)
+ : TargetSelectionDAGInfo(DL) {}
-PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {
-}
+PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {}
diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h
index 341b69c..b2e7f3b 100644
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h
+++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class PPCTargetMachine;
class PPCSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit PPCSelectionDAGInfo(const PPCTargetMachine &TM);
+ explicit PPCSelectionDAGInfo(const DataLayout *DL);
~PPCSelectionDAGInfo();
};
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index ea9daee..2e1b74a 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -32,15 +32,57 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "PPCGenSubtargetInfo.inc"
-PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit,
- CodeGenOpt::Level OptLevel)
- : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT),
- OptLevel(OptLevel) {
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const PPCSubtarget &ST) {
+ const Triple &T = ST.getTargetTriple();
+
+ std::string Ret;
+
+ // Most PPC* platforms are big endian, PPC64LE is little endian.
+ if (ST.isLittleEndian())
+ Ret = "e";
+ else
+ Ret = "E";
+
+ Ret += DataLayout::getManglingComponent(T);
+
+ // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+ // pointers.
+ if (!ST.isPPC64() || T.getOS() == Triple::Lv2)
+ Ret += "-p:32:32";
+
+ // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+ // documentation are wrong; these are correct (i.e. "what gcc does").
+ if (ST.isPPC64() || ST.isSVR4ABI())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+ if (ST.isPPC64())
+ Ret += "-n32:64";
+ else
+ Ret += "-n32";
+
+ return Ret;
+}
+
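Example strings this produces (triples assumed for illustration):

    // powerpc64le-unknown-linux-gnu : "e-m:e-i64:64-n32:64"
    // powerpc64-unknown-linux-gnu   : "E-m:e-i64:64-n32:64"
    // powerpc-unknown-linux-gnu     : "E-m:e-p:32:32-i64:64-n32"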
+PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
initializeEnvironment();
resetSubtargetFeatures(CPU, FS);
+ return *this;
}
+PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, PPCTargetMachine &TM,
+ bool is64Bit, CodeGenOpt::Level OptLevel)
+ : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT),
+ OptLevel(OptLevel),
+ FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+ DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this),
+ TLInfo(TM), TSInfo(&DL) {}
+
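Note the initialization idiom: initializeSubtargetDependencies() is invoked
from the member initializer list (FrameLowering(initializeSubtargetDependencies(CPU, FS)))
and returns *this, so the feature bits are parsed before FrameLowering, DL,
InstrInfo, and the other feature-dependent members are constructed.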
/// SetJITMode - This is called to inform the subtarget info that we are
/// producing code for the JIT.
void PPCSubtarget::SetJITMode() {
@@ -156,6 +198,11 @@ void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// Determine endianness.
IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
+
+ // FIXME: For now, we disable VSX in little-endian mode until endian
+ // issues in those instructions can be addressed.
+ if (IsLittleEndian)
+ HasVSX = false;
}
/// hasLazyResolverStub - Return true if accesses to the specified global have
@@ -200,6 +247,7 @@ static bool needsAggressiveScheduling(unsigned Directive) {
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR7:
+ case PPC::DIR_PWR8:
return true;
}
}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index ee43fd5..2a16699 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -14,7 +14,13 @@
#ifndef POWERPCSUBTARGET_H
#define POWERPCSUBTARGET_H
+#include "PPCFrameLowering.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
+#include "PPCJITInfo.h"
+#include "PPCSelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -50,6 +56,7 @@ namespace PPC {
DIR_PWR6,
DIR_PWR6X,
DIR_PWR7,
+ DIR_PWR8,
DIR_64
};
}
@@ -102,12 +109,19 @@ protected:
/// OptLevel - What default optimization level we're emitting code for.
CodeGenOpt::Level OptLevel;
+ PPCFrameLowering FrameLowering;
+ const DataLayout DL;
+ PPCInstrInfo InstrInfo;
+ PPCJITInfo JITInfo;
+ PPCTargetLowering TLInfo;
+ PPCSelectionDAGInfo TSInfo;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
PPCSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit,
+ const std::string &FS, PPCTargetMachine &TM, bool is64Bit,
CodeGenOpt::Level OptLevel);
/// ParseSubtargetFeatures - Parses features string setting specified
@@ -127,10 +141,21 @@ public:
///
unsigned getDarwinDirective() const { return DarwinDirective; }
- /// getInstrItins - Return the instruction itineraies based on subtarget
+ /// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+ const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ PPCJITInfo *getJITInfo() { return &JITInfo; }
+ const PPCTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+
+ /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+ /// so that we can use initializer lists for subtarget initialization.
+ PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
/// \brief Reset the features for the PowerPC target.
void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 2323add..9563b90 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -37,53 +37,12 @@ extern "C" void LLVMInitializePowerPCTarget() {
RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
}
-/// Return the datalayout string of a subtarget.
-static std::string getDataLayoutString(const PPCSubtarget &ST) {
- const Triple &T = ST.getTargetTriple();
-
- std::string Ret;
-
- // Most PPC* platforms are big endian, PPC64LE is little endian.
- if (ST.isLittleEndian())
- Ret = "e";
- else
- Ret = "E";
-
- Ret += DataLayout::getManglingComponent(T);
-
- // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
- // pointers.
- if (!ST.isPPC64() || T.getOS() == Triple::Lv2)
- Ret += "-p:32:32";
-
- // Note, the alignment values for f64 and i64 on ppc64 in Darwin
- // documentation are wrong; these are correct (i.e. "what gcc does").
- if (ST.isPPC64() || ST.isSVR4ABI())
- Ret += "-i64:64";
- else
- Ret += "-f64:32:64";
-
- // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
- if (ST.isPPC64())
- Ret += "-n32:64";
- else
- Ret += "-n32";
-
- return Ret;
-}
-
-PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
+PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OL,
- bool is64Bit)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64Bit, OL),
- DL(getDataLayoutString(Subtarget)), InstrInfo(*this),
- FrameLowering(Subtarget), JITInfo(*this, is64Bit),
- TLInfo(*this), TSInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData()) {
+ CodeGenOpt::Level OL, bool is64Bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this, is64Bit, OL) {
initAsmInfo();
}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 9e92494..4c7029c 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -14,11 +14,7 @@
#ifndef PPC_TARGETMACHINE_H
#define PPC_TARGETMACHINE_H
-#include "PPCFrameLowering.h"
-#include "PPCISelLowering.h"
#include "PPCInstrInfo.h"
-#include "PPCJITInfo.h"
-#include "PPCSelectionDAGInfo.h"
#include "PPCSubtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -29,13 +25,6 @@ namespace llvm {
///
class PPCTargetMachine : public LLVMTargetMachine {
PPCSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- PPCInstrInfo InstrInfo;
- PPCFrameLowering FrameLowering;
- PPCJITInfo JITInfo;
- PPCTargetLowering TLInfo;
- PPCSelectionDAGInfo TSInfo;
- InstrItineraryData InstrItins;
public:
PPCTargetMachine(const Target &T, StringRef TT,
@@ -43,25 +32,29 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64Bit);
- const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const PPCFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ const PPCInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
}
- PPCJITInfo *getJITInfo() override { return &JITInfo; }
+ const PPCFrameLowering *getFrameLowering() const override {
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
const PPCTargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
const PPCSelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
- const PPCRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ const PPCRegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
}
- const DataLayout *getDataLayout() const override { return &DL; }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; }
const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ return &getSubtargetImpl()->getInstrItineraryData();
}
// Pass Pipeline Configuration
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 949fdfb..713fc4b 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -17,6 +17,7 @@
namespace llvm {
class AMDGPUInstrPrinter;
+class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
class MCAsmInfo;
@@ -40,6 +41,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
@@ -47,14 +49,18 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
-FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
ImmutablePass *
createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
+void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
+extern char &SIFixSGPRLiveRangesID;
+
+
extern Target TheAMDGPUTarget;
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 2edc115..6ff9ab7 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -7,8 +7,7 @@
//
//==-----------------------------------------------------------------------===//
-// Include AMDIL TD files
-include "AMDILBase.td"
+include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// Subtarget Features
@@ -33,30 +32,25 @@ def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
"false",
"Disable the if conversion pass">;
-def FeatureFP64 : SubtargetFeature<"fp64",
+def FeatureFP64 : SubtargetFeature<"fp64",
"FP64",
"true",
- "Enable 64bit double precision operations">;
+ "Enable double precision operations">;
def Feature64BitPtr : SubtargetFeature<"64BitPtr",
"Is64bit",
"true",
- "Specify if 64bit addressing should be used.">;
-
-def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
- "Is32on64bit",
- "false",
- "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+ "Specify if 64-bit addressing should be used">;
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
"R600ALUInst",
"false",
- "Older version of ALU instructions encoding.">;
+ "Older version of ALU instructions encoding">;
def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
"HasVertexCache",
"true",
- "Specify use of dedicated vertex cache.">;
+ "Specify use of dedicated vertex cache">;
def FeatureCaymanISA : SubtargetFeature<"caymanISA",
"CaymanISA",
@@ -87,28 +81,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+ "localmemorysize"#Value,
+ "LocalMemorySize",
+ !cast<string>(Value),
+ "The size of local memory in bytes">;
+
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
Value#" GPU generation", Implies>;
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
def FeatureR600 : SubtargetFeatureGeneration<"R600",
- [FeatureR600ALUInst, FeatureFetchLimit8]>;
+ [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
def FeatureR700 : SubtargetFeatureGeneration<"R700",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16, FeatureWavefrontSize64]>;
+ [FeatureFetchLimit16, FeatureWavefrontSize64,
+ FeatureLocalMemorySize32768]
+>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [Feature64BitPtr, FeatureFP64]>;
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [Feature64BitPtr, FeatureFP64]>;
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -120,6 +126,10 @@ def AMDGPU : Target {
let InstructionSet = AMDGPUInstrInfo;
}
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
//===----------------------------------------------------------------------===//
// Predicate helper class
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 170f479..a6e217b 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -19,6 +19,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
@@ -35,6 +36,24 @@
using namespace llvm;
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+static uint32_t getFPMode(MachineFunction &) {
+ return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
+ FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
+}
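
For illustration, here is how a packed mode word of this kind decomposes. This is a minimal standalone sketch: the shift amounts and enumerator values below are placeholders, not the real FP_ROUND_MODE_* / FP_DENORM_MODE_* definitions from SIDefines.h.

    #include <cstdint>

    // Hypothetical field layout, for illustration only: SP round [1:0],
    // DP round [3:2], SP denorm [5:4], DP denorm [7:6].
    enum { ROUND_TO_NEAREST = 0, FLUSH_NONE = 3 };

    static uint32_t packFPMode(unsigned SPRound, unsigned DPRound,
                               unsigned SPDenorm, unsigned DPDenorm) {
      return (SPRound << 0) | (DPRound << 2) |
             (SPDenorm << 4) | (DPDenorm << 6);
    }

    int main() {
      // Mirrors the defaults in getFPMode(): round-to-nearest for single
      // and double precision, and no denormal flushing for either.
      uint32_t FloatMode = packFPMode(ROUND_TO_NEAREST, ROUND_TO_NEAREST,
                                      FLUSH_NONE, FLUSH_NONE);
      (void)FloatMode;
      return 0;
    }
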
static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
MCStreamer &Streamer) {
@@ -92,6 +111,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
false);
+ OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+ false);
+ OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer.emitRawComment(
@@ -279,16 +302,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (VCCUsed)
MaxSGPR += 2;
- ProgInfo.CodeLen = CodeSize;
- ProgInfo.NumSGPR = MaxSGPR;
ProgInfo.NumVGPR = MaxVGPR;
+ ProgInfo.NumSGPR = MaxSGPR;
+
+ // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+ // register.
+ ProgInfo.FloatMode = getFPMode(MF);
+
+ // XXX: Not quite sure what this does, but sc seems to unset this.
+ ProgInfo.IEEEMode = 0;
+
+ // Do not clamp NAN to 0.
+ ProgInfo.DX10Clamp = 0;
+
+ ProgInfo.CodeLen = CodeSize;
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
unsigned RsrcReg;
switch (MFI->ShaderType) {
default: // Fall through
@@ -298,25 +332,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
}
- OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
- S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
-
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- // LDS is allocated in 64 dword blocks
+ // LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
- // LDS is allocated in 128 dword blocks
+ // LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;
}
+
unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
if (MFI->ShaderType == ShaderType::COMPUTE) {
+ OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+ const uint32_t ComputePGMRSrc1 =
+ S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
+ S_00B848_PRIORITY(KernelInfo.Priority) |
+ S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
+ S_00B848_PRIV(KernelInfo.Priv) |
+ S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
+ S_00B848_DEBUG_MODE(KernelInfo.DebugMode) |
+ S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
+
+ OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+ } else {
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
}
+
if (MFI->ShaderType == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index 71adc9a..c1acb6e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -25,13 +25,28 @@ class AMDGPUAsmPrinter : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
- CodeLen(0),
+ NumVGPR(0),
NumSGPR(0),
- NumVGPR(0) {}
+ Priority(0),
+ FloatMode(0),
+ Priv(0),
+ DX10Clamp(0),
+ DebugMode(0),
+ IEEEMode(0),
+ CodeLen(0) {}
+ // Fields set in PGM_RSRC1 pm4 packet.
+ uint32_t NumVGPR;
+ uint32_t NumSGPR;
+ uint32_t Priority;
+ uint32_t FloatMode;
+ uint32_t Priv;
+ uint32_t DX10Clamp;
+ uint32_t DebugMode;
+ uint32_t IEEEMode;
+
+ // Bonus information for debugging.
uint64_t CodeLen;
- unsigned NumSGPR;
- unsigned NumVGPR;
};
void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const;
diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp
deleted file mode 100644
index 91aeee2..0000000
--- a/lib/Target/R600/AMDGPUConvertToISA.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief This pass lowers AMDIL machine instructions to the appropriate
-/// hardware instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUConvertToISAPass : public MachineFunctionPass {
-
-private:
- static char ID;
- TargetMachine &TM;
-
-public:
- AMDGPUConvertToISAPass(TargetMachine &tm) :
- MachineFunctionPass(ID), TM(tm) { }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {return "AMDGPU Convert to ISA";}
-
-};
-
-} // End anonymous namespace
-
-char AMDGPUConvertToISAPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
- return new AMDGPUConvertToISAPass(tm);
-}
-
-bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
- const AMDGPUInstrInfo * TII =
- static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
-
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- MachineBasicBlock &MBB = *BB;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
- }
- }
- return false;
-}
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index e7e90d3..9e8302e 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -83,7 +83,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
OffsetBytes += MFI->getObjectSize(i);
- // Each regiter holds 4 bytes, so we must always align the offset to at
+ // Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
}
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index f1f0bfa..b4d79e5 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -14,6 +14,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -83,6 +84,11 @@ private:
SDValue& Offset);
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
+ SDValue &ImmOffset) const;
+
+ SDNode *SelectADD_SUB_I64(SDNode *N);
+ SDNode *SelectDIV_SCALE(SDNode *N);
// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
@@ -211,51 +217,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
- case ISD::ADD: {
+ case ISD::ADD:
+ case ISD::SUB: {
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
- SDLoc DL(N);
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
-
- SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, LHS, Sub0);
- SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, LHS, Sub1);
-
- SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, RHS, Sub0);
- SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, RHS, Sub1);
-
- SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
-
- SmallVector<SDValue, 8> AddLoArgs;
- AddLoArgs.push_back(SDValue(Lo0, 0));
- AddLoArgs.push_back(SDValue(Lo1, 0));
-
- SDNode *AddLo = CurDAG->getMachineNode(
- isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32,
- DL, VTList, AddLoArgs);
- SDValue Carry = SDValue(AddLo, 1);
- SDNode *AddHi = CurDAG->getMachineNode(
- isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32,
- DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
-
- SDValue Args[5] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
- SDValue(AddLo,0),
- Sub0,
- SDValue(AddHi,0),
- Sub1,
- };
- return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+ return SelectADD_SUB_I64(N);
}
+ case ISD::SCALAR_TO_VECTOR:
+ case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
const AMDGPURegisterInfo *TRI =
@@ -264,7 +235,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
- assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ EVT EltVT = VT.getVectorElementType();
+ assert(EltVT.bitsEq(MVT::i32));
if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
bool UseVReg = true;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
@@ -305,7 +277,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// can't be bundled by our scheduler.
switch(NumVectorElts) {
case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
- case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+ case 4:
+ if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+ else
+ RegClassID = AMDGPU::R600_Reg128RegClassID;
+ break;
default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
}
@@ -313,8 +290,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
if (NumVectorElts == 1) {
- return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
- VT.getVectorElementType(),
+ return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT,
N->getOperand(0), RegClass);
}
@@ -323,11 +299,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
// 16 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
- SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1);
+ SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
bool IsRegSeq = true;
- for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ unsigned NOps = N->getNumOperands();
+ for (unsigned i = 0; i < NOps; i++) {
// XXX: Why is this here?
if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
IsRegSeq = false;
@@ -337,6 +314,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
}
+
+ if (NOps != NumVectorElts) {
+ // Fill in the missing undef elements if this was a scalar_to_vector.
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(N), EltVT);
+ for (unsigned i = NOps; i < NumVectorElts; ++i) {
+ RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+ }
+ }
+
if (!IsRegSeq)
break;
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
@@ -466,6 +457,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
PackedOffsetWidth);
}
+ case AMDGPUISD::DIV_SCALE: {
+ return SelectDIV_SCALE(N);
+ }
}
return SelectCode(N);
}
@@ -659,6 +653,129 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
+SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ bool IsAdd = (N->getOpcode() == ISD::ADD);
+
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+ SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub0);
+ SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, LHS, Sub1);
+
+ SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub0);
+ SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, RHS, Sub1);
+
+ SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+ SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+ unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+ if (!isCFDepth0()) {
+ Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32;
+ CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32;
+ }
+
+ SDNode *AddLo = CurDAG->getMachineNode(Opc, DL, VTList, AddLoArgs);
+ SDValue Carry(AddLo, 1);
+ SDNode *AddHi
+ = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32,
+ SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
+
+ SDValue Args[5] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+ SDValue(AddLo,0),
+ Sub0,
+ SDValue(AddHi,0),
+ Sub1,
+ };
+ return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
+}
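
As a cross-check on the selection above, a scalar C++ model of the carry chain it emits: the low halves are added (or subtracted) first, and the carry-out (or borrow) feeds the high-half op, just as S_ADD_I32/S_ADDC_U32 and their VALU counterparts do. The helper is illustrative, not part of the patch.

    #include <cassert>
    #include <cstdint>

    static uint64_t addSub64(uint64_t LHS, uint64_t RHS, bool IsAdd) {
      uint32_t Lo0 = uint32_t(LHS), Hi0 = uint32_t(LHS >> 32);
      uint32_t Lo1 = uint32_t(RHS), Hi1 = uint32_t(RHS >> 32);
      uint32_t Lo, Hi;
      if (IsAdd) {
        Lo = Lo0 + Lo1;
        uint32_t Carry = Lo < Lo0;     // carry-out of the low add
        Hi = Hi0 + Hi1 + Carry;        // consumed by the high add
      } else {
        Lo = Lo0 - Lo1;
        uint32_t Borrow = Lo0 < Lo1;   // borrow-out of the low sub
        Hi = Hi0 - Hi1 - Borrow;       // consumed by the high sub
      }
      return (uint64_t(Hi) << 32) | Lo;
    }

    int main() {
      assert(addSub64(0x00000001FFFFFFFFull, 1, true) == 0x0000000200000000ull);
      assert(addSub64(0x0000000200000000ull, 1, false) == 0x00000001FFFFFFFFull);
      return 0;
    }
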
+
+SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ assert(VT == MVT::f32 || VT == MVT::f64);
+
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+ const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+
+ SDValue Ops[] = {
+ N->getOperand(0),
+ N->getOperand(1),
+ N->getOperand(2),
+ Zero,
+ Zero,
+ Zero,
+ Zero
+ };
+
+ return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+ return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
+ Ptr), 0);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
+ SDValue &Offset,
+ SDValue &ImmOffset) const {
+ SDLoc DL(Addr);
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+ if (isUInt<12>(C1->getZExtValue())) {
+
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add (add N2, N3), C1)
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
+ Offset = N3;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return true;
+ }
+
+ // (add N0, C1)
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));
+ Offset = N0;
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return true;
+ }
+ }
+ if (Addr.getOpcode() == ISD::ADD) {
+ // (add N0, N1)
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
+ Offset = N1;
+ ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+ }
+
+ // default case
+ Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
+ Offset = Addr;
+ ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 6c443ea..0ada7a3 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -16,9 +16,9 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -84,13 +84,37 @@ static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
#include "AMDGPUGenCallingConv.inc"
+// Find a larger type to do a load / store of a vector with.
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
+ unsigned StoreSize = VT.getStoreSizeInBits();
+ if (StoreSize <= 32)
+ return EVT::getIntegerVT(Ctx, StoreSize);
+
+ assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
+// Type for a vector that will be loaded to.
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+ unsigned StoreSize = VT.getStoreSizeInBits();
+ if (StoreSize <= 32)
+ return EVT::getIntegerVT(Ctx, 32);
+
+ return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
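
To make the mapping concrete, a string-returning stand-in for the EVT-based helper (getEquivalentLoadRegType differs only in widening sub-32-bit sizes to a full i32):

    #include <cassert>
    #include <string>

    // Restatement of getEquivalentMemType: sizes up to 32 bits become one
    // integer of that width; larger sizes become <N x i32> vectors.
    static std::string equivalentMemType(unsigned StoreSizeInBits) {
      if (StoreSizeInBits <= 32)
        return "i" + std::to_string(StoreSizeInBits);
      assert(StoreSizeInBits % 32 == 0 && "Store size not a multiple of 32");
      return "v" + std::to_string(StoreSizeInBits / 32) + "i32";
    }

    int main() {
      assert(equivalentMemType(16) == "i16");    // f16/i16 -> i16
      assert(equivalentMemType(64) == "v2i32");  // v2f32/i64 -> v2i32
      assert(equivalentMemType(128) == "v4i32"); // v4f32 -> v4i32
      return 0;
    }
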
+
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
TargetLowering(TM, new TargetLoweringObjectFileELF()) {
Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
- // Initialize target lowering borrowed from AMDIL
- InitAMDILLowering();
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
// We need to custom lower some of the intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -107,9 +131,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- // The hardware supports ROTR, but not ROTL
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
-
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -118,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -161,6 +185,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
@@ -202,29 +229,63 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+ }
- setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
- setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIV, VT, Expand);
- setOperationAction(ISD::MUL, MVT::i64, Expand);
- setOperationAction(ISD::SUB, MVT::i64, Expand);
+ // The GPU does not have a divrem instruction for signed or unsigned division.
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+
+ // The GPU does not have [S|U]MUL_LOHI as a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ // The hardware supports 32-bit ROTR, but not ROTL.
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+ setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
- setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- static const MVT::SimpleValueType IntTypes[] = {
+ static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
- for (MVT VT : IntTypes) {
- //Expand the following operations for the current type by default
+ for (MVT VT : VectorIntTypes) {
+ // Expand the following operations for the current type by default.
setOperationAction(ISD::ADD, VT, Expand);
setOperationAction(ISD::AND, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
@@ -232,40 +293,93 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::MUL, VT, Expand);
setOperationAction(ISD::OR, VT, Expand);
setOperationAction(ISD::SHL, VT, Expand);
- setOperationAction(ISD::SINT_TO_FP, VT, Expand);
- setOperationAction(ISD::SRL, VT, Expand);
setOperationAction(ISD::SRA, VT, Expand);
+ setOperationAction(ISD::SRL, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+ // TODO: Implement custom UREM / SREM routines.
+ setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::XOR, VT, Expand);
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Expand);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTLZ, VT, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
- static const MVT::SimpleValueType FloatTypes[] = {
+ static const MVT::SimpleValueType FloatVectorTypes[] = {
MVT::v2f32, MVT::v4f32
};
- for (MVT VT : FloatTypes) {
+ for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
+ setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT_CC);
+
+ setSchedulingPreference(Sched::RegPressure);
+ setJumpIsExpensive(true);
+
+ setSelectIsExpensive(false);
+ PredictableSelectIsExpensive = false;
+
+ // There are no integer divide instructions, and these expand to a pretty
+ // large sequence of instructions.
+ setIntDivIsCheap(false);
+ setPow2DivIsCheap(false);
+
+ // TODO: Investigate this when 64-bit divides are implemented.
+ addBypassSlowDiv(64, 32);
+
+ // FIXME: Need to really handle these.
+ MaxStoresPerMemcpy = 4096;
+ MaxStoresPerMemmove = 4096;
+ MaxStoresPerMemset = 4096;
}
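
The addBypassSlowDiv(64, 32) hint asks the generic slow-division bypass to guard 64-bit divides with a runtime width check. Roughly this shape, as a sketch of the resulting control flow rather than code from this patch:

    #include <cassert>
    #include <cstdint>

    static uint64_t udiv64WithBypass(uint64_t A, uint64_t B) {
      if (((A | B) >> 32) == 0)            // both operands fit in 32 bits
        return uint32_t(A) / uint32_t(B);  // cheap 32-bit divide
      return A / B;                        // full-width slow path
    }

    int main() {
      assert(udiv64WithBypass(100, 7) == 14);
      assert(udiv64WithBypass(1ull << 40, 2) == (1ull << 39));
      return 0;
    }
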
//===----------------------------------------------------------------------===//
@@ -276,6 +390,23 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
return MVT::i32;
}
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+ return true;
+}
+
+// The backend supports 32 and 64 bit floating point immediates.
+// FIXME: Why are we reporting vectors of FP immediates as legal?
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ EVT ScalarVT = VT.getScalarType();
+ return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
+}
+
+// We don't want to shrink f64 / f32 constants.
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+ EVT ScalarVT = VT.getScalarType();
+ return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -330,6 +461,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ return isZExtFree(Val.getValueType(), VT2);
+}
+
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -383,25 +518,28 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
return SDValue();
}
-SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
- const {
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
Op.getNode()->dump();
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
- // AMDIL DAG lowering
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
- case ISD::BRCOND: return LowerBRCOND(Op, DAG);
- // AMDGPU DAG lowering
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
+ case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+ case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+ case ISD::FCEIL: return LowerFCEIL(Op, DAG);
+ case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
+ case ISD::FRINT: return LowerFRINT(Op, DAG);
+ case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+ case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
}
return Op;
@@ -419,95 +557,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
- case ISD::UDIV: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
- N->getOperand(0), N->getOperand(1));
- Results.push_back(UDIVREM);
- break;
- }
- case ISD::UREM: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
- N->getOperand(0), N->getOperand(1));
- Results.push_back(UDIVREM.getValue(1));
- break;
- }
- case ISD::UDIVREM: {
- SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-
- SDValue one = DAG.getConstant(1, HalfVT);
- SDValue zero = DAG.getConstant(0, HalfVT);
-
- //HiLo split
- SDValue LHS = N->getOperand(0);
- SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
- SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
-
- SDValue RHS = N->getOperand(1);
- SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
- SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
-
- // Get Speculative values
- SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
-
- SDValue REM_Hi = zero;
- SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
-
- SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
- SDValue DIV_Lo = zero;
-
- const unsigned halfBitWidth = HalfVT.getSizeInBits();
-
- for (unsigned i = 0; i < halfBitWidth; ++i) {
- SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
- // Get Value of high bit
- SDValue HBit;
- if (halfBitWidth == 32 && Subtarget->hasBFE()) {
- HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
- } else {
- HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
- HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
- }
-
- SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
- DAG.getConstant(halfBitWidth - 1, HalfVT));
- REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
- REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
- REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
- REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
-
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
- SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
- SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
-
- DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-
- // Update REM
-
- SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
- REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
- REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
- REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
- }
+ case ISD::LOAD: {
+ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+ if (!Node)
+ return;
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
- Results.push_back(DIV);
- Results.push_back(REM);
- break;
+ Results.push_back(SDValue(Node, 0));
+ Results.push_back(SDValue(Node, 1));
+ // XXX: LLVM does not seem to replace the chain value inside the
+ // CustomWidenLowerNode function.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+ return;
+ }
+ case ISD::STORE: {
+ SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
+ if (Lowered.getNode())
+ Results.push_back(Lowered);
+ return;
}
default:
return;
@@ -531,12 +597,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
SDLoc DL(InitPtr);
+ Type *InitTy = Init->getType();
+
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
- EVT VT = EVT::getEVT(CI->getType());
- PointerType *PtrTy = PointerType::get(CI->getType(), 0);
- return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
- TD->getPrefTypeAlignment(CI->getType()));
+ EVT VT = EVT::getEVT(InitTy);
+ PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+ return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
@@ -547,7 +615,6 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
TD->getPrefTypeAlignment(CFP->getType()));
}
- Type *InitTy = Init->getType();
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
const StructLayout *SL = TD->getStructLayout(ST);
@@ -589,6 +656,14 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
+ if (isa<UndefValue>(Init)) {
+ EVT VT = EVT::getEVT(InitTy);
+ PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
+ return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(InitTy));
+ }
+
Init->dump();
llvm_unreachable("Unhandled constant initializer");
}
@@ -628,11 +703,19 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Size = TD->getTypeAllocSize(EltType);
unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+ MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+
+ int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+ SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
+
const GlobalVariable *Var = cast<GlobalVariable>(GV);
+ if (!Var->hasInitializer()) {
+ // This has no use, but bugpoint will hit it.
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
+ }
+
const Constant *Init = Var->getInitializer();
- int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
- SDValue InitPtr = DAG.getFrameIndex(FI,
- getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
SmallVector<SDNode*, 8> WorkList;
for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
@@ -651,8 +734,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
}
DAG.UpdateNodeOperands(*I, Ops);
}
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
- getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
}
}
}
@@ -688,8 +770,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
- FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
- assert(FIN);
+ FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
@@ -705,26 +786,66 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
default: return Op;
- case AMDGPUIntrinsic::AMDIL_abs:
+ case AMDGPUIntrinsic::AMDGPU_abs:
+ case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
return LowerIntrinsicIABS(Op, DAG);
- case AMDGPUIntrinsic::AMDIL_exp:
- return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_lrp:
return LowerIntrinsicLRP(Op, DAG);
- case AMDGPUIntrinsic::AMDIL_fraction:
+ case AMDGPUIntrinsic::AMDGPU_fract:
+ case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDIL_max:
- return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
+
+ case AMDGPUIntrinsic::AMDGPU_clamp:
+ case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+ return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_div_scale: {
+ // The 3rd parameter is required to be a constant.
+ const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ if (!Param)
+ return DAG.getUNDEF(VT);
+
+ // Translate to the operands expected by the machine instruction. The
+ // first operand repeats whichever division operand the constant selects.
+ SDValue Numerator = Op.getOperand(1);
+ SDValue Denominator = Op.getOperand(2);
+ SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
+ Src0, Denominator, Numerator);
+ }
+
+ case Intrinsic::AMDGPU_div_fmas:
+ return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_div_fixup:
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::AMDGPU_trig_preop:
+ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::AMDGPU_rcp:
+ return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::AMDGPU_rsq:
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::AMDGPU_rsq_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+
case AMDGPUIntrinsic::AMDGPU_imax:
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_umax:
return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDIL_min:
- return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imin:
return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
Op.getOperand(2));
@@ -748,6 +869,18 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
+
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -771,8 +904,16 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDIL_round_nearest:
+ case AMDGPUIntrinsic::AMDGPU_brev:
+ return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
+ return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
+
+ case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
+ case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
+ return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
}
}
@@ -863,27 +1004,41 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+ EVT LoadVT = Op.getValueType();
EVT EltVT = Op.getValueType().getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
+
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
SmallVector<SDValue, 8> Loads;
+ SmallVector<SDValue, 8> Chains;
+
SDLoc SL(Op);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
- Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- MachinePointerInfo(Load->getMemOperand()->getValue()),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->getAlignment()));
+
+ SDValue NewLoad
+ = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+ Load->getChain(), Ptr,
+ MachinePointerInfo(Load->getMemOperand()->getValue()),
+ MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment());
+ Loads.push_back(NewLoad.getValue(0));
+ Chains.push_back(NewLoad.getValue(1));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
- StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT MemVT = Store->getMemoryVT();
unsigned MemBits = MemVT.getSizeInBits();
@@ -981,7 +1136,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Load->getBasePtr(),
MemVT,
Load->getMemOperand());
- return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
+ ExtLoad32.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
@@ -995,7 +1156,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
// Lower loads of constant address space global variables.
@@ -1003,11 +1170,12 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
isa<GlobalVariable>(
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
+
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, MVT::i32));
- return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
}
@@ -1034,10 +1202,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemEltVT = MemVT.getScalarType();
if (ExtType == ISD::SEXTLOAD) {
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
- return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+ SDValue Ops[] = {
+ DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1097,6 +1276,251 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ MVT INTTY;
+ MVT FLTTY;
+ if (!OVT.isVector()) {
+ INTTY = MVT::i32;
+ FLTTY = MVT::f32;
+ } else if (OVT.getVectorNumElements() == 2) {
+ INTTY = MVT::v2i32;
+ FLTTY = MVT::v2f32;
+ } else if (OVT.getVectorNumElements() == 4) {
+ INTTY = MVT::v4i32;
+ FLTTY = MVT::v4f32;
+ } else {
+ llvm_unreachable("Unsupported vector size for SDIV24 lowering");
+ }
+ unsigned bitsize = OVT.getScalarType().getSizeInBits();
+ // char|short jq = ia ^ ib;
+ SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+ // jq = jq >> (bitsize - 2)
+ jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
+
+ // jq = jq | 0x1
+ jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+ // jq = (int)jq
+ jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+ // int ia = (int)LHS;
+ SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+ // int ib = (int)RHS;
+ SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+ // float fa = (float)ia;
+ SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+ // float fb = (float)ib;
+ SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+ // float fq = native_divide(fa, fb);
+ SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
+ fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
+
+ // fq = trunc(fq);
+ fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+ // float fqneg = -fq;
+ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+ // float fr = mad(fqneg, fb, fa);
+ SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
+ DAG.getNode(ISD::FMUL, DL, FLTTY, fqneg, fb), fa);
+
+ // int iq = (int)fq;
+ SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+ // fr = fabs(fr);
+ fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+ // fb = fabs(fb);
+ fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+ // int cv = fr >= fb;
+ SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+ // jq = (cv ? jq : 0);
+ jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
+ DAG.getConstant(0, OVT));
+ // dst = iq + jq;
+ iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+ iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+ return iq;
+}
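
The commented steps above correspond to the following scalar model, shown here with exact float division standing in for the hardware RCP; the fr >= |fb| test repairs a quotient that came out one short of the truncated result.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Scalar model of LowerSDIV24 for 16-bit inputs (bitsize == 16).
    static int sdiv24(int16_t ia, int16_t ib) {
      int jq = (ia ^ ib) >> 14;            // bitsize - 2: sign of quotient
      jq |= 1;                             // +1 or -1
      float fa = (float)ia;
      float fb = (float)ib;
      float fq = std::truncf(fa * (1.0f / fb));  // native_divide + trunc
      float fr = std::fabsf(-fq * fb + fa);      // |remainder estimate|
      int iq = (int)fq;
      if (fr >= std::fabsf(fb))            // quotient one too small: fix up
        iq += jq;
      return iq;
    }

    int main() {
      for (int a = -100; a <= 100; a += 7)
        for (int b = 1; b <= 50; b += 3) {
          assert(sdiv24((int16_t)a, (int16_t)b) == a / b);
          assert(sdiv24((int16_t)a, (int16_t)-b) == a / -b);
        }
      return 0;
    }
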
+
+SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ // The LowerSDIV32 function generates code equivalent to the following IL.
+ // mov r0, LHS
+ // mov r1, RHS
+ // ilt r10, r0, 0
+ // ilt r11, r1, 0
+ // iadd r0, r0, r10
+ // iadd r1, r1, r11
+ // ixor r0, r0, r10
+ // ixor r1, r1, r11
+ // udiv r0, r0, r1
+ // ixor r10, r10, r11
+ // iadd r0, r0, r10
+ // ixor DST, r0, r10
+
+ // mov r0, LHS
+ SDValue r0 = LHS;
+
+ // mov r1, RHS
+ SDValue r1 = RHS;
+
+ // ilt r10, r0, 0
+ SDValue r10 = DAG.getSelectCC(DL,
+ r0, DAG.getConstant(0, OVT),
+ DAG.getConstant(-1, OVT),
+ DAG.getConstant(0, OVT),
+ ISD::SETLT);
+
+ // ilt r11, r1, 0
+ SDValue r11 = DAG.getSelectCC(DL,
+ r1, DAG.getConstant(0, OVT),
+ DAG.getConstant(-1, OVT),
+ DAG.getConstant(0, OVT),
+ ISD::SETLT);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // iadd r1, r1, r11
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+ // ixor r0, r0, r10
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+ // ixor r1, r1, r11
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+ // udiv r0, r0, r1
+ r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+ // ixor r10, r10, r11
+ r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // ixor DST, r0, r10
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+ return DST;
+}
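
The same IL, as a runnable scalar restatement: make both operands non-negative with the (x + s) ^ s trick (s is the all-ones sign mask), divide unsigned, then re-sign the quotient with s = sign(LHS) ^ sign(RHS):

    #include <cassert>
    #include <cstdint>

    static int32_t sdiv32(int32_t LHS, int32_t RHS) {
      uint32_t r10 = LHS < 0 ? ~0u : 0u;          // ilt r10, r0, 0
      uint32_t r11 = RHS < 0 ? ~0u : 0u;          // ilt r11, r1, 0
      uint32_t r0 = ((uint32_t)LHS + r10) ^ r10;  // |LHS|
      uint32_t r1 = ((uint32_t)RHS + r11) ^ r11;  // |RHS|
      uint32_t q = r0 / r1;                       // udiv
      uint32_t s = r10 ^ r11;                     // quotient sign mask
      return (int32_t)((q + s) ^ s);              // iadd then ixor
    }

    int main() {
      assert(sdiv32(-7, 2) == -3);
      assert(sdiv32(7, -2) == -3);
      assert(sdiv32(-7, -2) == 3);
      return 0;
    }
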
+
+SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT OVT = Op.getValueType().getScalarType();
+
+ if (OVT == MVT::i64)
+ return LowerSDIV64(Op, DAG);
+
+ if (OVT == MVT::i32)
+ return LowerSDIV32(Op, DAG);
+
+ if (OVT == MVT::i16 || OVT == MVT::i8) {
+ // FIXME: We should be checking for the masked bits. This isn't reached
+ // because i8 and i16 are not legal types.
+ return LowerSDIV24(Op, DAG);
+ }
+
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OVT = Op.getValueType();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ // The LowerSREM32 function generates code equivalent to the following IL.
+ // mov r0, LHS
+ // mov r1, RHS
+ // ilt r10, r0, 0
+ // ilt r11, r1, 0
+ // iadd r0, r0, r10
+ // iadd r1, r1, r11
+ // ixor r0, r0, r10
+ // ixor r1, r1, r11
+ // udiv r20, r0, r1
+ // umul r20, r20, r1
+ // sub r0, r0, r20
+ // iadd r0, r0, r10
+ // ixor DST, r0, r10
+
+ // mov r0, LHS
+ SDValue r0 = LHS;
+
+ // mov r1, RHS
+ SDValue r1 = RHS;
+
+ // ilt r10, r0, 0
+ SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+
+ // ilt r11, r1, 0
+ SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // iadd r1, r1, r11
+ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+ // ixor r0, r0, r10
+ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+ // ixor r1, r1, r11
+ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+ // udiv r20, r0, r1
+ SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+ // umul r20, r20, r1
+ r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+
+ // sub r0, r0, r20
+ r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+ // iadd r0, r0, r10
+ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+ // ixor DST, r0, r10
+ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+ return DST;
+}
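
And the remainder path, which reuses the unsigned machinery and then gives the result the sign of LHS (r10), matching C's truncated-division semantics:

    #include <cassert>
    #include <cstdint>

    static int32_t srem32(int32_t LHS, int32_t RHS) {
      uint32_t r10 = LHS < 0 ? ~0u : 0u;          // sign mask of LHS
      uint32_t r11 = RHS < 0 ? ~0u : 0u;          // sign mask of RHS
      uint32_t r0 = ((uint32_t)LHS + r10) ^ r10;  // |LHS|
      uint32_t r1 = ((uint32_t)RHS + r11) ^ r11;  // |RHS|
      uint32_t r20 = (r0 / r1) * r1;              // udiv then umul
      r0 -= r20;                                  // unsigned remainder
      return (int32_t)((r0 + r10) ^ r10);         // remainder takes LHS sign
    }

    int main() {
      assert(srem32(-7, 2) == -1);
      assert(srem32(7, -2) == 1);
      assert(srem32(-7, -2) == -1);
      return 0;
    }
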
+
+SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
+ return SDValue(Op.getNode(), 0);
+}
+
+SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
+ EVT OVT = Op.getValueType();
+
+ if (OVT.getScalarType() == MVT::i64)
+ return LowerSREM64(Op, DAG);
+
+ if (OVT.getScalarType() == MVT::i32)
+ return LowerSREM32(Op, DAG);
+
+ return SDValue(Op.getNode(), 0);
+}
+
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -1201,6 +1625,177 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Ops, DL);
}
+SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue NegOne = DAG.getConstant(-1, VT);
+
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
+ SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
+ SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
+ SDValue RSign = LHSign; // Remainder sign is the same as LHS
+
+ LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
+ RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
+
+ LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
+ RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
+
+ SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
+ SDValue Rem = Div.getValue(1);
+
+ Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
+ Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
+
+ Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
+ Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
+
+ SDValue Res[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Res, DL);
+}
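
A scalar restatement of the sign fix-up used here: one unsigned divrem, then the quotient is re-signed with sign(LHS) ^ sign(RHS) and the remainder with sign(LHS), using (x ^ s) - s == -x when s is all-ones.

    #include <cassert>
    #include <cstdint>
    #include <utility>

    static std::pair<int32_t, int32_t> sdivrem32(int32_t LHS, int32_t RHS) {
      uint32_t LHSign = LHS < 0 ? ~0u : 0u;
      uint32_t RHSign = RHS < 0 ? ~0u : 0u;
      uint32_t DSign = LHSign ^ RHSign;             // quotient sign
      uint32_t RSign = LHSign;                      // remainder follows LHS

      uint32_t L = ((uint32_t)LHS + LHSign) ^ LHSign;  // |LHS|
      uint32_t R = ((uint32_t)RHS + RHSign) ^ RHSign;  // |RHS|

      uint32_t Div = ((L / R) ^ DSign) - DSign;     // xor then sub, as above
      uint32_t Rem = ((L % R) ^ RSign) - RSign;
      return {(int32_t)Div, (int32_t)Rem};
    }

    int main() {
      std::pair<int32_t, int32_t> DR = sdivrem32(-7, 2);
      assert(DR.first == -3 && DR.second == -1);
      return 0;
    }
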
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // result = trunc(src)
+ // if (src > 0.0 && src != result)
+ // result += 1.0
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+ SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+ SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+ SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
+
+ SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ const SDValue Zero = DAG.getConstant(0, MVT::i32);
+ const SDValue One = DAG.getConstant(1, MVT::i32);
+
+ SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ // Extract the upper half, since this is where we will find the sign and
+ // exponent.
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+ const unsigned FractBits = 52;
+ const unsigned ExpBits = 11;
+
+ // Extract the exponent.
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+ Hi,
+ DAG.getConstant(FractBits - 32, MVT::i32),
+ DAG.getConstant(ExpBits, MVT::i32));
+ SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+ DAG.getConstant(1023, MVT::i32));
+
+ // Extract the sign bit.
+ const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
+ SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+ // Extend back to 64 bits.
+ SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ Zero, SignBit);
+ SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+ SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+ const SDValue FractMask
+ = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
+
+ SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+ SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+ SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+ const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
+
+ SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+ SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+ SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+ SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
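+
+// Equivalent host-side model (sketch only) of the masking logic above:
+//   double trunc64(double x) {
+//     uint64_t Bits;  memcpy(&Bits, &x, 8);
+//     int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;
+//     if (Exp > 51) return x;                  // already integral (or inf/nan)
+//     uint64_t Frac = Exp < 0 ? ~(1ull << 63)  // |x| < 1: keep only the sign
+//                             : ((1ull << 52) - 1) >> Exp;
+//     Bits &= ~Frac;  memcpy(&x, &Bits, 8);
+//     return x;
+//   }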
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ assert(Op.getValueType() == MVT::f64);
+
+ APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
+ SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
+ SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+ SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+ SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+ SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+ APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
+ SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+ SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+ return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
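+
+// Sketch of the trick (assumes round-to-nearest mode): doubles at magnitude
+// 2^52 are spaced exactly 1.0 apart, so x + copysign(2^52, x) rounds the
+// fraction away and the subtraction recovers the rounded value, e.g.
+//   rint(2.5) -> (2.5 + 2^52) - 2^52 == 2.0   (tie rounds to even)
+// Inputs with |x| > 0x1.fffffffffffffp+51 are already integral, so the final
+// select passes them through unchanged.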
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+ // FNEARBYINT and FRINT are the same, except in their handling of FP
+ // exceptions. Those aren't really meaningful for us, and OpenCL only has
+ // rint, so just treat them as equivalent.
+ return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // result = trunc(src);
+ // if (src < 0.0 && src != result)
+ // result += -1.0.
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
+ const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
+
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+
+ SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
+ SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+ SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+ SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
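+
+// Worked example (illustrative) of the shared trunc-and-adjust pattern:
+//   ceil(2.5):   trunc = 2.0,  src > 0 && src != trunc  ->  2.0 + 1.0 =  3.0
+//   floor(-2.5): trunc = -2.0, src < 0 && src != trunc  -> -2.0 + -1.0 = -3.0
+// Integral inputs fail the src != trunc test and pass through unchanged.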
+
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue S0 = Op.getOperand(0);
@@ -1218,7 +1813,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
-
}
SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
@@ -1303,6 +1897,37 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, MVT::i32);
}
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+
+ if (VT.isVector() || VT.getSizeInBits() > 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Mul;
+
+ if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+ N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+ } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+ N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+ N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+ Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+ } else {
+ return SDValue();
+ }
+
+ // We need to use sext even for MUL_U24, because MUL_U24 is used
+ // for signed multiplies of 8- and 16-bit types.
+ return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
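+
+// In pseudo-IR terms, the fold above is (value names illustrative):
+//   (mul i16 %a, %b)    ; %a and %b proven by isU24 to fit in 24 bits
+//     ==> (trunc (MUL_U24 (zext %a), (zext %b)))
+// with getSExtOrTrunc restoring the original value type.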
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -1310,34 +1935,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
switch(N->getOpcode()) {
default: break;
- case ISD::MUL: {
- EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDValue Mul;
-
- // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
- if (VT.isVector() || VT.getSizeInBits() > 32)
- break;
-
- if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
- N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
- } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
- N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
- N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
- Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
- } else {
- break;
- }
-
- // We need to use sext even for MUL_U24, because MUL_U24 is used
- // for signed multiply of 8 and 16-bit types.
- SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
-
- return Reg;
- }
+ case ISD::MUL:
+ return performMulCombine(N, DCI);
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24: {
SDValue N0 = N->getOperand(0);
@@ -1511,29 +2110,38 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(DIV_INF);
NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
+ NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(FMAX)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(SMIN)
NODE_NAME_CASE(UMIN)
+ NODE_NAME_CASE(URECIP)
+ NODE_NAME_CASE(DIV_SCALE)
+ NODE_NAME_CASE(DIV_FMAS)
+ NODE_NAME_CASE(DIV_FIXUP)
+ NODE_NAME_CASE(TRIG_PREOP)
+ NODE_NAME_CASE(RCP)
+ NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
+ NODE_NAME_CASE(BREV)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)
NODE_NAME_CASE(MAD_I24)
- NODE_NAME_CASE(URECIP)
- NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -1544,6 +2152,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SAMPLEB)
NODE_NAME_CASE(SAMPLED)
NODE_NAME_CASE(SAMPLEL)
+ NODE_NAME_CASE(CVT_F32_UBYTE0)
+ NODE_NAME_CASE(CVT_F32_UBYTE1)
+ NODE_NAME_CASE(CVT_F32_UBYTE2)
+ NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index d5d821d..98a92ad 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -42,10 +42,33 @@ private:
SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
+
+ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+ unsigned BitsDiff,
+ SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
protected:
+ static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
+ static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
@@ -61,6 +84,7 @@ protected:
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
@@ -87,10 +111,16 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
MVT getVectorIdxTy() const override;
+ bool isSelectSupported(SelectSupportKind) const override;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+
bool isLoadBitCastBeneficial(EVT, EVT) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -101,6 +131,7 @@ public:
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
@@ -128,38 +159,6 @@ public:
SDValue Op,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
-
-// Functions defined in AMDILISelLowering.cpp
-public:
- bool getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I, unsigned Intrinsic) const override;
-
- /// We want to mark f32/f64 floating point values as legal.
- bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
-
- /// We don't want to shrink f64/f32 constants.
- bool ShouldShrinkFPConstant(EVT VT) const override;
-
- SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
-private:
- void InitAMDILLowering();
- SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
- unsigned BitsDiff,
- SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
- SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
};
namespace AMDGPUISD {
@@ -169,12 +168,15 @@ enum {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
- DIV_INF, // Divide with infinity returned on zero divisor
RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
DWORDADDR,
FRACT,
+ CLAMP,
+
+ // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+ // Denormals handled on some parts.
COS_HW,
SIN_HW,
FMAX,
@@ -184,11 +186,23 @@ enum {
SMIN,
UMIN,
URECIP,
+ DIV_SCALE,
+ DIV_FMAS,
+ DIV_FIXUP,
+ TRIG_PREOP, // 1 ULP max error for f64
+
+ // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
+ // For f64, max error 2^29 ULP, handles denormals.
+ RCP,
+ RSQ,
+ RSQ_LEGACY,
+ RSQ_CLAMPED,
DOT4,
BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
+ BREV, // Reverse bits.
MUL_U24,
MUL_I24,
MAD_U24,
@@ -203,6 +217,21 @@ enum {
SAMPLEB,
SAMPLED,
SAMPLEL,
+
+ // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
+ CVT_F32_UBYTE0,
+ CVT_F32_UBYTE1,
+ CVT_F32_UBYTE2,
+ CVT_F32_UBYTE3,
+ /// This node is for VLIW targets and it is used to represent a vector
+ /// that is stored in consecutive registers with the same channel.
+ /// For example:
+ /// |X |Y|Z|W|
+ /// T0|v.x| | | |
+ /// T1|v.y| | | |
+ /// T2|v.z| | | |
+ /// T3|v.w| | | |
+ BUILD_VERTICAL_VECTOR,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
STORE_MSKOR,
LOAD_CONSTANT,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 1c3361a..fef5b8c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -30,8 +30,8 @@ using namespace llvm;
// Pin the vtable to this file.
void AMDGPUInstrInfo::anchor() {}
-AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
- : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { }
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUGenInstrInfo(-1,-1), RI(st), ST(st) { }
const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
return RI;
@@ -320,33 +320,11 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+ Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
-
-void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
- DebugLoc DL) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const AMDGPURegisterInfo & RI = getRegisterInfo();
-
- for (unsigned i = 0; i < MI.getNumOperands(); i++) {
- MachineOperand &MO = MI.getOperand(i);
- // Convert dst regclass to one that is supported by the ISA
- if (MO.isReg() && MO.isDef()) {
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
- const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
- const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
-
- assert(newRegClass);
-
- MRI.setRegClass(MO.getReg(), newRegClass);
- }
- }
- }
-}
-
int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
switch (Channels) {
default: return Opcode;
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 74baf6b..95dc8c1 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -33,7 +33,7 @@
namespace llvm {
-class AMDGPUTargetMachine;
+class AMDGPUSubtarget;
class MachineFunction;
class MachineInstr;
class MachineInstrBuilder;
@@ -45,9 +45,9 @@ private:
MachineBasicBlock &MBB) const;
virtual void anchor();
protected:
- TargetMachine &TM;
+ const AMDGPUSubtarget &ST;
public:
- explicit AMDGPUInstrInfo(TargetMachine &tm);
+ explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
@@ -137,14 +137,6 @@ public:
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
// Helper functions that check the opcode for status information
- bool isLoadInst(llvm::MachineInstr *MI) const;
- bool isExtLoadInst(llvm::MachineInstr *MI) const;
- bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
- bool isSExtLoadInst(llvm::MachineInstr *MI) const;
- bool isZExtLoadInst(llvm::MachineInstr *MI) const;
- bool isAExtLoadInst(llvm::MachineInstr *MI) const;
- bool isStoreInst(llvm::MachineInstr *MI) const;
- bool isTruncStoreInst(llvm::MachineInstr *MI) const;
bool isRegisterStore(const MachineInstr &MI) const;
bool isRegisterLoad(const MachineInstr &MI) const;
@@ -185,11 +177,6 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const = 0;
-
- /// \brief Convert the AMDIL MachineInstr to a supported ISA
- /// MachineInstr
- void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const;
-
/// \brief Build a MOV instruction.
virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index f96dbb4..934d59d 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+ [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@@ -29,11 +37,25 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// out = a - floor(a)
def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a), with the result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
// out = max(a, b) a and b are floats
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
@@ -59,12 +81,38 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+ SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+ SDTIntToFPOp, []>;
+
// urecip - This operation is a helper for integer division, it returns the
// result of 1 / a as a fractional unsigned integer.
// out = (2^32 / a) + e
// e is rounding error
def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+// Special case divide FMA with scale and flags (src0 = Quotient,
+// src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags (src0 = Quotient, src1 = Denominator,
+// src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look up 2.0 / pi (src0) with segment select src1[4:0].
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayLoad]>;
@@ -92,6 +140,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
// performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
@@ -107,3 +157,22 @@ def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
[]
>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+ SDTCisVT<0, OtherVT>
+ ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 80bdf5b..b86b781 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -49,6 +49,11 @@ def u8imm : Operand<i8> {
let PrintMethod = "printU8ImmOperand";
}
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget : Operand<OtherVT>;
+
//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -127,6 +132,21 @@ def COND_NULL : PatLeaf <
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
LoadSDNode *L = cast<LoadSDNode>(N);
return L->getExtensionType() == ISD::ZEXTLOAD ||
@@ -232,26 +252,55 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
- (atomic_load_add node:$ptr, node:$value), [{
- return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
- (atomic_load_sub node:$ptr, node:$value), [{
- return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+class local_binary_atomic_op<SDNode atomic_op> :
+ PatFrag<(ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
+def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
+def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
+def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
+def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
+def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
+def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
+def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
+
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
+def atomic_cmp_swap_32_local :
+ PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i32 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_cmp_swap_64_local :
+ PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
+ (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ return AN->getMemoryVT() == MVT::i64 &&
+ AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
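+// IEEE-754 single-precision bit patterns of -1.0f and 1.0f: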
+int FP32_NEG_ONE = 0xbf800000;
+int FP32_ONE = 0x3f800000;
}
def CONST : Constants;
@@ -273,7 +322,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
- [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+ [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <
@@ -363,7 +412,7 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
// BFI_INT patterns
-multiclass BFIPatterns <Instruction BFI_INT> {
+multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
// Definition from ISA doc:
// (y & x) | (z & ~x)
@@ -379,6 +428,19 @@ multiclass BFIPatterns <Instruction BFI_INT> {
(BFI_INT $x, $y, $z)
>;
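+
+ // BFI(x, y, z) == (y & x) | (z & ~x); with x = 0x7fffffff the patterns
+ // below take the magnitude from $src0 and the sign bit from $src1, which
+ // is exactly fcopysign.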
+ def : Pat <
+ (fcopysign f32:$src0, f32:$src1),
+ (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
+ >;
+
+ def : Pat <
+ (f64 (fcopysign f64:$src0, f64:$src1)),
+ (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0),
+ (BFI_INT (LoadImm32 0x7fffffff),
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
+ >;
}
// SHA-256 Ma patterns
@@ -457,6 +519,23 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
>;
}
+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+ (fdiv FP_ONE, vt:$src),
+ (RcpInst $src)
+>;
+
+multiclass RsqPat<Instruction RsqInst, ValueType vt> {
+ def : Pat <
+ (fdiv FP_ONE, (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+
+ def : Pat <
+ (AMDGPUrcp (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+}
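+
+// Typical instantiations (instruction names illustrative; each target
+// supplies its own reciprocal instructions):
+//   def : RcpPat<RECIP_IEEE, f32>;
+//   defm : RsqPat<RECIPSQRT_IEEE, f32>;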
+
include "R600Instructions.td"
include "R700Instructions.td"
include "EvergreenInstructions.td"
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
index fab4a3b..58916a9 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
@@ -1,4 +1,4 @@
-//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,7 +12,7 @@
//
//===-----------------------------------------------------------------------===//
-#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
@@ -24,14 +24,12 @@ using namespace llvm;
#include "AMDGPUGenIntrinsics.inc"
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
- : TargetIntrinsicInfo() {
-}
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+ : TargetIntrinsicInfo() {}
-std::string
-AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
- unsigned int numTys) const {
- static const char* const names[] = {
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+ unsigned numTys) const {
+ static const char *const names[] = {
#define GET_INTRINSIC_NAME_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_NAME_TABLE
@@ -40,23 +38,23 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
if (IntrID < Intrinsic::num_intrinsics) {
return nullptr;
}
- assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
- && "Invalid intrinsic ID");
+ assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+ "Invalid intrinsic ID");
std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
return Result;
}
-unsigned int
-AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name,
+ unsigned Len) const {
if (!StringRef(Name, Len).startswith("llvm."))
return 0; // All intrinsics start with 'llvm.'
#define GET_FUNCTION_RECOGNIZER
#include "AMDGPUGenIntrinsics.inc"
#undef GET_FUNCTION_RECOGNIZER
- AMDGPUIntrinsic::ID IntrinsicID
- = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
+ AMDGPUIntrinsic::ID IntrinsicID =
+ (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
@@ -65,17 +63,15 @@ AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
return 0;
}
-bool
-AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
- // Overload Table
+bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
+// Overload Table
#define GET_INTRINSIC_OVERLOAD_TABLE
#include "AMDGPUGenIntrinsics.inc"
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
-Function*
-AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- Type **Tys,
- unsigned numTys) const {
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+ Type **Tys,
+ unsigned numTys) const {
llvm_unreachable("Not implemented");
}
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h
index 924275a..5be68a2 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.h
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h
@@ -1,4 +1,4 @@
-//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,8 @@
/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
-#ifndef AMDIL_INTRINSICS_H
-#define AMDIL_INTRINSICS_H
+#ifndef AMDGPU_INTRINSICINFO_H
+#define AMDGPU_INTRINSICINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -34,16 +34,15 @@ enum ID {
class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
public:
AMDGPUIntrinsicInfo(TargetMachine *tm);
- std::string getName(unsigned int IntrId, Type **Tys = nullptr,
- unsigned int numTys = 0) const override;
- unsigned int lookupName(const char *Name, unsigned int Len) const override;
- bool isOverloaded(unsigned int IID) const override;
- Function *getDeclaration(Module *M, unsigned int ID,
+ std::string getName(unsigned IntrId, Type **Tys = nullptr,
+ unsigned numTys = 0) const override;
+ unsigned lookupName(const char *Name, unsigned Len) const override;
+ bool isOverloaded(unsigned IID) const override;
+ Function *getDeclaration(Module *M, unsigned ID,
Type **Tys = nullptr,
- unsigned int numTys = 0) const override;
+ unsigned numTys = 0) const override;
};
} // end namespace llvm
-#endif // AMDIL_INTRINSICS_H
-
+#endif // AMDGPU_INTRINSICINFO_H
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 9ad5e72..d934676 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -18,18 +18,26 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
-
+ def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+ def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+ // This is named backwards (instead of rsq_legacy) so we don't have
+ // to define it with the public builtins intrinsics. This is a
+ // workaround for how intrinsic names are parsed. If the name is
+ // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
+ // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangles the name.
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
- def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
@@ -53,12 +61,27 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
+ def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
+}
+
+// Legacy names for compatibility.
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+ def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
}
let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index b759495..ac82e88 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "R600InstrInfo.h"
#include "SIInstrInfo.h"
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 2b7f1e3..58fe34d 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -14,9 +14,9 @@
namespace llvm {
class AMDGPUSubtarget;
-class MCInst;
-class MCContext;
class MachineInstr;
+class MCContext;
+class MCInst;
class AMDGPUMCInstLower {
diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 0000000..218750d
--- /dev/null
+++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,387 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteAlloca : public FunctionPass,
+ public InstVisitor<AMDGPUPromoteAlloca> {
+
+ static char ID;
+ Module *Mod;
+ const AMDGPUSubtarget &ST;
+ int LocalMemAvailable;
+
+public:
+ AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
+ LocalMemAvailable(0) { }
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+ virtual const char *getPassName() const {
+ return "AMDGPU Promote Alloca";
+ }
+ void visitAlloca(AllocaInst &I);
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0;
+
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+ Mod = &M;
+ return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+
+ const FunctionType *FTy = F.getFunctionType();
+
+ LocalMemAvailable = ST.getLocalMemorySize();
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+ const Type *ParamTy = FTy->getParamType(i);
+ if (ParamTy->isPointerTy() &&
+ ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ LocalMemAvailable = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ break;
+ }
+ }
+
+ if (LocalMemAvailable > 0) {
+ // Check how much local memory is being used by global objects
+ for (Module::global_iterator I = Mod->global_begin(),
+ E = Mod->global_end(); I != E; ++I) {
+ GlobalVariable *GV = I;
+ PointerType *GVTy = GV->getType();
+ if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ for (Value::use_iterator U = GV->use_begin(),
+ UE = GV->use_end(); U != UE; ++U) {
+ Instruction *Use = dyn_cast<Instruction>(*U);
+ if (!Use)
+ continue;
+ if (Use->getParent()->getParent() == &F)
+ LocalMemAvailable -=
+ Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
+ }
+ }
+ }
+
+ LocalMemAvailable = std::max(0, LocalMemAvailable);
+ DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+
+ visit(F);
+
+ return false;
+}
+
+static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+ return VectorType::get(ArrayTy->getArrayElementType(),
+ ArrayTy->getArrayNumElements());
+}
+
+static Value* calculateVectorIndex(Value *Ptr,
+ const std::map<GetElementPtrInst*, Value*> &GEPIdx) {
+ if (isa<AllocaInst>(Ptr))
+ return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+ std::map<GetElementPtrInst*, Value*>::const_iterator I = GEPIdx.find(GEP);
+ return I == GEPIdx.end() ? nullptr : I->second;
+}
+
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+ // FIXME: We only support simple cases.
+ if (GEP->getNumOperands() != 3)
+ return nullptr;
+
+ ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+ if (!I0 || !I0->isZero())
+ return nullptr;
+
+ return GEP->getOperand(2);
+}
+
+// Returns true if this is an instruction the promote-to-vector rewrite
+// below knows how to handle.
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+static bool canVectorizeInst(Instruction *Inst) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+ Type *AllocaTy = Alloca->getAllocatedType();
+
+ DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
+
+ // FIXME: There is no reason why we can't support larger arrays; we are
+ // just being conservative for now.
+ if (!AllocaTy->isArrayTy() ||
+ AllocaTy->getArrayElementType()->isVectorTy() ||
+ AllocaTy->getArrayNumElements() > 4) {
+
+ DEBUG(dbgs() << " Cannot convert type to vector");
+ return false;
+ }
+
+ std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+ std::vector<Value*> WorkList;
+ for (User *AllocaUser : Alloca->users()) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+ if (!GEP) {
+ if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
+ return false;
+
+ WorkList.push_back(AllocaUser);
+ continue;
+ }
+
+ Value *Index = GEPToVectorIndex(GEP);
+
+ // If we can't compute a vector index from this GEP, then we can't
+ // promote this alloca to vector.
+ if (!Index) {
+ DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
+ return false;
+ }
+
+ GEPVectorIdx[GEP] = Index;
+ for (User *GEPUser : AllocaUser->users()) {
+ if (!canVectorizeInst(cast<Instruction>(GEPUser)))
+ return false;
+
+ WorkList.push_back(GEPUser);
+ }
+ }
+
+ VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+ DEBUG(dbgs() << " Converting alloca to vector "
+ << *AllocaTy << " -> " << *VectorTy << '\n');
+
+ for (std::vector<Value*>::iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
+ Instruction *Inst = cast<Instruction>(*I);
+ IRBuilder<> Builder(Inst);
+ switch (Inst->getOpcode()) {
+ case Instruction::Load: {
+ Value *Ptr = Inst->getOperand(0);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ Inst->replaceAllUsesWith(ExtractElement);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::Store: {
+ Value *Ptr = Inst->getOperand(1);
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+ Value *VecValue = Builder.CreateLoad(BitCast);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+ Inst->getOperand(0),
+ Index);
+ Builder.CreateStore(NewVecValue, BitCast);
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ break;
+
+ default:
+ Inst->dump();
+ llvm_unreachable("Inconsistency in instructions promotable to vector");
+ }
+ }
+ return true;
+}
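+
+// Illustrative IR-level sketch (hand-written) of the rewrite above:
+//   %a = alloca [4 x float]
+//   %p = getelementptr [4 x float]* %a, i32 0, i32 %i
+//   %v = load float* %p
+// becomes, after bitcasting the alloca to <4 x float>*:
+//   %vec = load <4 x float>* %a.bc
+//   %v = extractelement <4 x float> %vec, i32 %i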
+
+static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+ for (User *User : Val->users()) {
+ if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+ continue;
+ if (isa<CallInst>(User)) {
+ WorkList.push_back(User);
+ continue;
+ }
+ if (!User->getType()->isPointerTy())
+ continue;
+ WorkList.push_back(User);
+ collectUsesWithPtrTypes(User, WorkList);
+ }
+}
+
+void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+ IRBuilder<> Builder(&I);
+
+ // First try to replace the alloca with a vector
+ Type *AllocaTy = I.getAllocatedType();
+
+ DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ if (tryPromoteAllocaToVector(&I))
+ return;
+
+ DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+
+ // FIXME: This is the maximum work group size. We should try to get
+ // value from the reqd_work_group_size function attribute if it is
+ // available.
+ unsigned WorkGroupSize = 256;
+ int AllocaSize = WorkGroupSize *
+ Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
+
+ if (AllocaSize > LocalMemAvailable) {
+ DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+ return;
+ }
+
+ DEBUG(dbgs() << "Promoting alloca to local memory\n");
+ LocalMemAvailable -= AllocaSize;
+
+ GlobalVariable *GV = new GlobalVariable(
+ *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
+ GlobalValue::ExternalLinkage, 0, I.getName(), 0,
+ GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+
+ FunctionType *FTy = FunctionType::get(
+ Type::getInt32Ty(Mod->getContext()), false);
+ AttributeSet AttrSet;
+ AttrSet = AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
+
+ Value *ReadLocalSizeY = Mod->getOrInsertFunction(
+ "llvm.r600.read.local.size.y", FTy, AttrSet);
+ Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
+ "llvm.r600.read.local.size.z", FTy, AttrSet);
+ Value *ReadTIDIGX = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.x", FTy, AttrSet);
+ Value *ReadTIDIGY = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.y", FTy, AttrSet);
+ Value *ReadTIDIGZ = Mod->getOrInsertFunction(
+ "llvm.r600.read.tidig.z", FTy, AttrSet);
+
+ Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
+ Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
+ Value *TIdX = Builder.CreateCall(ReadTIDIGX);
+ Value *TIdY = Builder.CreateCall(ReadTIDIGY);
+ Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
+
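+ // Linearize the 3-D work-item id so every thread gets its own slice of
+ // the LDS array:
+ //   TID = tidig.x * (local_size.y * local_size.z)
+ //       + tidig.y * local_size.z + tidig.z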
+ Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+ Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+ Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+ Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+ TID = Builder.CreateAdd(TID, TIdZ);
+
+ std::vector<Value*> Indices;
+ Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
+ Indices.push_back(TID);
+
+ Value *Offset = Builder.CreateGEP(GV, Indices);
+ I.mutateType(Offset->getType());
+ I.replaceAllUsesWith(Offset);
+ I.eraseFromParent();
+
+ std::vector<Value*> WorkList;
+
+ collectUsesWithPtrTypes(Offset, WorkList);
+
+ for (std::vector<Value*>::iterator i = WorkList.begin(),
+ e = WorkList.end(); i != e; ++i) {
+ Value *V = *i;
+ CallInst *Call = dyn_cast<CallInst>(V);
+ if (!Call) {
+ Type *EltTy = V->getType()->getPointerElementType();
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ V->mutateType(NewTy);
+ continue;
+ }
+
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
+ if (!Intr) {
+ std::vector<Type*> ArgTypes;
+ for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
+ ArgIdx != ArgEnd; ++ArgIdx) {
+ ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
+ }
+ Function *F = Call->getCalledFunction();
+ FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
+ F->isVarArg());
+ Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
+ F->getAttributes());
+ Function *NewF = cast<Function>(C);
+ Call->setCalledFunction(NewF);
+ continue;
+ }
+
+ Builder.SetInsertPoint(Intr);
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // These intrinsics are for address space 0 only
+ Intr->eraseFromParent();
+ continue;
+ case Intrinsic::memcpy: {
+ MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+ Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+ MemCpy->getLength(), MemCpy->getAlignment(),
+ MemCpy->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
+ case Intrinsic::memset: {
+ MemSetInst *MemSet = cast<MemSetInst>(Intr);
+ Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+ MemSet->getLength(), MemSet->getAlignment(),
+ MemSet->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
+ default:
+ Intr->dump();
+ llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+ }
+ }
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
+ return new AMDGPUPromoteAlloca(ST);
+}
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 19927fa..3433280 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -17,9 +17,9 @@
using namespace llvm;
-AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
+AMDGPURegisterInfo::AMDGPURegisterInfo(const AMDGPUSubtarget &st)
: AMDGPUGenRegisterInfo(0),
- TM(tm)
+ ST(st)
{ }
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index a7cba0d..4731595 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -25,27 +25,19 @@
namespace llvm {
-class AMDGPUTargetMachine;
+class AMDGPUSubtarget;
class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- TargetMachine &TM;
static const MCPhysReg CalleeSavedReg;
+ const AMDGPUSubtarget &ST;
- AMDGPURegisterInfo(TargetMachine &tm);
+ AMDGPURegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override {
assert(!"Unimplemented"); return BitVector();
}
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns The ISA reg class that is equivalent to \p RC.
- virtual const TargetRegisterClass * getISARegClass(
- const TargetRegisterClass * RC) const {
- assert(!"Unimplemented"); return nullptr;
- }
-
virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
assert(!"Unimplemented"); return nullptr;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index f3b9932..b83c290 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,6 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
using namespace llvm;
@@ -23,90 +25,42 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
-AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
- AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
- InstrItins = getInstrItineraryForCPU(CPU);
-
- // Default card
- StringRef GPU = CPU;
- Is64bit = false;
- HasVertexCache = false;
- TexVTXClauseSize = 0;
- Gen = AMDGPUSubtarget::R600;
- FP64 = false;
- CaymanISA = false;
- EnableIRStructurizer = true;
- EnableIfCvt = true;
- WavefrontSize = 0;
- CFALUBug = false;
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
+ AMDGPUGenSubtargetInfo(TT, GPU, FS),
+ DevName(GPU),
+ Is64bit(false),
+ DumpCode(false),
+ R600ALUInst(false),
+ HasVertexCache(false),
+ TexVTXClauseSize(0),
+ Gen(AMDGPUSubtarget::R600),
+ FP64(false),
+ CaymanISA(false),
+ EnableIRStructurizer(true),
+ EnableIfCvt(true),
+ WavefrontSize(0),
+ CFALUBug(false),
+ LocalMemorySize(0),
+ InstrItins(getInstrItineraryForCPU(GPU)) {
ParseSubtargetFeatures(GPU, FS);
- DevName = GPU;
-}
-bool
-AMDGPUSubtarget::is64bit() const {
- return Is64bit;
-}
-bool
-AMDGPUSubtarget::hasVertexCache() const {
- return HasVertexCache;
-}
-short
-AMDGPUSubtarget::getTexVTXClauseSize() const {
- return TexVTXClauseSize;
-}
-enum AMDGPUSubtarget::Generation
-AMDGPUSubtarget::getGeneration() const {
- return Gen;
-}
-bool
-AMDGPUSubtarget::hasHWFP64() const {
- return FP64;
-}
-bool
-AMDGPUSubtarget::hasCaymanISA() const {
- return CaymanISA;
-}
-bool
-AMDGPUSubtarget::IsIRStructurizerEnabled() const {
- return EnableIRStructurizer;
-}
-bool
-AMDGPUSubtarget::isIfCvtEnabled() const {
- return EnableIfCvt;
-}
-unsigned
-AMDGPUSubtarget::getWavefrontSize() const {
- return WavefrontSize;
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ InstrInfo.reset(new R600InstrInfo(*this));
+ } else {
+ InstrInfo.reset(new SIInstrInfo(*this));
+ }
}
-unsigned
-AMDGPUSubtarget::getStackEntrySize() const {
+
+unsigned AMDGPUSubtarget::getStackEntrySize() const {
assert(getGeneration() <= NORTHERN_ISLANDS);
switch(getWavefrontSize()) {
case 16:
return 8;
case 32:
- if (hasCaymanISA())
- return 4;
- else
- return 8;
+ return hasCaymanISA() ? 4 : 8;
case 64:
return 4;
default:
llvm_unreachable("Illegal wavefront size.");
}
}
-bool
-AMDGPUSubtarget::hasCFAluBug() const {
- assert(getGeneration() <= NORTHERN_ISLANDS);
- return CFALUBug;
-}
-bool
-AMDGPUSubtarget::isTargetELF() const {
- return false;
-}
-
-std::string
-AMDGPUSubtarget::getDeviceName() const {
- return DevName;
-}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 1b041d6..0c388b3 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -15,6 +15,7 @@
#ifndef AMDGPUSUBTARGET_H
#define AMDGPUSUBTARGET_H
#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -27,6 +28,9 @@
namespace llvm {
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+
+ std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
+
public:
enum Generation {
R600 = 0,
@@ -40,42 +44,78 @@ public:
private:
std::string DevName;
bool Is64bit;
- bool Is32on64bit;
bool DumpCode;
bool R600ALUInst;
bool HasVertexCache;
short TexVTXClauseSize;
- enum Generation Gen;
+ Generation Gen;
bool FP64;
bool CaymanISA;
bool EnableIRStructurizer;
bool EnableIfCvt;
unsigned WavefrontSize;
bool CFALUBug;
+ int LocalMemorySize;
InstrItineraryData InstrItins;
public:
AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
- const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+ const AMDGPUInstrInfo *getInstrInfo() const {
+ return InstrInfo.get();
+ }
+
+ const InstrItineraryData &getInstrItineraryData() const {
+ return InstrItins;
+ }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool is64bit() const;
- bool hasVertexCache() const;
- short getTexVTXClauseSize() const;
- enum Generation getGeneration() const;
- bool hasHWFP64() const;
- bool hasCaymanISA() const;
+ bool is64bit() const {
+ return Is64bit;
+ }
+
+ bool hasVertexCache() const {
+ return HasVertexCache;
+ }
+
+ short getTexVTXClauseSize() const {
+ return TexVTXClauseSize;
+ }
+
+ Generation getGeneration() const {
+ return Gen;
+ }
+
+ bool hasHWFP64() const {
+ return FP64;
+ }
+
+ bool hasCaymanISA() const {
+ return CaymanISA;
+ }
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
+ bool hasBFI() const {
+ return (getGeneration() >= EVERGREEN);
+ }
+
bool hasBFM() const {
return hasBFE();
}
+ bool hasBCNT(unsigned Size) const {
+ if (Size == 32)
+ return (getGeneration() >= EVERGREEN);
+
+ assert(Size == 64);
+ return (getGeneration() >= SOUTHERN_ISLANDS);
+ }
+
bool hasMulU24() const {
return (getGeneration() >= EVERGREEN);
}
@@ -85,22 +125,48 @@ public:
hasCaymanISA());
}
- bool IsIRStructurizerEnabled() const;
- bool isIfCvtEnabled() const;
- unsigned getWavefrontSize() const;
+ bool IsIRStructurizerEnabled() const {
+ return EnableIRStructurizer;
+ }
+
+ bool isIfCvtEnabled() const {
+ return EnableIfCvt;
+ }
+
+ unsigned getWavefrontSize() const {
+ return WavefrontSize;
+ }
+
unsigned getStackEntrySize() const;
- bool hasCFAluBug() const;
+
+ bool hasCFAluBug() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ return CFALUBug;
+ }
+
+ int getLocalMemorySize() const {
+ return LocalMemorySize;
+ }
bool enableMachineScheduler() const override {
return getGeneration() <= NORTHERN_ISLANDS;
}
// Helper functions to simplify if statements
- bool isTargetELF() const;
- std::string getDeviceName() const;
- bool dumpCode() const { return DumpCode; }
- bool r600ALUEncoding() const { return R600ALUInst; }
+ bool isTargetELF() const {
+ return false;
+ }
+ StringRef getDeviceName() const {
+ return DevName;
+ }
+
+ bool dumpCode() const {
+ return DumpCode;
+ }
+ bool r600ALUEncoding() const {
+ return R600ALUInst;
+ }
};
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 174fdca..8aab944 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -80,10 +80,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
InstrItins(&Subtarget.getInstrItineraryData()) {
// TLInfo uses InstrInfo so it must be initialized after.
if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- InstrInfo.reset(new R600InstrInfo(*this));
TLInfo.reset(new R600TargetLowering(*this));
} else {
- InstrInfo.reset(new SIInstrInfo(*this));
TLInfo.reset(new SITargetLowering(*this));
}
setRequiresStructuredCFG(true);
@@ -111,6 +109,7 @@ public:
return nullptr;
}
+ virtual void addCodeGenPrepare();
bool addPreISel() override;
bool addInstSelector() override;
bool addPreRegAlloc() override;
@@ -136,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
PM.add(createAMDGPUTargetTransformInfoPass(this));
}
+void AMDGPUPassConfig::addCodeGenPrepare() {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+ addPass(createAMDGPUPromoteAlloca(ST));
+ addPass(createSROAPass());
+ TargetPassConfig::addCodeGenPrepare();
+}
+
bool
AMDGPUPassConfig::addPreISel() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
@@ -159,7 +165,6 @@ bool AMDGPUPassConfig::addInstSelector() {
}
bool AMDGPUPassConfig::addPreRegAlloc() {
- addPass(createAMDGPUConvertToISAPass(*TM));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
@@ -169,6 +174,8 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
// SIFixSGPRCopies can generate a lot of duplicate instructions,
// so we need to run MachineCSE afterwards.
addPass(&MachineCSEID);
+ initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+ insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID);
}
return false;
}
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 1287e13..3bb15be 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -17,8 +17,8 @@
#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "R600ISelLowering.h"
#include "llvm/IR/DataLayout.h"
@@ -30,7 +30,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
const DataLayout Layout;
AMDGPUFrameLowering FrameLowering;
AMDGPUIntrinsicInfo IntrinsicInfo;
- std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
const InstrItineraryData *InstrItins;
@@ -46,13 +45,13 @@ public:
return &IntrinsicInfo;
}
const AMDGPUInstrInfo *getInstrInfo() const override {
- return InstrInfo.get();
+ return getSubtargetImpl()->getInstrInfo();
}
const AMDGPUSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
const AMDGPURegisterInfo *getRegisterInfo() const override {
- return &InstrInfo->getRegisterInfo();
+ return &getInstrInfo()->getRegisterInfo();
}
AMDGPUTargetLowering *getTargetLowering() const override {
return TLInfo.get();
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
deleted file mode 100644
index 5dcd478..0000000
--- a/lib/Target/R600/AMDILBase.td
+++ /dev/null
@@ -1,25 +0,0 @@
-//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-// Dummy Instruction itineraries for pseudo instructions
-def ALU_NULL : FuncUnit;
-def NullALU : InstrItinClass;
-
-//===----------------------------------------------------------------------===//
-// Register File, Calling Conv, Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-
-include "AMDILRegisterInfo.td"
-include "AMDILInstrInfo.td"
-
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
deleted file mode 100644
index 7cea803..0000000
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ /dev/null
@@ -1,560 +0,0 @@
-//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief TargetLowering functions borrowed from AMDIL.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUISelLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetOptions.h"
-
-using namespace llvm;
-//===----------------------------------------------------------------------===//
-// TargetLowering Implementation Help Functions End
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// TargetLowering Class Implementation Begins
-//===----------------------------------------------------------------------===//
-void AMDGPUTargetLowering::InitAMDILLowering() {
- static const MVT::SimpleValueType types[] = {
- MVT::i8,
- MVT::i16,
- MVT::i32,
- MVT::f32,
- MVT::f64,
- MVT::i64,
- MVT::v2i8,
- MVT::v4i8,
- MVT::v2i16,
- MVT::v4i16,
- MVT::v4f32,
- MVT::v4i32,
- MVT::v2f32,
- MVT::v2i32,
- MVT::v2f64,
- MVT::v2i64
- };
-
- static const MVT::SimpleValueType IntTypes[] = {
- MVT::i8,
- MVT::i16,
- MVT::i32,
- MVT::i64
- };
-
- static const MVT::SimpleValueType FloatTypes[] = {
- MVT::f32,
- MVT::f64
- };
-
- static const MVT::SimpleValueType VectorTypes[] = {
- MVT::v2i8,
- MVT::v4i8,
- MVT::v2i16,
- MVT::v4i16,
- MVT::v4f32,
- MVT::v4i32,
- MVT::v2f32,
- MVT::v2i32,
- MVT::v2f64,
- MVT::v2i64
- };
-
- const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
- // These are the current register classes that are
- // supported
-
- for (MVT VT : types) {
- setOperationAction(ISD::SUBE, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::BRCOND, VT, Custom);
- setOperationAction(ISD::BR_JT, VT, Expand);
- setOperationAction(ISD::BRIND, VT, Expand);
- // TODO: Implement custom UREM/SREM routines
- setOperationAction(ISD::SREM, VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- if (VT != MVT::i64 && VT != MVT::v2i64) {
- setOperationAction(ISD::SDIV, VT, Custom);
- }
- }
- for (MVT VT : FloatTypes) {
- // IL does not have these operations for floating point types
- setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
- setOperationAction(ISD::SETOLT, VT, Expand);
- setOperationAction(ISD::SETOGE, VT, Expand);
- setOperationAction(ISD::SETOGT, VT, Expand);
- setOperationAction(ISD::SETOLE, VT, Expand);
- setOperationAction(ISD::SETULT, VT, Expand);
- setOperationAction(ISD::SETUGE, VT, Expand);
- setOperationAction(ISD::SETUGT, VT, Expand);
- setOperationAction(ISD::SETULE, VT, Expand);
- }
-
- for (MVT VT : IntTypes) {
- // GPU also does not have divrem function for signed or unsigned
- setOperationAction(ISD::SDIVREM, VT, Expand);
-
- // GPU does not have [S|U]MUL_LOHI functions as a single instruction
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, VT, Expand);
-
- setOperationAction(ISD::BSWAP, VT, Expand);
-
- // GPU doesn't have any counting operators
- setOperationAction(ISD::CTPOP, VT, Expand);
- setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- for (MVT VT : VectorTypes) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, VT, Expand);
- // setOperationAction(ISD::VSETCC, VT, Expand);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
-
- }
- setOperationAction(ISD::MULHU, MVT::i64, Expand);
- setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
- setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
- setOperationAction(ISD::ADD, MVT::v2i64, Expand);
- setOperationAction(ISD::SREM, MVT::v2i64, Expand);
- setOperationAction(ISD::Constant , MVT::i64 , Legal);
- setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
- setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
- if (STM.hasHWFP64()) {
- // we support loading/storing v2f64 but not operations on the type
- setOperationAction(ISD::FADD, MVT::v2f64, Expand);
- setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
- setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
- setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
- // We want to expand vector conversions into their scalar
- // counterparts.
- setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
- setOperationAction(ISD::FABS, MVT::f64, Expand);
- setOperationAction(ISD::FABS, MVT::v2f64, Expand);
- }
- // TODO: Fix the UDIV24 algorithm so it works for these
- // types correctly. This needs vector comparisons
- // for this to work correctly.
- setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
- setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
- setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
- setOperationAction(ISD::SUBC, MVT::Other, Expand);
- setOperationAction(ISD::ADDE, MVT::Other, Expand);
- setOperationAction(ISD::ADDC, MVT::Other, Expand);
- setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
-
-
- // Use the default implementation.
- setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
- setOperationAction(ISD::Constant , MVT::i32 , Legal);
-
- setSchedulingPreference(Sched::RegPressure);
- setPow2DivIsCheap(false);
- setSelectIsExpensive(true);
- setJumpIsExpensive(true);
-
- MaxStoresPerMemcpy = 4096;
- MaxStoresPerMemmove = 4096;
- MaxStoresPerMemset = 4096;
-
-}
-
-bool
-AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I, unsigned Intrinsic) const {
- return false;
-}
-
-// The backend supports 32 and 64 bit floating point immediates
-bool
-AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
- if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
- || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
- return true;
- } else {
- return false;
- }
-}
-
-bool
-AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
- if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
- || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
- return false;
- } else {
- return true;
- }
-}
-
-
-// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
-// be zero. Op is expected to be a target specific node. Used by DAG
-// combiner.
-
-//===----------------------------------------------------------------------===//
-// Other Lowering Hooks
-//===----------------------------------------------------------------------===//
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType();
- SDValue DST;
- if (OVT.getScalarType() == MVT::i64) {
- DST = LowerSDIV64(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i32) {
- DST = LowerSDIV32(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i16
- || OVT.getScalarType() == MVT::i8) {
- DST = LowerSDIV24(Op, DAG);
- } else {
- DST = SDValue(Op.getNode(), 0);
- }
- return DST;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType();
- SDValue DST;
- if (OVT.getScalarType() == MVT::i64) {
- DST = LowerSREM64(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i32) {
- DST = LowerSREM32(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i16) {
- DST = LowerSREM16(Op, DAG);
- } else if (OVT.getScalarType() == MVT::i8) {
- DST = LowerSREM8(Op, DAG);
- } else {
- DST = SDValue(Op.getNode(), 0);
- }
- return DST;
-}
-
-EVT
-AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
- int iSize = (size * numEle);
- int vEle = (iSize >> ((size == 64) ? 6 : 5));
- if (!vEle) {
- vEle = 1;
- }
- if (size == 64) {
- if (vEle == 1) {
- return EVT(MVT::i64);
- } else {
- return EVT(MVT::getVectorVT(MVT::i64, vEle));
- }
- } else {
- if (vEle == 1) {
- return EVT(MVT::i32);
- } else {
- return EVT(MVT::getVectorVT(MVT::i32, vEle));
- }
- }
-}
-
-SDValue
-AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- SDValue Chain = Op.getOperand(0);
- SDValue Cond = Op.getOperand(1);
- SDValue Jump = Op.getOperand(2);
- SDValue Result;
- Result = DAG.getNode(
- AMDGPUISD::BRANCH_COND,
- SDLoc(Op),
- Op.getValueType(),
- Chain, Jump, Cond);
- return Result;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- MVT INTTY;
- MVT FLTTY;
- if (!OVT.isVector()) {
- INTTY = MVT::i32;
- FLTTY = MVT::f32;
- } else if (OVT.getVectorNumElements() == 2) {
- INTTY = MVT::v2i32;
- FLTTY = MVT::v2f32;
- } else if (OVT.getVectorNumElements() == 4) {
- INTTY = MVT::v4i32;
- FLTTY = MVT::v4f32;
- }
- unsigned bitsize = OVT.getScalarType().getSizeInBits();
- // char|short jq = ia ^ ib;
- SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
-
- // jq = jq >> (bitsize - 2)
- jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
-
- // jq = jq | 0x1
- jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
-
- // int ia = (int)LHS;
- SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
-
- // int ib, (int)RHS;
- SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
-
- // float fa = (float)ia;
- SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
-
- // float fb = (float)ib;
- SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
-
- // float fq = native_divide(fa, fb);
- SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
-
- // fq = trunc(fq);
- fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
-
- // float fqneg = -fq;
- SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
-
- // float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
- DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
-
- // int iq = (int)fq;
- SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
-
- // fr = fabs(fr);
- fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
-
- // fb = fabs(fb);
- fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
-
- // int cv = fr >= fb;
- SDValue cv;
- if (INTTY == MVT::i32) {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- } else {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- }
- // jq = (cv ? jq : 0);
- jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
- DAG.getConstant(0, OVT));
- // dst = iq + jq;
- iq = DAG.getSExtOrTrunc(iq, DL, OVT);
- iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
- return iq;
-}
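The function above is the classic 24-bit signed division via floating point: compute an approximate truncated quotient in f32, then add a signed +1/-1 correction when the remainder estimate shows the quotient fell one short. A minimal standalone C++ model of the same sequence (the sdiv24 helper is hypothetical; it assumes both operands fit in 24 bits, so the f32 quotient is exact up to the one-step fixup, which is what the hardware native_divide path relies on):

    #include <cmath>
    #include <cstdint>

    // Hypothetical scalar model of the LowerSDIV24 sequence above.
    // Valid only when |ia| and |ib| fit in 24 bits.
    int32_t sdiv24(int32_t ia, int32_t ib) {
      // jq = sign(ia ^ ib) | 1: a +1/-1 correction carrying the
      // quotient's sign (arithmetic shift assumed, as on the target).
      int32_t jq = ((ia ^ ib) >> 30) | 1;
      float fa = static_cast<float>(ia);
      float fb = static_cast<float>(ib);
      float fq = std::trunc(fa / fb);        // approximate quotient
      float fr = std::fabs(-fq * fb + fa);   // |remainder| of the estimate
      int32_t iq = static_cast<int32_t>(fq);
      // If the leftover reached |fb|, the estimate is short by one step.
      return iq + (fr >= std::fabs(fb) ? jq : 0);
    }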
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSDIV32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r0, r0, r1
- // ixor r10, r10, r11
- // iadd r0, r0, r10
- // ixor DST, r0, r10
-
- // mov r0, LHS
- SDValue r0 = LHS;
-
- // mov r1, RHS
- SDValue r1 = RHS;
-
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSelectCC(DL,
- r0, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- ISD::SETLT);
-
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSelectCC(DL,
- r1, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, MVT::i32),
- DAG.getConstant(0, MVT::i32),
- ISD::SETLT);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
-
- // udiv r0, r0, r1
- r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
-
- // ixor r10, r10, r11
- r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
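The IL sequence above reduces signed division to unsigned division with branch-free sign fixups: r10 and r11 are all-ones masks when the corresponding operand is negative, (x + mask) ^ mask negates x exactly when the mask is set, and the quotient's sign is r10 ^ r11. A standalone C++ model (the sdiv_via_udiv helper name is hypothetical):

    #include <cstdint>

    int32_t sdiv_via_udiv(int32_t lhs, int32_t rhs) {
      uint32_t r10 = lhs < 0 ? ~0u : 0u;         // ilt r10, r0, 0
      uint32_t r11 = rhs < 0 ? ~0u : 0u;         // ilt r11, r1, 0
      uint32_t a = (uint32_t(lhs) + r10) ^ r10;  // |lhs|, branch-free
      uint32_t b = (uint32_t(rhs) + r11) ^ r11;  // |rhs|, branch-free
      uint32_t q = a / b;                        // udiv
      uint32_t s = r10 ^ r11;                    // sign of the quotient
      return int32_t((q + s) ^ s);               // apply the sign
    }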
-
-SDValue
-AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- MVT INTTY = MVT::i32;
- if (OVT == MVT::v2i8) {
- INTTY = MVT::v2i32;
- } else if (OVT == MVT::v4i8) {
- INTTY = MVT::v4i32;
- }
- SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
- SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
- LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
- LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
- return LHS;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- MVT INTTY = MVT::i32;
- if (OVT == MVT::v2i16) {
- INTTY = MVT::v2i32;
- } else if (OVT == MVT::v4i16) {
- INTTY = MVT::v4i32;
- }
- SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
- SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
- LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
- LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
- return LHS;
-}
-
-SDValue
-AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSREM32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r20, r0, r1
- // umul r20, r20, r1
- // sub r0, r0, r20
- // iadd r0, r0, r10
- // ixor DST, r0, r10
-
- // mov r0, LHS
- SDValue r0 = LHS;
-
- // mov r1, RHS
- SDValue r1 = RHS;
-
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
-
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
-
- // udiv r20, r0, r1
- SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
-
- // umul r20, r20, r1
- r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
-
- // sub r0, r0, r20
- r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
-
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
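The remainder uses the same fixup idea, except that (matching C semantics) the result takes the sign of the dividend only, so just r10 feeds the final correction. Note the DAG sequence above actually emits ISD::UREM where its IL comment says udiv; the sketch below (hypothetical srem_via_udiv helper) follows the udiv/umul/sub form the comment describes:

    #include <cstdint>

    int32_t srem_via_udiv(int32_t lhs, int32_t rhs) {
      uint32_t r10 = lhs < 0 ? ~0u : 0u;
      uint32_t r11 = rhs < 0 ? ~0u : 0u;
      uint32_t a = (uint32_t(lhs) + r10) ^ r10;  // |lhs|
      uint32_t b = (uint32_t(rhs) + r11) ^ r11;  // |rhs|
      uint32_t r = a - (a / b) * b;              // udiv, umul, sub
      return int32_t((r + r10) ^ r10);           // remainder keeps lhs's sign
    }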
-
-SDValue
-AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
-}
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
deleted file mode 100644
index 0f0c88d..0000000
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ /dev/null
@@ -1,150 +0,0 @@
-//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file describes the AMDIL instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-//===--------------------------------------------------------------------===//
-// Custom Operands
-//===--------------------------------------------------------------------===//
-def brtarget : Operand<OtherVT>;
-
-//===--------------------------------------------------------------------===//
-// Custom Selection DAG Type Profiles
-//===--------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Generic Profile Types
-//===----------------------------------------------------------------------===//
-
-def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
- SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
- ]>;
-def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
- SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
- ]>;
-def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
- SDTCisEltOfVec<1, 0>
- ]>;
-
-//===----------------------------------------------------------------------===//
-// Flow Control Profile Types
-//===----------------------------------------------------------------------===//
-// Branch instruction where second and third are basic blocks
-def SDTIL_BRCond : SDTypeProfile<0, 2, [
- SDTCisVT<0, OtherVT>
- ]>;
-
-//===--------------------------------------------------------------------===//
-// Custom Selection DAG Nodes
-//===--------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Flow Control DAG Nodes
-//===----------------------------------------------------------------------===//
-def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
-
-//===----------------------------------------------------------------------===//
-// Call/Return DAG Nodes
-//===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-
-//===--------------------------------------------------------------------===//
-// Instructions
-//===--------------------------------------------------------------------===//
-// Floating point math functions
-def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
-
-//===----------------------------------------------------------------------===//
-// Integer functions
-//===----------------------------------------------------------------------===//
-def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]>;
-
-//===--------------------------------------------------------------------===//
-// Custom Pattern DAG Nodes
-//===--------------------------------------------------------------------===//
-def global_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isGlobalStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-//===----------------------------------------------------------------------===//
-// Load pattern fragments
-//===----------------------------------------------------------------------===//
-// Global address space loads
-def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-// Constant address space loads
-def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
-}]>;
-
-//===----------------------------------------------------------------------===//
-// Complex addressing mode patterns
-//===----------------------------------------------------------------------===//
-def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
-def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
-def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
-def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
-
-//===----------------------------------------------------------------------===//
-// Instruction format classes
-//===----------------------------------------------------------------------===//
-class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
- let Namespace = "AMDGPU";
- dag OutOperandList = outs;
- dag InOperandList = ins;
- let Pattern = pattern;
- let AsmString = !strconcat(asmstr, "\n");
- let isPseudo = 1;
- let Itinerary = NullALU;
- bit hasIEEEFlag = 0;
- bit hasZeroOpFlag = 0;
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
-}
-
-//===--------------------------------------------------------------------===//
-// Multiclass Instruction formats
-//===--------------------------------------------------------------------===//
-// Multiclass that handles branch instructions
-multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
- def _i32 : ILFormat<(outs),
- (ins brtarget:$target, rci:$src0),
- "; i32 Pseudo branch instruction",
- [(Op bb:$target, (i32 rci:$src0))]>;
- def _f32 : ILFormat<(outs),
- (ins brtarget:$target, rcf:$src0),
- "; f32 Pseudo branch instruction",
- [(Op bb:$target, (f32 rcf:$src0))]>;
-}
-
-// Only scalar types should generate flow control
-multiclass BranchInstr<string name> {
- def _i32 : ILFormat<(outs), (ins GPRI32:$src),
- !strconcat(name, " $src"), []>;
- def _f32 : ILFormat<(outs), (ins GPRF32:$src),
- !strconcat(name, " $src"), []>;
-}
-// Only scalar types should generate flow control
-multiclass BranchInstr2<string name> {
- def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
- !strconcat(name, " $src0, $src1"), []>;
- def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
- !strconcat(name, " $src0, $src1"), []>;
-}
-
-//===--------------------------------------------------------------------===//
-// Intrinsics support
-//===--------------------------------------------------------------------===//
-include "AMDILIntrinsics.td"
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
deleted file mode 100644
index 4a3e02e..0000000
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ /dev/null
@@ -1,224 +0,0 @@
-//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file defines all of the amdil-specific intrinsics
-//
-//===---------------------------------------------------------------===//
-//===--------------------------------------------------------------------===//
-// Intrinsic classes
-// Generic versions of the above classes but for Target specific intrinsics
-// instead of SDNode patterns.
-//===--------------------------------------------------------------------===//
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- class VoidIntLong :
- Intrinsic<[llvm_i64_ty], [], []>;
- class VoidIntInt :
- Intrinsic<[llvm_i32_ty], [], []>;
- class VoidIntBool :
- Intrinsic<[llvm_i32_ty], [], []>;
- class UnaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class UnaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class ConvertIntFTOI :
- Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
- class ConvertIntITOF :
- Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
- class UnaryIntNoRetInt :
- Intrinsic<[], [llvm_anyint_ty], []>;
- class UnaryIntNoRetFloat :
- Intrinsic<[], [llvm_anyfloat_ty], []>;
- class BinaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class BinaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class BinaryIntNoRetInt :
- Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
- class BinaryIntNoRetFloat :
- Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
- class TernaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class TernaryIntFloat :
- Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class QuaternaryIntInt :
- Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class UnaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class BinaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class TernaryAtomicInt :
- Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
- class UnaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class BinaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
- class TernaryAtomicIntNoRet :
- Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
-}
-
-let TargetPrefix = "AMDIL", isTarget = 1 in {
- def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
-
- def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
- UnaryIntInt;
- def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
- UnaryIntInt;
- def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
- UnaryIntInt;
- def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
- TernaryIntInt;
- def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
- TernaryIntInt;
- def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
- QuaternaryIntInt;
- def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
- TernaryIntInt;
- def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
- BinaryIntInt;
- def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
- BinaryIntInt;
- def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
- BinaryIntInt;
- def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
- BinaryIntInt;
- def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
- BinaryIntInt;
- def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
- BinaryIntInt;
- def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
- BinaryIntInt;
- def int_AMDIL_min : GCCBuiltin<"__amdil_min">,
- BinaryIntFloat;
- def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
- BinaryIntInt;
- def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
- BinaryIntInt;
- def int_AMDIL_max : GCCBuiltin<"__amdil_max">,
- BinaryIntFloat;
- def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
- TernaryIntInt;
- def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
- TernaryIntInt;
- def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
- TernaryIntInt;
- def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
- UnaryIntFloat;
- def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
- TernaryIntFloat;
- def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
- UnaryIntFloat;
- def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
- UnaryIntFloat;
- def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
- UnaryIntFloat;
- def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
- UnaryIntFloat;
- def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
- UnaryIntFloat;
- def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
- UnaryIntFloat;
- def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
- UnaryIntFloat;
- def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
- UnaryIntFloat;
- def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
- UnaryIntFloat;
- def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
- UnaryIntFloat;
- def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
- UnaryIntFloat;
- def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
- UnaryIntFloat;
- def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
- def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
- def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
- def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
- UnaryIntFloat;
- def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
- UnaryIntFloat;
- def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
- UnaryIntFloat;
- def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
- UnaryIntFloat;
- def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
- UnaryIntFloat;
- def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
- UnaryIntFloat;
- def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
- UnaryIntFloat;
- def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
- UnaryIntFloat;
- def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
- TernaryIntFloat;
- def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
- UnaryIntFloat;
- def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
- UnaryIntFloat;
- def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
- UnaryIntFloat;
- def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
- TernaryIntFloat;
- def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
- Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
- llvm_v4i32_ty, llvm_i32_ty], []>;
-
- def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
- Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
- def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
- Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
- def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
- Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
- def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
- ConvertIntITOF;
- def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
- ConvertIntFTOI;
- def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
- ConvertIntFTOI;
- def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
- Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
- def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
- ConvertIntITOF;
- def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
- ConvertIntITOF;
- def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
- Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
- llvm_v2f32_ty, llvm_float_ty], []>;
- def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
- Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
- llvm_v2f32_ty], []>;
- def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
- Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
- llvm_v4f32_ty], []>;
- def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
- Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
- llvm_v4f32_ty], []>;
-}
diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td
deleted file mode 100644
index b9d0334..0000000
--- a/lib/Target/R600/AMDILRegisterInfo.td
+++ /dev/null
@@ -1,107 +0,0 @@
-//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// Declarations that describe the AMDIL register file
-//
-//===----------------------------------------------------------------------===//
-
-class AMDILReg<bits<16> num, string n> : Register<n> {
- field bits<16> Value;
- let Value = num;
- let Namespace = "AMDGPU";
-}
-
-// We will start with 8 registers for each class before expanding to more
-// Since the swizzle is added based on the register class, we can leave it
-// off here and just specify different registers for different register classes
-def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
-def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
-def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
-def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
-def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
-def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
-def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
-def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
-def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
-def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
-def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
-def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
-def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
-def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
-def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
-def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
-def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
-def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
-def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
-def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
-
-// All registers between 1000 and 1024 are reserved and cannot be used
-// unless commented in this section
-// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
-// r1020 is used to hold the frame index for local arrays
-// r1019 is used to hold the dynamic stack allocation pointer
-// r1018 is used as a temporary register for handwritten code
-// r1017 is used as a temporary register for handwritten code
-// r1016 is used as a temporary register for load/store code
-// r1015 is used as a temporary register for data segment offset
-// r1014 is used as a temporary register for store code
-// r1013 is used as the section data pointer register
-// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
-// r1009 is used as the frame pointer register
-// r999 is used as the mem register.
-// r998 is used as the return address register.
-//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
-//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
-//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
-//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
-//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
-//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
-def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
-def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
-def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
-def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
-def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
-def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
-def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
-def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
-def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
-def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
-def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
-def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
-def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
-def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
-def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
-def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
-def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
-def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
-def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
-def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
-def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
-def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
-def GPRI16 : RegisterClass<"AMDGPU", [i16], 16,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
-def GPRI32 : RegisterClass<"AMDGPU", [i32], 32,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
-def GPRF32 : RegisterClass<"AMDGPU", [f32], 32,
- (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
- let AltOrders = [(add (sequence "R%u", 1, 20))];
- let AltOrderSelect = [{
- return 1;
- }];
- }
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 3c6fa5a..4d16082 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -13,10 +13,9 @@ add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(R600CodeGen
AMDILCFGStructurizer.cpp
- AMDILIntrinsicInfo.cpp
- AMDILISelLowering.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
+ AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineFunction.cpp
@@ -24,8 +23,8 @@ add_llvm_target(R600CodeGen
AMDGPUTargetMachine.cpp
AMDGPUTargetTransformInfo.cpp
AMDGPUISelLowering.cpp
- AMDGPUConvertToISA.cpp
AMDGPUInstrInfo.cpp
+ AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
@@ -41,6 +40,7 @@ add_llvm_target(R600CodeGen
R600TextureIntrinsicsReplacer.cpp
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
+ SIFixSGPRLiveRanges.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index 2065441..dcb7e98 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -295,7 +295,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)),
def : Pat<(i32 (sext_inreg i32:$src, i16)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
-defm : BFIPatterns <BFI_INT_eg>;
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>;
def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
[(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
@@ -326,6 +326,8 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
+
let hasSideEffects = 1 in {
def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
}
@@ -346,7 +348,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
def GROUP_BARRIER : InstR600 <
- (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>,
+ (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
R600ALU_Word0,
R600ALU_Word1_OP2 <0x54> {
@@ -375,6 +377,11 @@ def GROUP_BARRIER : InstR600 <
let ALUInst = 1;
}
+def : Pat <
+ (int_AMDGPU_barrier_global),
+ (GROUP_BARRIER)
+>;
+
//===----------------------------------------------------------------------===//
// LDS Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 11ae091..0927040 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -99,9 +99,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
return;
}
- // The low 8 bits encoding value is the register index, for both VGPRs and
- // SGPRs.
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
+  // The low 8 bits of the encoding value are the register index, for both VGPRs
+ // and SGPRs.
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1);
if (NumRegs == 1) {
O << Type << RegIdx;
return;
@@ -216,13 +216,8 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- union Literal {
- float f;
- int32_t i;
- } L;
-
- L.i = MI->getOperand(OpNo).getImm();
- O << L.i << "(" << L.f << ")";
+ int32_t Imm = MI->getOperand(OpNo).getImm();
+ O << Imm << '(' << BitsToFloat(Imm) << ')';
}
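The union-based type punning is replaced by BitsToFloat, which reinterprets the 32-bit immediate as an IEEE-754 single. As a sketch, the equivalent well-defined bit cast looks like this (hypothetical bitsToFloat helper, not LLVM's implementation):

    #include <cstdint>
    #include <cstring>

    float bitsToFloat(int32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof(f));  // well-defined bit reinterpretation
      return f;
    }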
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 5e7cefe..dc1344f 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -172,17 +172,13 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixup,
const MCSubtargetInfo &STI) const {
if (MO.isReg()) {
- if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
+ if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
return MRI.getEncodingValue(MO.getReg());
- } else {
- return getHWReg(MO.getReg());
- }
- } else if (MO.isImm()) {
- return MO.getImm();
- } else {
- assert(0);
- return 0;
+ return getHWReg(MO.getReg());
}
+
+ assert(MO.isImm());
+ return MO.getImm();
}
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index d255e96..d98a6db 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index d6c6830..7f3560a 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -13,6 +13,9 @@
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
@@ -65,6 +68,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::FSUB, MVT::f32, Expand);
@@ -133,19 +137,47 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setOperationAction(ISD::SUB, MVT::i64, Expand);
+
// These should be replaced by UDIVREM, but it does not happen automatically
// during Type Legalization
setOperationAction(ISD::UDIV, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
+ setOperationAction(ISD::SDIV, MVT::i64, Custom);
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+
+ // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
+ // to be Legal/Custom in order to avoid library calls.
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+ for (MVT VT : ScalarIntVTs) {
+ setOperationAction(ISD::ADDC, VT, Expand);
+ setOperationAction(ISD::SUBC, VT, Expand);
+ setOperationAction(ISD::ADDE, VT, Expand);
+ setOperationAction(ISD::SUBE, VT, Expand);
+ }
+
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::Source);
@@ -537,11 +569,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
case ISD::FCOS:
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::LOAD: {
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
+ }
+
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
@@ -776,6 +821,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
+ case Intrinsic::AMDGPU_rsq:
+ // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
@@ -793,20 +841,172 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
return;
case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
return;
- case ISD::LOAD: {
- SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
- Results.push_back(SDValue(Node, 0));
- Results.push_back(SDValue(Node, 1));
- // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
- // function
- DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
- return;
+ case ISD::UDIV: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM);
+ break;
}
- case ISD::STORE:
- SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
- Results.push_back(SDValue(Node, 0));
- return;
+ case ISD::UREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(UDIVREM.getValue(1));
+ break;
+ }
+ case ISD::SDIV: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(SDIVREM);
+ break;
+ }
+ case ISD::SREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
+ N->getOperand(0), N->getOperand(1));
+ Results.push_back(SDIVREM.getValue(1));
+ break;
+ }
+ case ISD::SDIVREM: {
+ SDValue Op = SDValue(N, 1);
+ SDValue RES = LowerSDIVREM(Op, DAG);
+ Results.push_back(RES);
+ Results.push_back(RES.getValue(1));
+ break;
+ }
+ case ISD::UDIVREM: {
+ SDValue Op = SDValue(N, 0);
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+ SDValue one = DAG.getConstant(1, HalfVT);
+ SDValue zero = DAG.getConstant(0, HalfVT);
+
+    // Hi/Lo split
+ SDValue LHS = N->getOperand(0);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+ SDValue RHS = N->getOperand(1);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+ // Get Speculative values
+ SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+ SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+ SDValue REM_Hi = zero;
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+ SDValue DIV_Lo = zero;
+
+ const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+ for (unsigned i = 0; i < halfBitWidth; ++i) {
+ SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+      // Get the value of the high bit
+ SDValue HBit;
+ if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+ HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+ } else {
+ HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ }
+
+ SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+ DAG.getConstant(halfBitWidth - 1, HalfVT));
+ REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+ REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
+
+ REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+ REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+
+ SDValue BIT = DAG.getConstant(1ULL << (halfBitWidth - i - 1), HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
+
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+ // Update REM: subtract the divisor if the shifted remainder reaches it.
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
+ REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+ REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+ }
+
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ Results.push_back(DIV);
+ Results.push_back(REM);
+ break;
}
+ }
+}
+
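
The ISD::UDIVREM case above is a restoring long division over the two halves of the wide type. A minimal scalar sketch of the same algorithm, assuming 32-bit halves (the helper and its name are illustrative, not part of the patch):

#include <cstdint>

static void udivrem64(uint64_t LHS, uint64_t RHS, uint64_t &Div,
                      uint64_t &Rem) {
  uint32_t LHS_Lo = uint32_t(LHS), LHS_Hi = uint32_t(LHS >> 32);
  uint32_t RHS_Lo = uint32_t(RHS), RHS_Hi = uint32_t(RHS >> 32);

  // Speculative values, valid only when the divisor fits in the low half.
  uint32_t DIV_Hi = (RHS_Hi == 0) ? LHS_Hi / RHS_Lo : 0;
  uint64_t REM = (RHS_Hi == 0) ? LHS_Hi % RHS_Lo : LHS_Hi;
  uint32_t DIV_Lo = 0;

  for (unsigned i = 0; i < 32; ++i) {
    unsigned Pos = 32 - i - 1;
    uint32_t HBit = (LHS_Lo >> Pos) & 1; // Next dividend bit.
    REM = (REM << 1) | HBit;             // REM_Hi:REM_Lo shifted left by one.
    if (REM >= RHS) {                    // The realBIT select + conditional SUB.
      REM -= RHS;
      DIV_Lo |= 1u << Pos;
    }
  }

  Div = (uint64_t(DIV_Hi) << 32) | DIV_Lo;
  Rem = REM;
}
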
+SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
+ SDValue Vector) const {
+
+ SDLoc DL(Vector);
+ EVT VecVT = Vector.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ SmallVector<SDValue, 8> Args;
+
+ for (unsigned i = 0, e = VecVT.getVectorNumElements();
+ i != e; ++i) {
+ Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+ Vector, DAG.getConstant(i, getVectorIdxTy())));
+ }
+
+ return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
+}
+
+SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SDLoc DL(Op);
+ SDValue Vector = Op.getOperand(0);
+ SDValue Index = Op.getOperand(1);
+
+ if (isa<ConstantSDNode>(Index) ||
+ Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ return Op;
+
+ Vector = vectorToVerticalVector(DAG, Vector);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+ Vector, Index);
+}
+
+SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Vector = Op.getOperand(0);
+ SDValue Value = Op.getOperand(1);
+ SDValue Index = Op.getOperand(2);
+
+ if (isa<ConstantSDNode>(Index) ||
+ Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+ return Op;
+
+ Vector = vectorToVerticalVector(DAG, Vector);
+ SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
+ Vector, Value, Index);
+ return vectorToVerticalVector(DAG, Insert);
}
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -840,6 +1040,80 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstantFP(3.14159265359, MVT::f32));
}
+SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue One = DAG.getConstant(1, VT);
+
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+ // The dance around Width1 is necessary for the 0 special case.
+ // Without it, CompShift could be 32, producing incorrect results in
+ // Overflow, so we do the shift in two steps. The alternative would be to
+ // add a conditional to filter out the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
+ Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
+
+ SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
+ HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
+ SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
+
+ SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
+ SDValue LoBig = Zero;
+
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
+ SDValue Shift = Op.getOperand(2);
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue One = DAG.getConstant(1, VT);
+
+ const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
+
+ SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
+ SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
+ SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+ SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+ // The dance around Width1 is necessary for the 0 special case.
+ // Without it, CompShift could be 32, producing incorrect results in
+ // Overflow, so we do the shift in two steps. The alternative would be to
+ // add a conditional to filter out the special case.
+
+ SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
+ Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
+
+ SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
+ SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
+ LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
+
+ SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
+ SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
+
+ Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+ Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
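
Both *_PARTS lowerings pick between a "small" result (shift amount below the part width) and a "big" one (amount at or above it). A scalar sketch of the SHL case, assuming 32-bit parts; the helper is illustrative only. SelectionDAG has no control flow, so the lowering computes both arms and chooses with select_cc, while the sketch branches to stay well-defined C++:

#include <cstdint>

static void shl_parts32(uint32_t Lo, uint32_t Hi, uint32_t Shift,
                        uint32_t &OutLo, uint32_t &OutHi) {
  if (Shift < 32) {
    // Two-step shift: (Lo >> (31 - Shift)) >> 1 equals Lo >> (32 - Shift)
    // without ever shifting by 32 when Shift == 0.
    uint32_t Overflow = (Lo >> (31 - Shift)) >> 1;
    OutHi = (Hi << Shift) | Overflow;
    OutLo = Lo << Shift;
  } else {
    OutHi = Lo << (Shift - 32); // The BigShift arm.
    OutLo = 0;
  }
}
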
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(
ISD::SETCC,
@@ -1369,6 +1643,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
return DAG.getMergeValues(Ops, DL);
}
+SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Jump = Op.getOperand(2);
+
+ return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+ Chain, Jump, Cond);
+}
+
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
@@ -1902,9 +2185,8 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SDValue FakeOp;
std::vector<SDValue> Ops;
- for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
- I != E; ++I)
- Ops.push_back(*I);
+ for (const SDUse &I : Node->ops())
+ Ops.push_back(I);
if (Opcode == AMDGPU::DOT_4) {
int OperandIdx[] = {
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index a8a464f..d22c8c9 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -51,15 +51,18 @@ private:
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
+ SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
- /// \brief Lower ROTL opcode to BITALIGN
- SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index b0d9ae3..3972e2f 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -28,10 +28,9 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
-R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
- : AMDGPUInstrInfo(tm),
- RI(tm),
- ST(tm.getSubtarget<AMDGPUSubtarget>())
+R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUInstrInfo(st),
+ RI(st)
{ }
const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
@@ -52,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
unsigned VectorComponents = 0;
- if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
- AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+ if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
+ AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+ (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
+ AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
VectorComponents = 4;
- } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
- AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+ } else if ((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
+ AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+ (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
+ AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
VectorComponents = 2;
}
@@ -768,16 +771,6 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return true;
}
-int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
- const MachineInstr *MI = op.getParent();
-
- switch (MI->getDesc().OpInfo->RegClass) {
- default: // FIXME: fallthrough??
- case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
- case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
- };
-}
-
static
MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
@@ -1064,10 +1057,34 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return 2;
}
+bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+
+ switch (MI->getOpcode()) {
+ default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ case AMDGPU::R600_EXTRACT_ELT_V2:
+ case AMDGPU::R600_EXTRACT_ELT_V4:
+ buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
+ RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
+ MI->getOperand(2).getReg(),
+ RI.getHWRegChan(MI->getOperand(1).getReg()));
+ break;
+ case AMDGPU::R600_INSERT_ELT_V2:
+ case AMDGPU::R600_INSERT_ELT_V4:
+ buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
+ RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
+ MI->getOperand(3).getReg(), // Offset
+ RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel
+ break;
+ }
+ MI->eraseFromParent();
+ return true;
+}
+
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
+ static_cast<const AMDGPUFrameLowering*>(
+ MF.getTarget().getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
@@ -1100,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const {
- unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+ return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const {
+ unsigned AddrReg;
+ switch (AddrChan) {
+ default: llvm_unreachable("Invalid Channel");
+ case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ }
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X, OffsetReg);
setImmOperand(MOVA, AMDGPU::OpName::write, 0);
@@ -1117,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const {
- unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+ return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const {
+ unsigned AddrReg;
+ switch (AddrChan) {
+ default: llvm_unreachable("Invalid Channel");
+ case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+ case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+ case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+ case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+ }
MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
AMDGPU::AR_X,
OffsetReg);
@@ -1220,7 +1267,6 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
const {
assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
unsigned Opcode;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::R700)
Opcode = AMDGPU::DOT4_r600;
else
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index b5304a0..45a57d3 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -32,12 +32,22 @@ namespace llvm {
class R600InstrInfo : public AMDGPUInstrInfo {
private:
const R600RegisterInfo RI;
- const AMDGPUSubtarget &ST;
- int getBranchInstr(const MachineOperand &op) const;
std::vector<std::pair<int, unsigned> >
ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
+
+ MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
+
+ MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned ValueReg, unsigned Address,
+ unsigned OffsetReg,
+ unsigned AddrChan) const;
public:
enum BankSwizzle {
ALU_VEC_012_SCL_210 = 0,
@@ -48,7 +58,7 @@ namespace llvm {
ALU_VEC_210
};
- explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+ explicit R600InstrInfo(const AMDGPUSubtarget &st);
const R600RegisterInfo &getRegisterInfo() const override;
void copyPhysReg(MachineBasicBlock &MBB,
@@ -197,6 +207,8 @@ namespace llvm {
int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const override { return 1;}
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
/// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 590fde2..73fa345 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -125,7 +125,7 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
InstrItinClass itin = AnyALU> :
R600_1OP <inst, opName,
- [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
+ [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
>;
// If you add or change the operands for R600_2OP instructions, you must
@@ -161,10 +161,10 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
}
class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
- InstrItinClass itim = AnyALU> :
+ InstrItinClass itin = AnyALU> :
R600_2OP <inst, opName,
[(set R600_Reg32:$dst, (node R600_Reg32:$src0,
- R600_Reg32:$src1))]
+ R600_Reg32:$src1))], itin
>;
// If you add or change the operands for R600_3OP instructions, you must
@@ -721,14 +721,11 @@ def SETNE_DX10 : R600_2OP <
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
-def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
-// Add also ftrunc intrinsic pattern
-def : Pat<(ftrunc f32:$src0), (TRUNC $src0)>;
-
def MOV : R600_1OP <0x19, "MOV", []>;
let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
@@ -1082,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
let Itinerary = TransALU;
}
+// Clamped to maximum.
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
> {
let Itinerary = TransALU;
}
-class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
- inst, "RECIPSQRT_IEEE", []
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
> {
let Itinerary = TransALU;
}
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
class SIN_Common <bits<11> inst> : R600_1OP <
inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
let Trig = 1;
@@ -1266,13 +1266,6 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
//===----------------------------------------------------------------------===//
-// Branch Instructions
-//===----------------------------------------------------------------------===//
-
-def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
- "IF_PREDICATE_SET $src", []>;
-
-//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
@@ -1345,15 +1338,6 @@ def TXD_SHADOW: InstR600 <
} // End isPseudo = 1
} // End usesCustomInserter = 1
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
- usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(IL_retflag)]>;
-}
-
//===----------------------------------------------------------------------===//
// Constant Buffer Addressing Support
@@ -1480,11 +1464,52 @@ let Inst{63-32} = Word1;
let VTXInst = 1;
}
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+ let Namespace = "AMDGPU";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let Pattern = pattern;
+ let AsmString = !strconcat(asmstr, "\n");
+ let isPseudo = 1;
+ let Itinerary = NullALU;
+ bit hasIEEEFlag = 0;
+ bit hasZeroOpFlag = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+}
+
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
+ def _i32 : ILFormat<(outs),
+ (ins brtarget:$target, rci:$src0),
+ "; i32 Pseudo branch instruction",
+ [(Op bb:$target, (i32 rci:$src0))]>;
+ def _f32 : ILFormat<(outs),
+ (ins brtarget:$target, rcf:$src0),
+ "; f32 Pseudo branch instruction",
+ [(Op bb:$target, (f32 rcf:$src0))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
+ !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+ def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+ def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+ !strconcat(name, " $src0, $src1"), []>;
+}
-
-//===--------------------------------------------------------------------===//
-// Instructions support
-//===--------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
// Custom Inserter for Branches and returns, this eventually will be a
// separate pass
@@ -1497,13 +1522,22 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
}
//===---------------------------------------------------------------------===//
-// Flow and Program control Instructions
+// Return instruction
//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+ usesCustomInserter = 1 in {
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(IL_retflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src),
+ "IF_PREDICATE_SET $src", []>;
+
let isTerminator=1 in {
- def SWITCH : ILFormat< (outs), (ins GPRI32:$src),
- !strconcat("SWITCH", " $src"), []>;
- def CASE : ILFormat< (outs), (ins GPRI32:$src),
- !strconcat("CASE", " $src"), []>;
def BREAK : ILFormat< (outs), (ins),
"BREAK", []>;
def CONTINUE : ILFormat< (outs), (ins),
@@ -1548,6 +1582,60 @@ let isTerminator=1 in {
}
//===----------------------------------------------------------------------===//
+// Indirect addressing pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+class ExtractVertical <RegisterClass vec_rc> : InstR600 <
+ (outs R600_Reg32:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+let Constraints = "$dst = $vec" in {
+
+class InsertVertical <RegisterClass vec_rc> : InstR600 <
+ (outs vec_rc:$dst),
+ (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
+ [],
+ AnyALU
+>;
+
+} // End Constraints = "$dst = $vec"
+
+} // End isPseudo = 1
+
+def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
+def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
+
+def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
+def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
+
+class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
+ (inst $vec, $index)
+>;
+
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
+
+class InsertVerticalPat <Instruction inst, ValueType vec_ty,
+ ValueType scalar_ty> : Pat <
+ (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
+ (inst $vec, $value, $index)
+>;
+
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
+
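
The patterns above route dynamically indexed extractelement/insertelement to pseudos whose vector operand lives in a vertical register class. As a rough picture (an assumption for illustration, not taken from the patch): a horizontal v4 occupies the four channels of one register, while a vertical v4 occupies one channel of four consecutive registers, so a dynamic element index turns into register-index arithmetic on a single channel:

#include <cstdio>

int main() {
  const char Chans[] = "XYZW";
  for (int Elt = 0; Elt < 4; ++Elt)
    std::printf("elt%d: horizontal -> T0.%c, vertical -> T%d.X\n", Elt,
                Chans[Elt], Elt);
  return 0;
}
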
+//===----------------------------------------------------------------------===//
// ISel Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d1655d1..7ea654c 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index c2f6c03..74cf309 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index f3bb88b..dc95675 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -20,15 +20,14 @@
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
-: AMDGPURegisterInfo(tm),
- TM(tm)
+R600RegisterInfo::R600RegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
{ RCW.RegWeight = 0; RCW.WeightLimit = 0;}
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+ const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(ST.getInstrInfo());
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
@@ -55,16 +54,6 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
-const TargetRegisterClass *
-R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
- switch (rc->getID()) {
- case AMDGPU::GPRF32RegClassID:
- case AMDGPU::GPRI32RegClassID:
- return &AMDGPU::R600_Reg32RegClass;
- default: return rc;
- }
-}
-
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 52e1a4b..247808b 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -16,26 +16,18 @@
#define R600REGISTERINFO_H_
#include "AMDGPURegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
namespace llvm {
-class R600TargetMachine;
+class AMDGPUSubtarget;
struct R600RegisterInfo : public AMDGPURegisterInfo {
- AMDGPUTargetMachine &TM;
RegClassWeight RCW;
- R600RegisterInfo(AMDGPUTargetMachine &tm);
+ R600RegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override;
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns the R600 reg class that is equivalent to \p RC.
- const TargetRegisterClass *getISARegClass(
- const TargetRegisterClass *RC) const override;
-
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 68bcd20..cc667d9 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> :
class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
RegisterWithSubRegs<n, subregs> {
+ field bits<2> chan_encoding = 0;
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1, sub2, sub3];
- let HWEncoding = encoding;
+ let HWEncoding{8-0} = encoding{8-0};
+ let HWEncoding{10-9} = chan_encoding;
}
class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
RegisterWithSubRegs<n, subregs> {
+ field bits<2> chan_encoding = 0;
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = encoding;
+ let HWEncoding{8-0} = encoding{8-0};
+ let HWEncoding{10-9} = chan_encoding;
}
+class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
+ "V"#lo#hi#"_"#chan,
+ [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
+ lo
+>;
foreach Index = 0-127 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
@@ -54,6 +64,24 @@ foreach Index = 0-127 in {
Index>;
}
+foreach Chan = [ "X", "Y", "Z", "W"] in {
+
+ let chan_encoding = !if(!eq(Chan, "X"), 0,
+ !if(!eq(Chan, "Y"), 1,
+ !if(!eq(Chan, "Z"), 2,
+ !if(!eq(Chan, "W"), 3, 0)))) in {
+ def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
+ [!cast<Register>("T0_"#Chan),
+ !cast<Register>("T1_"#Chan),
+ !cast<Register>("T2_"#Chan),
+ !cast<Register>("T3_"#Chan)],
+ 0>;
+ def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
+ def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
+ }
+}
+
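
With chan_encoding stored in bits 10-9 above the 9-bit register index, the channel can be recovered from HWEncoding with a shift and mask, which is presumably what getHWRegChan's HW_CHAN_SHIFT does in R600RegisterInfo.cpp. A small decoding sketch under that assumption:

#include <cstdint>

static unsigned hwRegIndex(uint16_t Encoding) {
  return Encoding & 0x1ff; // Bits 8-0: register index.
}

static unsigned hwRegChan(uint16_t Encoding) {
  return (Encoding >> 9) & 0x3; // Bits 10-9: vertical channel.
}
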
// KCACHE_BANK0
foreach Index = 159-128 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
@@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>;
let isAllocatable = 0 in {
-// XXX: Only use the X channel, until we support wider stack widths
-def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
+
+// We only use Addr_[YZW] for vertical vectors.
+// FIXME: If we add more vertical vector registers, we will need to add more
+// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
+def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
+def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
(add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
@@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
let CopyCost = -1;
}
+def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+ (add V0123_W, V0123_Z, V0123_Y, V0123_X)
+>;
+
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
(add (sequence "T%u_XY", 0, 63))>;
+
+def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add V01_X, V01_Y, V01_Z, V01_W,
+ V23_X, V23_Y, V23_Z, V23_W)>;
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index d6e4451..91eb60b 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -65,7 +65,6 @@ class SIAnnotateControlFlow : public FunctionPass {
DominatorTree *DT;
StackVector Stack;
- SSAUpdater PhiInserter;
bool isTopOfStack(BasicBlock *BB);
@@ -81,7 +80,7 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- void handleLoopCondition(Value *Cond);
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken);
void handleLoop(BranchInst *Term);
@@ -177,7 +176,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
} else {
if (Phi->getIncomingValue(i) != BoolFalse)
return false;
-
+
}
}
return true;
@@ -204,20 +203,26 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
}
/// \brief Recursively handle the condition leading to a loop
-void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) {
if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+ BasicBlock *Parent = Phi->getParent();
+ PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+ Value *Ret = NewPhi;
// Handle all non-constant incoming values first
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
Value *Incoming = Phi->getIncomingValue(i);
- if (isa<ConstantInt>(Incoming))
+ BasicBlock *From = Phi->getIncomingBlock(i);
+ if (isa<ConstantInt>(Incoming)) {
+ NewPhi->addIncoming(Broken, From);
continue;
+ }
Phi->setIncomingValue(i, BoolFalse);
- handleLoopCondition(Incoming);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken);
+ NewPhi->addIncoming(PhiArg, From);
}
- BasicBlock *Parent = Phi->getParent();
BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
@@ -230,33 +235,28 @@ void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
if (From == IDom) {
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
- Value *Args[] = {
- OldEnd->getArgOperand(0),
- PhiInserter.GetValueAtEndOfBlock(Parent)
- };
- Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
- PhiInserter.AddAvailableValue(Parent, Ret);
+ Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
+ Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
continue;
}
}
-
TerminatorInst *Insert = From->getTerminator();
- Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
- Value *Ret = CallInst::Create(Break, Arg, "", Insert);
- PhiInserter.AddAvailableValue(From, Ret);
+ Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
+ NewPhi->setIncomingValue(i, PhiArg);
}
eraseIfUnused(Phi);
+ return Ret;
} else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
TerminatorInst *Insert = Parent->getTerminator();
- Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
- Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
- PhiInserter.AddAvailableValue(Parent, Ret);
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
} else {
llvm_unreachable("Unhandled loop condition!");
}
+ return nullptr;
}
/// \brief Handle a back edge (loop)
@@ -264,15 +264,11 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
- PhiInserter.Initialize(Int64, "");
- PhiInserter.AddAvailableValue(Target, Broken);
-
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- handleLoopCondition(Cond);
+ Value *Arg = handleLoopCondition(Cond, Broken);
BasicBlock *BB = Term->getParent();
- Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 2cbce28..4d31a11 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -35,4 +35,54 @@ enum {
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
+
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0)
+#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F)
+#define C_00B848_VGPRS 0xFFFFFFC0
+#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6)
+#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F)
+#define C_00B848_SGPRS 0xFFFFFC3F
+#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10)
+#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03)
+#define C_00B848_PRIORITY 0xFFFFF3FF
+#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12)
+#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF)
+#define C_00B848_FLOAT_MODE 0xFFF00FFF
+#define S_00B848_PRIV(x) (((x) & 0x1) << 20)
+#define G_00B848_PRIV(x) (((x) >> 20) & 0x1)
+#define C_00B848_PRIV 0xFFEFFFFF
+#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21)
+#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1)
+#define C_00B848_DX10_CLAMP 0xFFDFFFFF
+#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22)
+#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1)
+#define C_00B848_DEBUG_MODE 0xFFBFFFFF
+#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
+#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
+#define C_00B848_IEEE_MODE 0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 7:6 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
#endif // SIDEFINES_H_
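
A sketch of how the new helpers compose (illustrative values, assuming this header is included): pick per-precision rounding and denormal behavior, then place the packed byte in the FLOAT_MODE field of COMPUTE_PGM_RSRC1.

#include <cstdint>
#include "SIDefines.h" // The macros added above.

static uint32_t buildFloatModeRsrc1() {
  uint32_t FloatMode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
                       FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO) |
                       FP_DENORM_MODE_SP(FP_DENORM_FLUSH_IN_FLUSH_OUT) |
                       FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
  return S_00B848_FLOAT_MODE(FloatMode);
}
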
diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
new file mode 100644
index 0000000..7d116ee
--- /dev/null
+++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
@@ -0,0 +1,110 @@
+//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// SALU instructions ignore control flow, so we need to modify the live ranges
+/// of the registers they define.
+///
+/// The strategy is to view the entire program as if it were a single basic
+/// block and calculate the intervals accordingly. We implement this
+/// by walking the list of segments for each LiveRange and setting the
+/// end of each segment equal to the start of the segment that immediately
+/// follows it.
+
+#include "AMDGPU.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
+
+namespace {
+
+class SIFixSGPRLiveRanges : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
+ initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Fix SGPR live ranges";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
+ "SI Fix SGPR Live Ranges", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
+ "SI Fix SGPR Live Ranges", false, false)
+
+char SIFixSGPRLiveRanges::ID = 0;
+
+char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
+
+FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
+ return new SIFixSGPRLiveRanges();
+}
+
+bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
+ if (ExecUse)
+ continue;
+
+ for (const MachineOperand &Def : MI.operands()) {
+ if (!Def.isReg() || !Def.isDef() ||
+ !TargetRegisterInfo::isVirtualRegister(Def.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
+
+ if (!TRI->isSGPRClass(RC))
+ continue;
+ LiveInterval &LI = LIS->getInterval(Def.getReg());
+ for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
+ LiveRange::Segment &Seg = LI.segments[i];
+ LiveRange::Segment &Next = LI.segments[i + 1];
+ Seg.end = Next.start;
+ }
+ }
+ }
+ }
+
+ return false;
+}
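
The interval surgery at the bottom of runOnMachineFunction reduces to: stretch every segment but the last so it meets the start of its successor. A standalone model, with a hypothetical Segment type standing in for LiveRange::Segment:

#include <cstddef>
#include <vector>

struct Segment { unsigned Start, End; };

// Treat the whole function as one basic block: each segment's end becomes
// the start of the segment that follows it.
static void flattenLiveRange(std::vector<Segment> &Segments) {
  for (std::size_t I = 0; I + 1 < Segments.size(); ++I)
    Segments[I].End = Segments[I + 1].Start;
}
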
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index c9e247c..b13c3b8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,8 +14,8 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
+#include "llvm/ADT/SmallString.h"
using namespace llvm;
@@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::ADD, MVT::i32, Legal);
setOperationAction(ISD::ADDC, MVT::i32, Legal);
setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ setOperationAction(ISD::SUBC, MVT::i32, Legal);
+ setOperationAction(ISD::SUBE, MVT::i32, Legal);
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -88,14 +91,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
// We need to custom lower loads/stores from private memory
setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
@@ -105,18 +106,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
- setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -139,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
@@ -215,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
+ // FIXME: These should be removed and handled the same way as f32 fneg. Source
+ // modifiers also work for the double instructions.
+ setOperationAction(ISD::FNEG, MVT::f64, Expand);
+ setOperationAction(ISD::FABS, MVT::f64, Expand);
+
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
+
setSchedulingPreference(Sched::RegPressure);
}
@@ -265,8 +270,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32);
}
-bool SITargetLowering::shouldSplitVectorType(EVT VT) const {
- return VT.getScalarType().bitsLE(MVT::i16);
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ return TypeSplitVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
}
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -482,19 +491,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
- case AMDGPU::V_SUB_F64:
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addReg(MI->getOperand(2).getReg())
- .addImm(0) /* src2 */
- .addImm(0) /* ABS */
- .addImm(0) /* CLAMP */
- .addImm(0) /* OMOD */
- .addImm(2); /* NEG */
+ case AMDGPU::V_SUB_F64: {
+ unsigned DestReg = MI->getOperand(0).getReg();
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+ .addImm(0) // SRC0 modifiers
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(1) // SRC1 modifiers
+ .addReg(MI->getOperand(2).getReg())
+ .addImm(0) // SRC2 modifiers
+ .addImm(0) // src2
+ .addImm(0) // CLAMP
+ .addImm(0); // OMOD
MI->eraseFromParent();
break;
-
+ }
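
The rewritten V_SUB_F64 expansion leans on the VOP3 per-source modifier operands: assuming bit 0 of the SRC1 modifiers immediate is the negate bit (matching the trailing NEG immediate of 2 that the old form applied to source 1), the emitted V_ADD_F64 computes a subtraction:

// Semantics sketch (assumed): with the SRC1 negate modifier set,
// V_ADD_F64 evaluates dst = src0 + (-src1).
static double v_sub_f64(double Src0, double Src1) {
  return Src0 + (-Src1);
}
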
case AMDGPU::SI_RegisterStorePseudo: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -595,27 +605,31 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+
+ // These loads are legal.
+ if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ VT.isVector() && VT.getVectorNumElements() == 2 &&
+ VT.getVectorElementType() == MVT::i32)
+ return SDValue();
+
if (Op.getValueType().isVector() &&
(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
(Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
Op.getValueType().getVectorNumElements() > 4))) {
- SDValue MergedValues[2] = {
- SplitVectorLoad(Op, DAG),
- Load->getChain()
- };
- return DAG.getMergeValues(MergedValues, SDLoc(Op));
+ return SplitVectorLoad(Op, DAG);
} else {
- return LowerLOAD(Op, DAG);
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
}
}
case ISD::SELECT: return LowerSELECT(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::ANY_EXTEND: // Fall-through
- case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID =
@@ -827,13 +841,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- SDValue MergedValues[2];
- MergedValues[1] = Load->getChain();
- if (Ret.getNode()) {
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
- }
+ SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Lowered.getNode())
+ return Lowered;
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
@@ -846,25 +856,38 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
- Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+
+ // FIXME: REGISTER_LOAD should probably have a chain result.
+ SDValue Chain = Load->getChain();
+ SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+
+ SDValue Ret = LoLoad.getValue(0);
if (MemVT.getSizeInBits() == 64) {
+ // TODO: This needs a test to make sure the right thing is happening with
+ // the chain. That is hard without general function support.
+
SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
DAG.getConstant(1, MVT::i32));
- SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), IncPtr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+ SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, IncPtr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
- Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
+ Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
+ // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ // LoLoad.getValue(1), HiLoad.getValue(1));
}
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
+ SDValue Ops[] = {
+ Ret,
+ Chain
+ };
+ return DAG.getMergeValues(Ops, DL);
}
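
Stripped of the DAG plumbing, the private-address path above is a word-indexed register-file read; the 64-bit case reads two adjacent dwords and repacks them. A scalar sketch, with a hypothetical RegFile array standing in for REGISTER_LOAD:

#include <cstdint>

static uint64_t loadPrivateI64(const uint32_t *RegFile, uint32_t ByteAddr) {
  uint32_t Ptr = ByteAddr >> 2;     // SRL by 2: byte to dword index.
  uint32_t Lo = RegFile[Ptr];       // LoLoad.
  uint32_t Hi = RegFile[Ptr + 1];   // HiLoad at IncPtr.
  return (uint64_t(Hi) << 32) | Lo; // BUILD_PAIR(LoLoad, HiLoad).
}
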
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
@@ -903,39 +926,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}
-SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue True = Op.getOperand(2);
- SDValue False = Op.getOperand(3);
- SDValue CC = Op.getOperand(4);
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
-}
-
-SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- if (VT != MVT::i64) {
- return SDValue();
- }
-
- SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
- DAG.getConstant(31, MVT::i32));
-
- return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
-}
-
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
+ // These stores are legal.
+ if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ VT.isVector() && VT.getVectorNumElements() == 2 &&
+ VT.getVectorElementType() == MVT::i32)
+ return SDValue();
+
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
@@ -1011,27 +1012,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Chain;
}
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ EVT ScalarVT = VT.getScalarType();
+ if (ScalarVT != MVT::f32)
+ return SDValue();
-SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
- if (VT != MVT::i64) {
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // TODO: We could try to match extracting the higher bytes, which would be
+ // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+ // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+ // about in practice.
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+ DCI.AddToWorklist(Cvt.getNode());
+ return Cvt;
+ }
+ }
+
+ // We are primarily trying to catch operations on illegal vector types
+ // before they are expanded.
+ // For scalars, we can use the more flexible method of checking masked bits
+ // after legalization.
+ if (!DCI.isBeforeLegalize() ||
+ !SrcVT.isVector() ||
+ SrcVT.getVectorElementType() != MVT::i8) {
return SDValue();
}
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() != MVT::i32)
- Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+ assert(DCI.isBeforeLegalize() && "Unexpected legal type");
- SDValue Zero = DAG.getConstant(0, MVT::i32);
- return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
-}
+ // Weird sized vectors are a pain to handle, but we know 3 is really the same
+ // size as 4.
+ unsigned NElts = SrcVT.getVectorNumElements();
+ if (!SrcVT.isSimple() && NElts != 3)
+ return SDValue();
-//===----------------------------------------------------------------------===//
-// Custom DAG optimizations
-//===----------------------------------------------------------------------===//
+ // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+ // prevent a mess from expanding to v4i32 and repacking.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+ EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+ EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+ LoadSDNode *Load = cast<LoadSDNode>(Src);
+ SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+ Load->getChain(),
+ Load->getBasePtr(),
+ LoadVT,
+ Load->getMemOperand());
+
+ // Make sure users of the original load's chain stay after it by updating
+ // them to use the new chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+ SmallVector<SDValue, 4> Elts;
+ if (RegVT.isVector())
+ DAG.ExtractVectorElements(NewLoad, Elts);
+ else
+ Elts.push_back(NewLoad);
+
+ SmallVector<SDValue, 4> Ops;
+
+ unsigned EltIdx = 0;
+ for (SDValue Elt : Elts) {
+ unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+ for (unsigned I = 0; I < ComponentsInElt; ++I) {
+ unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+ SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+ DCI.AddToWorklist(Cvt.getNode());
+ Ops.push_back(Cvt);
+ }
+
+ ++EltIdx;
+ }
+
+ assert(Ops.size() == NElts);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+ }
+
+ return SDValue();
+}
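
CVT_F32_UBYTEn itself has simple semantics: convert byte n of the 32-bit source to float. A scalar sketch (names assumed), which also shows why a uint_to_fp whose source has its top 24 bits known zero is exactly CVT_F32_UBYTE0:

#include <cstdint>

static float cvtF32UByte(uint32_t Src, unsigned N) {
  return float((Src >> (8 * N)) & 0xffu); // Byte N of Src, zero-extended.
}
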
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -1074,6 +1147,31 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+
+ case AMDGPUISD::CVT_F32_UBYTE0:
+ case AMDGPUISD::CVT_F32_UBYTE1:
+ case AMDGPUISD::CVT_F32_UBYTE2:
+ case AMDGPUISD::CVT_F32_UBYTE3: {
+ unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+ SDValue Src = N->getOperand(0);
+ APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+ TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ break;
+ }
+
+ case ISD::UINT_TO_FP: {
+ return performUCharToFloatCombine(N, DCI);
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -1297,7 +1395,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
bool HaveVSrc = false, HaveSSrc = false;
- // First figure out what we alread have in this instruction
+ // First figure out what we already have in this instruction.
for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
i != e && Op < NumOps; ++i, ++Op) {
@@ -1316,7 +1414,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
}
}
- // If we neither have VSrc nor SSrc it makes no sense to continue
+ // If we neither have VSrc nor SSrc, it makes no sense to continue.
if (!HaveVSrc && !HaveSSrc)
return Node;
@@ -1332,17 +1430,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
const SDValue &Operand = Node->getOperand(i);
Ops.push_back(Operand);
- // Already folded immediate ?
+ // Already folded immediate?
if (isa<ConstantSDNode>(Operand.getNode()) ||
isa<ConstantFPSDNode>(Operand.getNode()))
continue;
- // Is this a VSrc or SSrc operand ?
+ // Is this a VSrc or SSrc operand?
unsigned RegClass = Desc->OpInfo[Op].RegClass;
if (isVSrc(RegClass) || isSSrc(RegClass)) {
// Try to fold the immediates
if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
- // Folding didn't worked, make sure we don't hit the SReg limit
+ // Folding didn't work, make sure we don't hit the SReg limit.
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
}
continue;
@@ -1371,7 +1469,6 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
continue;
if (DescE64) {
-
// Test if it makes sense to switch to e64 encoding
unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
@@ -1402,7 +1499,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
if (!DescE64)
continue;
Desc = DescE64;
- DescE64 = 0;
+ DescE64 = nullptr;
}
else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
Ops.pop_back();
@@ -1412,7 +1509,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
if (!DescE64)
continue;
Desc = DescE64;
- DescE64 = 0;
+ DescE64 = nullptr;
}
}
@@ -1535,7 +1632,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
-/// \brief Fold the instructions after slecting them
+/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const SIInstrInfo *TII =
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index c6eaa81..e25323a 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -27,10 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
@@ -46,11 +43,16 @@ class SITargetLowering : public AMDGPUTargetLowering {
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
+ static SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI);
+
public:
SITargetLowering(TargetMachine &tm);
bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
bool *IsFast) const override;
- bool shouldSplitVectorType(EVT VT) const override;
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index a17fed7..1733326 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
return Result;
}
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
bool Changes = false;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 168eff2..7cae9fc 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -51,6 +51,16 @@ class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
let Size = 8;
}
+class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ Enc64 <outs, ins, asm, pattern> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP3 = 1;
+}
+
//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//
@@ -207,7 +217,7 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
}
class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- Enc64 <outs, ins, asm, pattern> {
+ VOP3Common <outs, ins, asm, pattern> {
bits<8> dst;
bits<2> src0_modifiers;
@@ -233,16 +243,11 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{61} = src0_modifiers{0};
let Inst{62} = src1_modifiers{0};
let Inst{63} = src2_modifiers{0};
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP3 = 1;
 }
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- Enc64 <outs, ins, asm, pattern> {
+ VOP3Common <outs, ins, asm, pattern> {
bits<8> dst;
bits<2> src0_modifiers;
@@ -266,11 +271,6 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Inst{62} = src1_modifiers{0};
let Inst{63} = src2_modifiers{0};
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP3 = 1;
}
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 4a9e346..455c890 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -19,13 +19,14 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;
-SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
- : AMDGPUInstrInfo(tm),
- RI(tm) { }
+SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
+ : AMDGPUInstrInfo(st),
+ RI(st) { }
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -187,18 +188,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned KillFlag = isKill ? RegState::Kill : 0;
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
- unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+ if (RI.hasVGPRs(RC)) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
+ .addReg(SrcReg);
+ } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
+ unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
+ unsigned TgtReg = MFI->SpillTracker.LaneVGPR;
- BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
.addReg(SrcReg, KillFlag)
.addImm(Lane);
- MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+ MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
} else if (RI.isSGPRClass(RC)) {
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for vector
@@ -207,8 +215,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// Reserve a spot in the spill tracker for each sub-register of
// the vector register.
unsigned NumSubRegs = RC->getSize() / 4;
- unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
- NumSubRegs);
+ unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
FirstLane);
@@ -234,19 +241,19 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
DebugLoc DL = MBB.findDebugLoc(MI);
- if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
- SIMachineFunctionInfo::SpilledReg Spill =
- MFI->SpillTracker.getSpilledReg(FrameIndex);
- assert(Spill.VGPR);
- BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane);
- insertNOPs(MI, 3);
+
+ if (RI.hasVGPRs(RC)) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(0);
} else if (RI.isSGPRClass(RC)){
unsigned Opcode;
switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
@@ -260,7 +267,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addReg(Spill.VGPR)
.addImm(FrameIndex);
- insertNOPs(MI, 3);
} else {
llvm_unreachable("VGPR spilling not supported");
}
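
Note: the SGPR path above spills through lanes of a reserved VGPR: V_WRITELANE_B32 deposits a scalar into one lane and V_READLANE_B32 reads it back. A rough host-side model assuming a 64-lane wavefront (struct and helper names are invented):

#include <cstdint>

// A VGPR holds one 32-bit value per lane, so the SpillTracker can park
// one spilled SGPR in each lane of the shared LaneVGPR.
struct VGPRModel {
  uint32_t Lane[64]; // one value per lane of a 64-wide wavefront
};

static void writeLane(VGPRModel &V, unsigned Lane, uint32_t SVal) {
  V.Lane[Lane] = SVal; // V_WRITELANE_B32: scalar -> one lane
}

static uint32_t readLane(const VGPRModel &V, unsigned Lane) {
  return V.Lane[Lane]; // V_READLANE_B32: one lane -> scalar
}
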
@@ -281,6 +287,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
return 2;
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ return 1;
default: llvm_unreachable("Invalid spill opcode");
}
}
@@ -334,7 +342,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
- case AMDGPU::SI_SPILL_S64_RESTORE: {
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
@@ -348,6 +357,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
.addReg(MI->getOperand(1).getReg())
.addImm(Spill.Lane + i);
}
+ insertNOPs(MI, 3);
MI->eraseFromParent();
break;
}
@@ -514,6 +524,23 @@ bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
}
+static bool compareMachineOp(const MachineOperand &Op0,
+ const MachineOperand &Op1) {
+ if (Op0.getType() != Op1.getType())
+ return false;
+
+ switch (Op0.getType()) {
+ case MachineOperand::MO_Register:
+ return Op0.getReg() == Op1.getReg();
+ case MachineOperand::MO_Immediate:
+ return Op0.getImm() == Op1.getImm();
+ case MachineOperand::MO_FPImmediate:
+ return Op0.getFPImm() == Op1.getFPImm();
+ default:
+ llvm_unreachable("Didn't expect to be comparing these operand types");
+ }
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@@ -532,7 +559,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Make sure the register classes are correct
for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
switch (Desc.OpInfo[i].OperandType) {
- case MCOI::OPERAND_REGISTER:
+ case MCOI::OPERAND_REGISTER: {
+ int RegClass = Desc.OpInfo[i].RegClass;
+ if (!RI.regClassCanUseImmediate(RegClass) &&
+ (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
+ ErrInfo = "Expected register, but got immediate";
+ return false;
+ }
+ }
break;
case MCOI::OPERAND_IMMEDIATE:
if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
@@ -620,6 +654,24 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
return false;
}
}
+
+ // Verify misc. restrictions on specific instructions.
+ if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+ Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+ MI->dump();
+
+ const MachineOperand &Src0 = MI->getOperand(2);
+ const MachineOperand &Src1 = MI->getOperand(3);
+ const MachineOperand &Src2 = MI->getOperand(4);
+ if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+ if (!compareMachineOp(Src0, Src1) &&
+ !compareMachineOp(Src0, Src2)) {
+ ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+ return false;
+ }
+ }
+ }
+
return true;
}
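
Note: the V_DIV_SCALE check boils down to a simple rule on the three sources. A sketch of the predicate over plain register ids (the helper is hypothetical):

// The constraint verified above: src0 must repeat either src1 or src2.
static bool isValidDivScaleOperands(unsigned Src0, unsigned Src1,
                                    unsigned Src2) {
  return Src0 == Src1 || Src0 == Src2;
}
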
@@ -654,7 +706,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+ case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
@@ -667,6 +721,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+ case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+ case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
}
}
@@ -731,8 +788,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
unsigned SubReg = MRI.createVirtualRegister(SubRC);
// Just in case the super register is itself a sub-register, copy it to a new
- // value so we don't need to wory about merging its subreg index with the
- // SubIdx passed to this function. The register coalescer should be able to
+ // value so we don't need to worry about merging its subreg index with the
+ // SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
NewSuperReg)
@@ -1157,22 +1214,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
continue;
}
case AMDGPU::S_AND_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
Inst->eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ Inst->eraseFromParent();
+ continue;
+
+ case AMDGPU::S_BCNT1_I32_B64:
+ splitScalar64BitBCNT(Worklist, Inst);
Inst->eraseFromParent();
continue;
@@ -1217,6 +1279,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// 3 to not hit an assertion later in MCInstLower.
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ Inst->addOperand(MachineOperand::CreateImm(0));
}
addDescImplicitUseDef(NewDesc, Inst);
@@ -1297,9 +1363,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::VReg_32RegClass;
}
-void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineInstr *Inst,
- unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitUnaryOp(
+ SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src0 = Inst->getOperand(1);
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineBasicBlock::iterator MII = Inst;
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+ const TargetRegisterClass *Src0RC = Src0.isReg() ?
+ MRI.getRegClass(Src0.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+ MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub0, Src0SubRC);
+
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+
+ unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+ MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ .addOperand(SrcReg0Sub0);
+
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+
+ unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+ MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ .addOperand(SrcReg0Sub1);
+
+ unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Re-visit the produced halves in case their operands still need to be
+  // legalized.
+ Worklist.push_back(LoHalf);
+ Worklist.push_back(HiHalf);
+}
+
+void SIInstrInfo::splitScalar64BitBinaryOp(
+ SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst,
+ unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst->getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -1360,6 +1479,46 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
Worklist.push_back(HiHalf);
}
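
Note: both split helpers rely on the same invariant: a bitwise operation on 64 bits equals the 32-bit operation applied independently to each half. A host-side sketch for S_AND_B64 (function name invented):

#include <cstdint>

static uint64_t and64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) & uint32_t(B);              // S_AND_B32 on sub0
  uint32_t Hi = uint32_t(A >> 32) & uint32_t(B >> 32);  // S_AND_B32 on sub1
  return (uint64_t(Hi) << 32) | Lo;                     // REG_SEQUENCE sub0, sub1
}
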
+void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src = Inst->getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+ const TargetRegisterClass *SrcRC = Src.isReg() ?
+ MRI.getRegClass(Src.getReg()) :
+ &AMDGPU::SGPR_32RegClass;
+
+ unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+ AMDGPU::sub1, SrcSubRC);
+
+ MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+ .addOperand(SrcRegSub0)
+ .addImm(0);
+
+ MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+ .addOperand(SrcRegSub1)
+ .addReg(MidReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+ Worklist.push_back(First);
+ Worklist.push_back(Second);
+}
+
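
Note: splitScalar64BitBCNT leans on V_BCNT_U32_B32 adding its second operand, so the two emitted instructions form an accumulator chain. Modeled on the host (names invented):

#include <cstdint>

static unsigned popcount32(uint32_t X) {
  unsigned N = 0;
  for (; X; X &= X - 1) // clear the lowest set bit
    ++N;
  return N;
}

static unsigned popcount64ViaBcnt(uint64_t X) {
  unsigned Mid = popcount32(uint32_t(X)) + 0;   // bcnt(sub0, 0)
  return popcount32(uint32_t(X >> 32)) + Mid;   // bcnt(sub1, Mid)
}
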
void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
MachineInstr *Inst) const {
   // Add the implicit and explicit register definitions.
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 7b31a81..4c204d8 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -44,13 +44,19 @@ private:
const TargetRegisterClass *RC,
const MachineOperand &Op) const;
- void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
- MachineInstr *Inst, unsigned Opcode) const;
+ void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst, unsigned Opcode) const;
+
+ void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const;
void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
public:
- explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+ explicit SIInstrInfo(const AMDGPUSubtarget &st);
const SIRegisterInfo &getRegisterInfo() const override {
return RI;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 2242e6d..774c9d1 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -147,6 +147,12 @@ def FRAMEri32 : Operand<iPTR> {
}
//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+
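
Note: SelectMUBUFAddr64 lives in the AMDGPU instruction selector and yields the (srsrc, vaddr, offset) triple consumed by the MUBUF patterns below. The offset piece is the usual peel-a-small-immediate trick; a sketch of the idea only, not the selector itself:

#include <cstdint>
#include <utility>

// Split `Base + Imm` so the immediate rides in the 16-bit MUBUF offset
// field when it fits, leaving the rest in the 64-bit vaddr.
static std::pair<uint64_t, uint16_t> splitMubufAddr(uint64_t Addr,
                                                    uint64_t Imm) {
  if (Imm <= 0xffffu)
    return {Addr, uint16_t(Imm)};  // vaddr = Addr, offset = Imm
  return {Addr + Imm, 0};          // fold everything into vaddr
}
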
+//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -187,6 +193,12 @@ class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
opName#" $dst, $src0", pattern
>;
+// 64-bit input, 32-bit output.
+class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+ op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern
+>;
+
class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
opName#" $dst, $src0, $src1", pattern
@@ -260,7 +272,7 @@ class SIMCInstr <string pseudo, int subtarget> {
multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
string opName> {
- def "" : InstSI <outs, ins, "", pattern>, VOP <opName>,
+ def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>,
SIMCInstr<OpName, SISubtarget.NONE> {
let isPseudo = 1;
}
@@ -357,12 +369,13 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
}
multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
- string opName, ValueType vt, PatLeaf cond> {
-
+ string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
def _e32 : VOPC <
op, (ins arc:$src0, vrc:$src1),
opName#"_e32 $dst, $src0, $src1", []
- >, VOP <opName>;
+ >, VOP <opName> {
+ let Defs = !if(defExec, [VCC, EXEC], [VCC]);
+ }
def _e64 : VOP3 <
{0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
@@ -375,6 +388,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
[(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
)
>, VOP <opName> {
+ let Defs = !if(defExec, [EXEC], []);
let src2 = SIOperand.ZERO;
let src2_modifiers = 0;
}
@@ -388,6 +402,14 @@ multiclass VOPC_64 <bits<8> op, string opName,
ValueType vt = untyped, PatLeaf cond = COND_NULL>
: VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+multiclass VOPCX_32 <bits<8> op, string opName,
+ ValueType vt = untyped, PatLeaf cond = COND_NULL>
+ : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+
+multiclass VOPCX_64 <bits<8> op, string opName,
+ ValueType vt = untyped, PatLeaf cond = COND_NULL>
+ : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
+
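
Note: the defExec bit models the V_CMPX side effect: the compare mask is written to EXEC as well, so failing lanes stop executing. Roughly, per 64-lane wavefront (a sketch, not the ISA):

#include <cstdint>

// The result mask goes to both the destination (VCC here) and EXEC.
// Lanes that were already inactive compare as 0, so EXEC can only
// lose lanes.
static void vCmpx(uint64_t &Vcc, uint64_t &Exec, uint64_t CmpResult) {
  Vcc = CmpResult;
  Exec = CmpResult;
}
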
multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
op, (outs VReg_32:$dst),
(ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
@@ -396,7 +418,7 @@ multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
>;
-class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_32:$src1),
opName#" $dst, $src0, $src1", pattern
@@ -410,11 +432,29 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
- (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
+ (ins InputMods:$src0_modifiers, VSrc_64:$src0,
+ InputMods:$src1_modifiers, VSrc_64:$src1,
+ InputMods:$src2_modifiers, VSrc_64:$src2,
+ InstFlag:$clamp, InstFlag:$omod),
+ opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
+>, VOP <opName>;
+
+
+class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
+ string opName, list<dag> pattern> : VOP3 <
+ op, (outs vrc:$dst0, SReg_64:$dst1),
+ (ins arc:$src0, arc:$src1, arc:$src2,
InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
- opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+ opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
+
+class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+ VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+ VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
//===----------------------------------------------------------------------===//
// Vector I/O classes
//===----------------------------------------------------------------------===//
@@ -475,10 +515,11 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A
let vdst = 0;
}
+// 1 address, 1 data.
class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
asm#" $vdst, $addr, $data0, $offset, [M0]",
[]> {
@@ -487,6 +528,41 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
let mayLoad = 1;
}
+// 1 address, 2 data.
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs rc:$vdst),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+ asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
+// 1 address, 2 data.
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
+ asm#" $addr, $data0, $data1, $offset, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
+// 1 address, 1 data.
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+ op,
+ (outs),
+ (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
+ asm#" $addr, $data0, $offset, [M0]",
+ []> {
+
+ let data1 = 0;
+ let mayStore = 1;
+ let mayLoad = 1;
+}
+
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs),
@@ -500,7 +576,9 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
let mayLoad = 0;
}
-multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
let lds = 0, mayLoad = 1 in {
@@ -542,16 +620,19 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
- asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+ asm#" $vdata, $srsrc + $vaddr + $offset",
+ [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
+ i64:$vaddr, u16imm:$offset)))]>;
}
}
}
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+ ValueType store_vt, SDPatternOperator st> :
MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
u16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
- []> {
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
let mayLoad = 0;
let mayStore = 1;
@@ -658,6 +739,53 @@ multiclass MIMG_Sampler <bits<7> op, string asm> {
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
}
+class MIMG_Gather_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc> : MIMG <
+ op,
+ (outs dst_rc:$vdata),
+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
+ SReg_256:$srsrc, SReg_128:$ssamp),
+ asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+ #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+ []> {
+ let mayLoad = 1;
+ let mayStore = 0;
+
+ // DMASK was repurposed for GATHER4. 4 components are always
+ // returned and DMASK works like a swizzle - it selects
+ // the component to fetch. The only useful DMASK values are
+ // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+ // (red,red,red,red) etc.) The ISA document doesn't mention
+ // this.
+ // Therefore, disable all code which updates DMASK by setting these two:
+ let MIMG = 0;
+ let hasPostISelHook = 0;
+}
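
Note: per the comment above, GATHER4's dmask picks one component and the instruction returns that component of each of the four texels in the sample footprint. A sketch of the selection (helper name invented):

#include <array>

// dmask selects a single component (1=red, 2=green, 4=blue, 8=alpha);
// the result is that component gathered from all four texels.
static std::array<float, 4> gather4Component(
    const std::array<std::array<float, 4>, 4> &Texels, unsigned DMask) {
  unsigned C = DMask == 1 ? 0 : DMask == 2 ? 1 : DMask == 4 ? 2 : 3;
  return {Texels[0][C], Texels[1][C], Texels[2][C], Texels[3][C]};
}
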
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm> {
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+ defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
+}
+
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500fa78..b3b44e2 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -96,22 +96,35 @@ def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
[(set i32:$dst, (not i32:$src0))]
>;
-def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
+ [(set i64:$dst, (not i64:$src0))]
+>;
def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+ [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+>;
def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
} // End neverHasSideEffects = 1
////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
-////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
-////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
-////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
+ [(set i32:$dst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+
+////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
-////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+ [(set i32:$dst, (cttz_zero_undef i32:$src0))]
+>;
////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
-//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+
+def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+ [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+>;
+
//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
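
Note: S_FF1_I32_B32 scans for the first set bit from the LSB and S_FLBIT_I32_B32 from the MSB; the *_zero_undef DAG nodes match because the result for a zero input need not be defined. A host-side sketch of the mapping (names invented; precondition X != 0):

#include <cstdint>

static unsigned ff1(uint32_t X) {            // S_FF1_I32_B32 ~ cttz
  unsigned N = 0;
  while (!(X & 1u)) { X >>= 1; ++N; }
  return N;
}

static unsigned flbit(uint32_t X) {          // S_FLBIT_I32_B32 ~ ctlz
  unsigned N = 0;
  while (!(X & 0x80000000u)) { X <<= 1; ++N; }
  return N;
}
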
@@ -320,7 +333,7 @@ def S_CMPK_EQ_I32 : SOPK <
>;
*/
-let isCompare = 1 in {
+let isCompare = 1, Defs = [SCC] in {
def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
@@ -332,7 +345,7 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
-} // End isCompare = 1
+} // End isCompare = 1, Defs = [SCC]
let Defs = [SCC], isCommutable = 1 in {
def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
@@ -467,26 +480,26 @@ defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">;
-defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">;
-defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">;
-defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">;
-defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">;
-defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">;
-defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">;
-defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">;
-defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">;
-defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">;
-defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">;
-defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">;
-defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">;
-defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">;
-defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">;
-defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
+defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
+defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
+defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
+defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
+defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
+defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
+defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
+defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
+defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
+defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
+defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
+defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
+defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
+defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
+defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
+defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
@@ -505,26 +518,26 @@ defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">;
-defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">;
-defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">;
-defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">;
-defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">;
-defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">;
-defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">;
-defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">;
-defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">;
-defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">;
-defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">;
-defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">;
-defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">;
-defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">;
-defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">;
-defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">;
+defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">;
+defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">;
+defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">;
+defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">;
+defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">;
+defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">;
+defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">;
+defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">;
+defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">;
+defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">;
+defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">;
+defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">;
+defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">;
+defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">;
+defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">;
+defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
@@ -543,26 +556,26 @@ defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">;
-defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">;
-defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">;
-defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">;
-defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">;
-defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">;
-defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">;
-defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">;
-defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">;
-defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">;
-defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">;
-defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">;
-defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">;
-defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
-defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">;
-defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
+defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
+defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
+defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
+defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
+defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
+defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
+defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
+defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
+defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
+defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
+defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
+defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
+defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
+defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
@@ -611,18 +624,18 @@ defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">;
-defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">;
-defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">;
-defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">;
-defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">;
-defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">;
-defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">;
-defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
+defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
+defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
+defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
+defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
+defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
+defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
+defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
+defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
@@ -633,18 +646,18 @@ defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">;
-defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">;
-defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">;
-defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">;
-defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">;
-defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">;
-defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">;
-defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
+defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
+defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
+defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
+defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
+defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
+defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
+defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
+defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
@@ -655,18 +668,18 @@ defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">;
-defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">;
-defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">;
-defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">;
-defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">;
-defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">;
-defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">;
-defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
+defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
+defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
+defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
+defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
+defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
+defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
+defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
+defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
@@ -677,30 +690,30 @@ defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
+let hasSideEffects = 1 in {
-defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">;
-defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">;
-defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">;
-defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">;
-defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">;
-defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">;
-defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">;
-defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">;
+defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
+defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
+defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
+defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
+defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
+defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
+defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">;
+defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+} // End hasSideEffects = 1
defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
-let hasSideEffects = 1, Defs = [EXEC] in {
-defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">;
+} // End hasSideEffects = 1
defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
-let hasSideEffects = 1, Defs = [EXEC] in {
-defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
-} // End hasSideEffects = 1, Defs = [EXEC]
+let hasSideEffects = 1 in {
+defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+} // End hasSideEffects = 1
} // End isCompare = 1
@@ -708,8 +721,97 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
// DS Instructions
//===----------------------------------------------------------------------===//
-def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
-def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
+
+def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>;
+def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>;
+def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>;
+def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>;
+def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>;
+def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>;
+def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>;
+def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>;
+def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>;
+def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>;
+def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>;
+def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>;
+def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>;
+def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>;
+def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>;
+def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>;
+def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>;
+def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>;
+def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>;
+def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>;
+def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>;
+def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>;
+def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>;
+def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>;
+def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>;
+def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>;
+def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>;
+def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>;
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+
+let SubtargetPredicate = isCI in {
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+} // End isCI
+
+
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_RTN_F64", VReg_64>;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_RTN_F64", VReg_64>;
+
+//let SubtargetPredicate = isCI in {
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+//} // End isCI
+
+// TODO: _SRC2_* forms
+
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
@@ -744,32 +846,46 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>;
-defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>;
-defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>;
-defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
+ 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
+ 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
+ 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
+ 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
+ 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
+ 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
+ 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+>;
def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- 0x00000018, "BUFFER_STORE_BYTE", VReg_32
+ 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
>;
def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- 0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+ 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
>;
def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- 0x0000001c, "BUFFER_STORE_DWORD", VReg_32
+ 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
>;
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
+ 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
>;
def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
+ 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -885,31 +1001,31 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
-//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
-//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
-//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
-//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
-//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
-//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
-//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
-//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
-//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
-//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
-//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
-//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
-//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
-//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
-//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
-//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
-//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
-//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
-//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
-//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
-//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
-//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
-//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
-//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
-//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
+defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
+defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
+defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
+defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
@@ -962,8 +1078,12 @@ defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
[(set i32:$dst, (fp_to_sint f32:$src0))]
>;
defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
-//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
+ [(set i32:$dst, (f32_to_f16 f32:$src0))]
+>;
+defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
+ [(set f32:$dst, (f16_to_f32 i32:$src0))]
+>;
//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
@@ -973,10 +1093,18 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
[(set f64:$dst, (fextend f32:$src0))]
>;
-//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
-//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
-//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
-//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+>;
+defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
+ [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+>;
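The four AMDGPUcvt_f32_ubyteN nodes pick one byte of a 32-bit source and convert it to float. A scalar C++ sketch of the assumed semantics (cvt_f32_ubyte is a hypothetical helper, not backend code):

#include <cstdint>

// Byte N (little-endian order) of Src, converted to float. Every
// value in 0..255 is exactly representable in f32, so the conversion
// is lossless.
static float cvt_f32_ubyte(uint32_t Src, unsigned N) {
  uint8_t Byte = static_cast<uint8_t>(Src >> (8 * N));
  return static_cast<float>(Byte);
}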
defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
[(set i32:$dst, (fp_to_uint f64:$src0))]
>;
@@ -988,7 +1116,7 @@ defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
[(set f32:$dst, (AMDGPUfract f32:$src0))]
>;
defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
- [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))]
+ [(set f32:$dst, (ftrunc f32:$src0))]
>;
defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
[(set f32:$dst, (fceil f32:$src0))]
@@ -1006,24 +1134,33 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
[(set f32:$dst, (flog2 f32:$src0))]
>;
+
defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
- [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+ [(set f32:$dst, (AMDGPUrcp f32:$src0))]
>;
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
+ [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+>;
defm V_RSQ_LEGACY_F32 : VOP1_32 <
0x0000002d, "V_RSQ_LEGACY_F32",
- [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
+ [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
+ [(set f32:$dst, (AMDGPUrsq f32:$src0))]
>;
-defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
- [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+ [(set f64:$dst, (AMDGPUrcp f64:$src0))]
>;
defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
-defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
+ [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
+ [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+>;
defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
[(set f32:$dst, (fsqrt f32:$src0))]
>;
@@ -1211,7 +1348,7 @@ defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
-//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
@@ -1303,16 +1440,20 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
-def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
+ [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
+ [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
+>;
-def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
[(set i64:$dst, (shl i64:$src0, i32:$src1))]
>;
-def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
[(set i64:$dst, (srl i64:$src0, i32:$src1))]
>;
-def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64",
+def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
[(set i64:$dst, (sra i64:$src0, i32:$src1))]
>;
@@ -1336,14 +1477,23 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
} // isCommutable = 1
-defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
-def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
-defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
-def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
+def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
+ [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
+ [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+>;
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
-def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
+ [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -1500,7 +1650,7 @@ let usesCustomInserter = 1 in {
// constant that can be used with the ADDR64 MUBUF instructions.
def SI_ADDR64_RSRC : InstSI <
(outs SReg_128:$srsrc),
- (ins SReg_64:$ptr),
+ (ins SSrc_64:$ptr),
"", []
>;
@@ -1508,7 +1658,7 @@ def V_SUB_F64 : InstSI <
(outs VReg_64:$dst),
(ins VReg_64:$src0, VReg_64:$src1),
"V_SUB_F64 $dst, $src0, $src1",
- []
+ [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
>;
} // end usesCustomInserter
@@ -1529,6 +1679,7 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
}
+defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
@@ -1552,7 +1703,7 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+ (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
@@ -1564,11 +1715,6 @@ def : Pat <
$src0, $src1, $src2, $src3)
>;
-def : Pat <
- (f64 (fsub f64:$src0, f64:$src1)),
- (V_SUB_F64 $src0, $src1)
->;
-
//===----------------------------------------------------------------------===//
// SMRD Patterns
//===----------------------------------------------------------------------===//
@@ -1596,7 +1742,6 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
@@ -1615,6 +1760,24 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
+} // End Predicates = [isSI]
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI, isCFDepth0] in {
+
+def : Pat <
+ (i64 (ctpop i64:$src)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (S_BCNT1_I32_B64 $src), sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+} // Predicates = [isSI, isCFDepth0]
+
+let Predicates = [isSI] in {
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
@@ -1625,18 +1788,39 @@ def : Pat <
>;
//===----------------------------------------------------------------------===//
-// VOP2 Patterns
+// SOPP Patterns
//===----------------------------------------------------------------------===//
def : Pat <
- (or i64:$src0, i64:$src1),
+ (int_AMDGPU_barrier_global),
+ (S_BARRIER)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : RcpPat<V_RCP_F32_e32, f32>;
+def : RcpPat<V_RCP_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+defm : RsqPat<V_RSQ_F64_e32, f64>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+class BinOp64Pat <SDNode node, Instruction inst> : Pat <
+ (node i64:$src0, i64:$src1),
(INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
+ (inst (EXTRACT_SUBREG i64:$src0, sub0),
(EXTRACT_SUBREG i64:$src1, sub0)), sub0),
- (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
+ (inst (EXTRACT_SUBREG i64:$src0, sub1),
(EXTRACT_SUBREG i64:$src1, sub1)), sub1)
>;
+def : BinOp64Pat <or, V_OR_B32_e32>;
+def : BinOp64Pat <xor, V_XOR_B32_e32>;
+
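BinOp64Pat works because bitwise or/xor never carry between bit positions, so the 64-bit op decomposes into independent 32-bit ops on the sub0/sub1 halves. A scalar C++ sketch of that decomposition (or64_via_two_32 is a hypothetical helper mirroring the split in the pattern):

#include <cstdint>

// 64-bit OR via two 32-bit ORs on the low and high halves; valid for
// any lane-wise bitwise op (or, xor, and), unlike add, which carries
// between the halves.
static uint64_t or64_via_two_32(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) | static_cast<uint32_t>(B);
  uint32_t Hi = static_cast<uint32_t>(A >> 32) | static_cast<uint32_t>(B >> 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}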
class SextInReg <ValueType vt, int ShiftAmt> : Pat <
(sext_inreg i32:$src0, vt),
(V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
@@ -1645,10 +1829,82 @@ class SextInReg <ValueType vt, int ShiftAmt> : Pat <
def : SextInReg <i8, 24>;
def : SextInReg <i16, 16>;
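The SextInReg trick is the classic shift pair: move the field to the top of the register, then arithmetic-shift it back down. A C++ sketch (sext_inreg is a hypothetical helper, assuming the usual arithmetic right shift on int32_t):

#include <cstdint>

// Sign-extend the low Bits bits of Src, mirroring V_LSHLREV_B32
// followed by V_ASHRREV_I32 with shift amount 32 - Bits.
static int32_t sext_inreg(uint32_t Src, unsigned Bits) {
  unsigned Shift = 32 - Bits; // 24 for i8, 16 for i16
  return static_cast<int32_t>(Src << Shift) >> Shift;
}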
+def : Pat <
+ (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+ (V_BCNT_U32_B32_e32 $popcnt, $val)
+>;
+
+def : Pat <
+ (i32 (ctpop i32:$popcnt)),
+ (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
+>;
+
+def : Pat <
+ (i64 (ctpop i64:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
+ (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
+ sub0),
+ (V_MOV_B32_e32 0), sub1)
+>;
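The chained V_BCNT pattern relies on the instruction computing popcount(src0) + src1, so the inner count of the low half feeds the outer count of the high half as its accumulator. A scalar C++ sketch of the expansion (ctpop64 is a hypothetical model, not backend code):

#include <cstdint>

// popcount of a 64-bit value as two accumulating 32-bit counts; the
// final high word is zero, matching the (V_MOV_B32_e32 0) in sub1.
static uint64_t ctpop64(uint64_t Src) {
  uint32_t Lo = static_cast<uint32_t>(Src);
  uint32_t Hi = static_cast<uint32_t>(Src >> 32);
  uint32_t Cnt = __builtin_popcount(Lo); // inner V_BCNT, accumulator 0
  Cnt = __builtin_popcount(Hi) + Cnt;    // outer V_BCNT accumulates
  return Cnt;
}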
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm,
+ i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+ (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
+ (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+ $addr, $rsrc, $sampler)
+>;
+
+// Only the variants that make sense are defined.
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
/* SIsample for simple 1D texture lookup */
def : Pat <
(SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
@@ -1864,7 +2120,10 @@ def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
-
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
@@ -1894,7 +2153,7 @@ def FCLAMP_SI : AMDGPUShaderInst <
}
def : Pat <
- (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+ (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
(FCLAMP_SI f32:$src)
>;
@@ -2106,7 +2365,7 @@ def : Pat <
(V_MUL_HI_I32 $src0, $src1, (i32 0))
>;
-defm : BFIPatterns <V_BFI_B32>;
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
def : ROTRPattern <V_ALIGNBIT_B32>;
/********** ======================= **********/
@@ -2130,7 +2389,7 @@ defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
defm : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_B64, i64, local_load>;
+defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
def : Pat <
@@ -2139,48 +2398,109 @@ multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
>;
def : Pat <
- (frag vt:$src1, i32:$src0),
- (inst 0, $src0, $src1, 0)
+ (frag vt:$val, i32:$ptr),
+ (inst 0, $ptr, $val, 0)
>;
}
defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
-defm : DSWritePat <DS_WRITE_B64, i64, local_store>;
+defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
-def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
- (DS_ADD_U32_RTN 0, $ptr, $val, 0)>;
-
-def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
- (DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
+multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
+ def : Pat <
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+ >;
-//===----------------------------------------------------------------------===//
-// MUBUF Patterns
-//===----------------------------------------------------------------------===//
+ def : Pat <
+ (frag i32:$ptr, vt:$val),
+ (inst 0, $ptr, $val, 0)
+ >;
+}
-multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
- PatFrag global_ld, PatFrag constant_ld> {
+// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec.
+//
+// We need something for data0, so we set a register to -1. For the
+// non-rtn variants, the manual says the operation is
+// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1; setting D0 to uint_max
+// makes the increment unconditional, and I'm assuming the rtn
+// variants behave the same way.
+//
+// We also load this -1 with s_mov_b32 / s_mov_b64 even though it
+// needs to be a VGPR. The SGPR copy pass will fix that, and it's
+// easier since there is no v_mov_b64.
+multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
+ Instruction LoadImm, PatFrag frag> {
def : Pat <
- (vt (global_ld (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset))),
- (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
+ (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
>;
def : Pat <
- (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
- (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+ (frag i32:$ptr, (vt 1)),
+ (inst 0, $ptr, (LoadImm (vt -1)), 0)
>;
+}
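As a sanity check on the comment above, here is a minimal C++ sketch of the quoted DS_INC_U32 semantics (ds_inc_u32 is a hypothetical model, not backend code):

#include <cstdint>

// Model of the manual's DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1.
// With D0 = uint_max the compare only trips when DS[A] is already
// uint_max, and the 0 it produces equals DS[A] + 1 modulo 2^32, so
// the op degenerates to an unconditional wrapping increment.
static uint32_t ds_inc_u32(uint32_t Mem, uint32_t D0) {
  return (Mem >= D0) ? 0 : Mem + 1;
}
// ds_inc_u32(x, UINT32_MAX) == x + 1 (mod 2^32) for every x, which is
// why atomic_load_add_local with a constant 1 can select to DS_INC.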
+multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
def : Pat <
- (vt (global_ld i64:$ptr)),
- (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
+ (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
>;
def : Pat <
- (vt (global_ld (add i64:$ptr, i64:$offset))),
- (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ (frag i32:$ptr, vt:$cmp, vt:$swap),
+ (inst 0, $ptr, $cmp, $swap, 0)
>;
+}
+
+
+// 32-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_add_local>;
+defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_sub_local>;
+
+defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+
+defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag constant_ld> {
def : Pat <
(vt (constant_ld (add i64:$ptr, i64:$offset))),
(Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
@@ -2188,53 +2508,19 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
}
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
- sextloadi8_global, sextloadi8_constant>;
+ sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
- az_extloadi8_global, az_extloadi8_constant>;
+ az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
- sextloadi16_global, sextloadi16_constant>;
+ sextloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
- az_extloadi16_global, az_extloadi16_constant>;
+ az_extloadi16_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
- global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
- global_load, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
- az_extloadi32_global, az_extloadi32_constant>;
+ constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
- global_load, constant_load>;
+ constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
- global_load, constant_load>;
-
-multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
-
- def : Pat <
- (st vt:$value, (mubuf_vaddr_offset i64:$ptr, i64:$offset, IMM12bit:$imm_offset)),
- (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, (as_i16imm $imm_offset))
- >;
-
- def : Pat <
- (st vt:$value, (add i64:$ptr, IMM12bit:$offset)),
- (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
- >;
-
- def : Pat <
- (st vt:$value, i64:$ptr),
- (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
- >;
-
- def : Pat <
- (st vt:$value, (add i64:$ptr, i64:$offset)),
- (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
- >;
-}
-
-defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+ constant_load>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@@ -2301,7 +2587,7 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
-let Predicates = [isCI] in {
+let SubtargetPredicate = isCI in {
// Sea Islands new arithmetic instructions
let neverHasSideEffects = 1 in {
@@ -2348,7 +2634,7 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-} // End Predicates = [isCI]
+} // End SubtargetPredicate = isCI
/********** ====================== **********/
@@ -2360,13 +2646,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
// 1. Extract with offset
def : Pat<
(vector_extract vt:$vec, (add i32:$idx, imm:$off)),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
+ (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
>;
// 2. Extract without offset
def : Pat<
(vector_extract vt:$vec, i32:$idx),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
+ (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
>;
// 3. Insert with offset
@@ -2392,20 +2678,6 @@ defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
-/********** =============== **********/
-/********** Conditions **********/
-/********** =============== **********/
-
-def : Pat<
- (i1 (setcc f32:$src0, f32:$src1, SETO)),
- (V_CMP_O_F32_e64 $src0, $src1)
->;
-
-def : Pat<
- (i1 (setcc f32:$src0, f32:$src1, SETUO)),
- (V_CMP_U_F32_e64 $src0, $src1)
->;
-
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
@@ -2439,6 +2711,62 @@ def : Pat <
(S_MOV_B32 -1), sub1)
>;
+class ZExt_i64_i32_Pat <SDNode ext> : Pat <
+ (i64 (ext i32:$src)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+ (i64 (ext i1:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
+ (S_MOV_B32 0), sub1)
+>;
+
+
+def : ZExt_i64_i32_Pat<zext>;
+def : ZExt_i64_i32_Pat<anyext>;
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
+
+def : Pat <
+ (i64 (sext i32:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
+ (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+ (i64 (sext i1:$src)),
+ (INSERT_SUBREG
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)),
+ (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+ (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
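For the 32-to-64-bit extensions, sub1 is either a constant zero (zext/anyext) or the sign bit replicated by S_ASHR_I32 $src, 31. A scalar C++ sketch of the sext case (sext_i32_to_i64 is a hypothetical helper, assuming arithmetic right shift on int32_t):

#include <cstdint>

// sext i32 -> i64: low word is the source, high word is all 0s or
// all 1s depending on the sign bit.
static uint64_t sext_i32_to_i64(int32_t Src) {
  uint32_t Lo = static_cast<uint32_t>(Src);
  uint32_t Hi = static_cast<uint32_t>(Src >> 31); // 0x0 or 0xffffffff
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}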
+
+def : Pat <
+ (f32 (sint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+>;
+
+def : Pat <
+ (f32 (uint_to_fp i1:$src)),
+ (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+>;
+
+def : Pat <
+ (f64 (sint_to_fp i1:$src)),
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+ (f64 (uint_to_fp i1:$src)),
+ (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
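The i1-to-float patterns never perform a general int-to-fp conversion: V_CNDMASK just selects the right constant (an i1 true is -1 as a signed value, hence sint_to_fp yields -1.0). A one-line C++ model of each (hypothetical helpers):

// sint_to_fp i1: true is the signed value -1, so the result is -1.0.
static float sint_to_fp_i1(bool B) { return B ? -1.0f : 0.0f; }
// uint_to_fp i1: true is the unsigned value 1, so the result is 1.0.
static float uint_to_fp_i1(bool B) { return B ? 1.0f : 0.0f; }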
+
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 00e32c0..df690a4 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -56,11 +56,61 @@ let TargetPrefix = "SI", isTarget = 1 in {
class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+ // Fully-flexible SAMPLE instruction.
+ class SampleRaw : Intrinsic <
+ [llvm_v4f32_ty], // vdata(VGPR)
+ [llvm_anyint_ty, // vaddr(VGPR)
+ llvm_v32i8_ty, // rsrc(SGPR)
+ llvm_v16i8_ty, // sampler(SGPR)
+ llvm_i32_ty, // dmask(imm)
+ llvm_i32_ty, // unorm(imm)
+ llvm_i32_ty, // r128(imm)
+ llvm_i32_ty, // da(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty, // tfe(imm)
+ llvm_i32_ty], // lwe(imm)
+ [IntrNoMem]>;
+
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
def int_SI_sampled : Sample;
def int_SI_samplel : Sample;
+ // Basic gather4
+ def int_SI_gather4 : SampleRaw;
+ def int_SI_gather4_cl : SampleRaw;
+ def int_SI_gather4_l : SampleRaw;
+ def int_SI_gather4_b : SampleRaw;
+ def int_SI_gather4_b_cl : SampleRaw;
+ def int_SI_gather4_lz : SampleRaw;
+
+ // Gather4 with comparison
+ def int_SI_gather4_c : SampleRaw;
+ def int_SI_gather4_c_cl : SampleRaw;
+ def int_SI_gather4_c_l : SampleRaw;
+ def int_SI_gather4_c_b : SampleRaw;
+ def int_SI_gather4_c_b_cl : SampleRaw;
+ def int_SI_gather4_c_lz : SampleRaw;
+
+ // Gather4 with offsets
+ def int_SI_gather4_o : SampleRaw;
+ def int_SI_gather4_cl_o : SampleRaw;
+ def int_SI_gather4_l_o : SampleRaw;
+ def int_SI_gather4_b_o : SampleRaw;
+ def int_SI_gather4_b_cl_o : SampleRaw;
+ def int_SI_gather4_lz_o : SampleRaw;
+
+ // Gather4 with comparison and offsets
+ def int_SI_gather4_c_o : SampleRaw;
+ def int_SI_gather4_c_cl_o : SampleRaw;
+ def int_SI_gather4_c_l_o : SampleRaw;
+ def int_SI_gather4_c_b_o : SampleRaw;
+ def int_SI_gather4_c_b_cl_o : SampleRaw;
+ def int_SI_gather4_c_lz_o : SampleRaw;
+
+ def int_SI_getlod : SampleRaw;
+
def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 6601f2a..9f5ff29 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -86,6 +86,7 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
+ void InitM0ForLDS(MachineBasicBlock::iterator MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
@@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
+/// The m0 register stores the maximum allowable address for LDS reads and
+/// writes. Its value must be at least the size in bytes of LDS allocated by
+/// the shader. For simplicity, we set it to the maximum possible value.
+void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::M0).addImm(0xffffffff);
+}
+
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
@@ -333,52 +342,57 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
MBB.insert(I, MovRel);
- MI.eraseFromParent();
- return;
- }
+ } else {
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VReg_32RegClass.contains(Idx));
+ assert(AMDGPU::SReg_64RegClass.contains(Save));
+ assert(AMDGPU::VReg_32RegClass.contains(Idx));
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
+ // Save the EXEC mask
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ AMDGPU::VCC_LO)
+ .addReg(Idx);
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
+ // Move index from VCC into M0
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+ .addReg(AMDGPU::M0)
+ .addReg(Idx);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
- // Do the actual move
- MBB.insert(I, MovRel);
+ // Do the actual move
+ MBB.insert(I, MovRel);
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7)
- .addReg(AMDGPU::EXEC);
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addImm(-7)
+ .addReg(AMDGPU::EXEC);
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ // Restore EXEC
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
+ }
+ // FIXME: Are there any values other than the LDS address clamp that need to
+ // be stored in the m0 register and may be live for more than a few
+ // instructions? If so, we should save the m0 register at the beginning
+ // of this function and restore it here.
+ // FIXME: Add support for LDS direct loads.
+ InitM0ForLDS(&MI);
MI.eraseFromParent();
}
@@ -523,8 +537,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock &MBB = MF.front();
// Initialize M0 to a value that won't cause LDS access to be discarded
// due to offset clamping
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::M0).addImm(0xffffffff);
+ InitM0ForLDS(MBB.getFirstNonPHI());
}
if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index af60995..e2df950 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -62,8 +62,10 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
return VGPR;
}
}
- MF->getFunction()->getContext().emitError(
- "Could not found S_ENGPGM instrtuction.");
+
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("Could not find S_ENDPGM instruction.");
+
return VGPR;
}
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index c72d549..d0b677a 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -14,21 +14,20 @@
#include "SIRegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
-: AMDGPURegisterInfo(tm),
- TM(tm)
+SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
+: AMDGPURegisterInfo(st)
{ }
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
- const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@@ -38,15 +37,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return RC->getNumRegs();
}
-const TargetRegisterClass *
-SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
- switch (rc->getID()) {
- case AMDGPU::GPRF32RegClassID:
- return &AMDGPU::VReg_32RegClass;
- default: return rc;
- }
-}
-
const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
@@ -135,3 +125,19 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
unsigned Index = getHWRegIndex(Reg);
return SubRC->getRegister(Index + Channel);
}
+
+bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
+ switch (RCID) {
+ default: return false;
+ case AMDGPU::SSrc_32RegClassID:
+ case AMDGPU::SSrc_64RegClassID:
+ case AMDGPU::VSrc_32RegClassID:
+ case AMDGPU::VSrc_64RegClassID:
+ return true;
+ }
+}
+
+bool SIRegisterInfo::regClassCanUseImmediate(
+ const TargetRegisterClass *RC) const {
+ return regClassCanUseImmediate(RC->getID());
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 36b4fcd..c9305fb 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -20,24 +20,15 @@
namespace llvm {
-class AMDGPUTargetMachine;
-
struct SIRegisterInfo : public AMDGPURegisterInfo {
- AMDGPUTargetMachine &TM;
- SIRegisterInfo(AMDGPUTargetMachine &tm);
+ SIRegisterInfo(const AMDGPUSubtarget &st);
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- /// \param RC is an AMDIL reg class.
- ///
- /// \returns the SI register class that is equivalent to \p RC.
- const TargetRegisterClass *
- getISARegClass(const TargetRegisterClass *RC) const override;
-
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@@ -69,6 +60,14 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// \returns The sub-register of Reg that is in Channel.
unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
unsigned Channel) const;
+
+ /// \returns True if operands defined with this register class can accept
+ /// inline immediates.
+ bool regClassCanUseImmediate(int RCID) const;
+
+ /// \returns True if operands defined with this register class can accept
+ /// inline immediates.
+ bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index f1f01de..8974b63 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
(add SGPR_64Regs, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index a0b6907..367963a 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -119,8 +119,7 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
Type::getInt32Ty(I.getContext())){
Type *ElementTy = Arg->getType()->getVectorElementType();
std::string TypeName = "i32";
- InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
- assert(Def);
+ InsertElementInst *Def = cast<InsertElementInst>(Arg);
Args.push_back(Def->getOperand(1));
Types.push_back(ElementTy);
std::string VecTypeName = "v1" + TypeName;
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index da88820..9df0054 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -47,31 +47,27 @@ class SparcAsmParser : public MCTargetAsmParser {
// public interface of the MCTargetAsmParser.
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
// Custom parse functions for Sparc specific operands.
- OperandMatchResultTy
- parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
- OperandMatchResultTy
- parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Name);
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
OperandMatchResultTy
- parseSparcAsmOperand(SparcOperand *&Operand, bool isCall = false);
+ parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Operand,
+ bool isCall = false);
- OperandMatchResultTy
- parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ OperandMatchResultTy parseBranchModifiers(OperandVector &Operands);
  // Returns true if Tok matches a register name, placing the register in RegNo.
bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
@@ -153,8 +149,6 @@ private:
SMLoc StartLoc, EndLoc;
- SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
-
struct Token {
const char *Data;
unsigned Length;
@@ -182,6 +176,8 @@ private:
struct MemOp Mem;
};
public:
+ SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
bool isToken() const override { return Kind == k_Token; }
bool isReg() const override { return Kind == k_Register; }
bool isImm() const override { return Kind == k_Immediate; }
@@ -291,8 +287,8 @@ public:
addExpr(Inst, Expr);
}
- static SparcOperand *CreateToken(StringRef Str, SMLoc S) {
- SparcOperand *Op = new SparcOperand(k_Token);
+ static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
+ auto Op = make_unique<SparcOperand>(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -300,10 +296,9 @@ public:
return Op;
}
- static SparcOperand *CreateReg(unsigned RegNum,
- unsigned Kind,
- SMLoc S, SMLoc E) {
- SparcOperand *Op = new SparcOperand(k_Register);
+ static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
+ SMLoc S, SMLoc E) {
+ auto Op = make_unique<SparcOperand>(k_Register);
Op->Reg.RegNum = RegNum;
Op->Reg.Kind = (SparcOperand::RegisterKind)Kind;
Op->StartLoc = S;
@@ -311,49 +306,51 @@ public:
return Op;
}
- static SparcOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- SparcOperand *Op = new SparcOperand(k_Immediate);
+ static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S,
+ SMLoc E) {
+ auto Op = make_unique<SparcOperand>(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static SparcOperand *MorphToDoubleReg(SparcOperand *Op) {
- unsigned Reg = Op->getReg();
- assert(Op->Reg.Kind == rk_FloatReg);
+ static bool MorphToDoubleReg(SparcOperand &Op) {
+ unsigned Reg = Op.getReg();
+ assert(Op.Reg.Kind == rk_FloatReg);
unsigned regIdx = Reg - Sparc::F0;
if (regIdx % 2 || regIdx > 31)
- return nullptr;
- Op->Reg.RegNum = DoubleRegs[regIdx / 2];
- Op->Reg.Kind = rk_DoubleReg;
- return Op;
+ return false;
+ Op.Reg.RegNum = DoubleRegs[regIdx / 2];
+ Op.Reg.Kind = rk_DoubleReg;
+ return true;
}
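The index arithmetic encodes the usual SPARC pairing in which even single register %fN aliases double %d(N/2). A small C++ sketch of the mapping MorphToDoubleReg enforces (floatToDoubleIdx is a hypothetical helper, not parser code):

// Even indices pair up (F0->D0, F2->D1, ..., F30->D15); odd or
// out-of-range indices have no double alias, so the morph fails.
static bool floatToDoubleIdx(unsigned RegIdx, unsigned &DoubleIdx) {
  if (RegIdx % 2 || RegIdx > 31)
    return false;
  DoubleIdx = RegIdx / 2;
  return true;
}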
- static SparcOperand *MorphToQuadReg(SparcOperand *Op) {
- unsigned Reg = Op->getReg();
+ static bool MorphToQuadReg(SparcOperand &Op) {
+ unsigned Reg = Op.getReg();
unsigned regIdx = 0;
- switch (Op->Reg.Kind) {
- default: assert(0 && "Unexpected register kind!");
+ switch (Op.Reg.Kind) {
+ default: llvm_unreachable("Unexpected register kind!");
case rk_FloatReg:
regIdx = Reg - Sparc::F0;
if (regIdx % 4 || regIdx > 31)
- return nullptr;
+ return false;
Reg = QuadFPRegs[regIdx / 4];
break;
case rk_DoubleReg:
regIdx = Reg - Sparc::D0;
if (regIdx % 2 || regIdx > 31)
- return nullptr;
+ return false;
Reg = QuadFPRegs[regIdx / 2];
break;
}
- Op->Reg.RegNum = Reg;
- Op->Reg.Kind = rk_QuadReg;
- return Op;
+ Op.Reg.RegNum = Reg;
+ Op.Reg.Kind = rk_QuadReg;
+ return true;
}
- static SparcOperand *MorphToMEMrr(unsigned Base, SparcOperand *Op) {
+ static std::unique_ptr<SparcOperand>
+ MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
unsigned offsetReg = Op->getReg();
Op->Kind = k_MemoryReg;
Op->Mem.Base = Base;
@@ -362,10 +359,9 @@ public:
return Op;
}
- static SparcOperand *CreateMEMri(unsigned Base,
- const MCExpr *Off,
- SMLoc S, SMLoc E) {
- SparcOperand *Op = new SparcOperand(k_MemoryImm);
+ static std::unique_ptr<SparcOperand>
+ CreateMEMri(unsigned Base, const MCExpr *Off, SMLoc S, SMLoc E) {
+ auto Op = make_unique<SparcOperand>(k_MemoryImm);
Op->Mem.Base = Base;
Op->Mem.OffsetReg = 0;
Op->Mem.Off = Off;
@@ -374,7 +370,8 @@ public:
return Op;
}
- static SparcOperand *MorphToMEMri(unsigned Base, SparcOperand *Op) {
+ static std::unique_ptr<SparcOperand>
+ MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) {
const MCExpr *Imm = Op->getImm();
Op->Kind = k_MemoryImm;
Op->Mem.Base = Base;
@@ -386,11 +383,11 @@ public:
} // end namespace
-bool SparcAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
SmallVector<MCInst, 8> Instructions;
unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
@@ -415,7 +412,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((SparcOperand*) Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
@@ -450,11 +447,9 @@ ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
unsigned VariantID);
-bool SparcAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands)
-{
+bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
// First operand in MCInst is instruction mnemonic.
Operands.push_back(SparcOperand::CreateToken(Name, NameLoc));
@@ -548,9 +543,8 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
-SparcAsmParser::OperandMatchResultTy SparcAsmParser::
-parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands)
-{
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
SMLoc S, E;
unsigned BaseReg = 0;
@@ -575,23 +569,20 @@ parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands)
break;
}
- SparcOperand *Offset = nullptr;
+ std::unique_ptr<SparcOperand> Offset;
OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
if (ResTy != MatchOperand_Success || !Offset)
return MatchOperand_NoMatch;
- Offset = (Offset->isImm()
- ? SparcOperand::MorphToMEMri(BaseReg, Offset)
- : SparcOperand::MorphToMEMrr(BaseReg, Offset));
+ Operands.push_back(
+ Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset))
+ : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset)));
- Operands.push_back(Offset);
return MatchOperand_Success;
}
-SparcAsmParser::OperandMatchResultTy SparcAsmParser::
-parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic)
-{
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -637,21 +628,21 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return MatchOperand_Success;
}
- SparcOperand *Op = nullptr;
+ std::unique_ptr<SparcOperand> Op;
ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
if (ResTy != MatchOperand_Success || !Op)
return MatchOperand_ParseFail;
// Push the parsed operand into the list of operands
- Operands.push_back(Op);
+ Operands.push_back(std::move(Op));
return MatchOperand_Success;
}
SparcAsmParser::OperandMatchResultTy
-SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall)
-{
+SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
+ bool isCall) {
SMLoc S = Parser.getTok().getLoc();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
@@ -718,8 +709,8 @@ SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall)
return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
}
-SparcAsmParser::OperandMatchResultTy SparcAsmParser::
-parseBranchModifiers(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+SparcAsmParser::OperandMatchResultTy
+SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
// parse (,a|,pn|,pt)+
@@ -928,18 +919,14 @@ extern "C" void LLVMInitializeSparcAsmParser() {
#define GET_MATCHER_IMPLEMENTATION
#include "SparcGenAsmMatcher.inc"
-
-
-unsigned SparcAsmParser::
-validateTargetOperandClass(MCParsedAsmOperand *GOp,
- unsigned Kind)
-{
- SparcOperand *Op = (SparcOperand*)GOp;
- if (Op->isFloatOrDoubleReg()) {
+unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
+ unsigned Kind) {
+ SparcOperand &Op = (SparcOperand &)GOp;
+ if (Op.isFloatOrDoubleReg()) {
switch (Kind) {
default: break;
case MCK_DFPRegs:
- if (!Op->isFloatReg() || SparcOperand::MorphToDoubleReg(Op))
+ if (!Op.isFloatReg() || SparcOperand::MorphToDoubleReg(Op))
return MCTargetAsmParser::Match_Success;
break;
case MCK_QFPRegs:
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
index 261fb38..5975a51 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -173,6 +173,6 @@ void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum,
raw_ostream &O)
{
- assert(0 && "FIXME: Implement SparcInstPrinter::printGetPCX.");
+ llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX.");
return true;
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 7d517b6..dcd81e3 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -196,12 +196,12 @@ namespace {
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
// FIXME.
- assert(0 && "fixupNeedsRelaxation() unimplemented");
+ llvm_unreachable("fixupNeedsRelaxation() unimplemented");
return false;
}
void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
// FIXME.
- assert(0 && "relaxInstruction() unimplemented");
+ llvm_unreachable("relaxInstruction() unimplemented");
}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index b19ad7b..eea9626 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -133,7 +133,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
if (Expr->EvaluateAsAbsolute(Res))
return Res;
- assert(0 && "Unhandled expression!");
+ llvm_unreachable("Unhandled expression!");
return 0;
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index ae57fdc..7f01ab0 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Object/ELF.h"
@@ -124,7 +125,7 @@ SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
switch (Kind) {
- default: assert(0 && "Unhandled SparcMCExpr::VariantKind");
+ default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind");
case VK_Sparc_LO: return Sparc::fixup_sparc_lo10;
case VK_Sparc_HI: return Sparc::fixup_sparc_hi22;
case VK_Sparc_H44: return Sparc::fixup_sparc_h44;
@@ -219,35 +220,6 @@ void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
}
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-// FIXME: really do above: now that at least three other backends are using it.
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
- switch (Value->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle nested target expr!");
- break;
-
- case MCExpr::Constant:
- break;
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
- AddValueSymbolsImpl(BE->getLHS(), Asm);
- AddValueSymbolsImpl(BE->getRHS(), Asm);
- break;
- }
-
- case MCExpr::SymbolRef:
- Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
- break;
-
- case MCExpr::Unary:
- AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
- break;
- }
-}
-
-void SparcMCExpr::AddValueSymbols(MCAssembler *Asm) const {
- AddValueSymbolsImpl(getSubExpr(), Asm);
+void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 78dd945..f0d0ef3 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -88,7 +88,7 @@ public:
void PrintImpl(raw_ostream &OS) const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const override;
- void AddValueSymbols(MCAssembler *) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
const MCSection *FindAssociatedSection() const override {
return getSubExpr()->FindAssociatedSection();
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index a37da94..3cdfda3 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -14,6 +14,7 @@
#include "SparcFrameLowering.h"
#include "SparcInstrInfo.h"
#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -32,6 +33,9 @@ DisableLeafProc("disable-sparc-leaf-proc",
cl::desc("Disable Sparc leaf procedure optimization."),
cl::Hidden);
+SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+ ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}
void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
MachineBasicBlock &MBB,
@@ -99,7 +103,9 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
SAVEri = SP::ADDri;
SAVErr = SP::ADDrr;
}
- NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes);
+ NumBytes =
+ -MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
+ NumBytes);
emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
MachineModuleInfo &MMI = MF.getMMI();
@@ -162,7 +168,8 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
if (NumBytes == 0)
return;
- NumBytes = SubTarget.getAdjustedFrameSize(NumBytes);
+ NumBytes = MF.getTarget().getSubtarget<SparcSubtarget>().getAdjustedFrameSize(
+ NumBytes);
emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
}
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index bda7b7c..a7d1b89 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -15,19 +15,14 @@
#define SPARC_FRAMEINFO_H
#include "Sparc.h"
-#include "SparcSubtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
- class SparcSubtarget;
+class SparcSubtarget;
class SparcFrameLowering : public TargetFrameLowering {
- const SparcSubtarget &SubTarget;
public:
- explicit SparcFrameLowering(const SparcSubtarget &ST)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
- ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8),
- SubTarget(ST) {}
+ explicit SparcFrameLowering(const SparcSubtarget &ST);
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
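Moving the constructor out of line is what lets this header drop its SparcSubtarget.h include in favor of a forward declaration, which matters because SparcSubtarget.h is about to include SparcFrameLowering.h in turn. A minimal sketch of the idiom, hypothetical names throughout:

// frame_lowering.h: only a forward declaration is needed, because the
// constructor that inspects Subtarget is defined out of line.
struct Subtarget; // no #include "subtarget.h"

struct FrameLowering {
  int StackAlign;
  explicit FrameLowering(const Subtarget &ST); // defined in the .cpp
};

// frame_lowering.cpp: the full definition is visible only here.
struct Subtarget {
  bool Is64Bit;
};

FrameLowering::FrameLowering(const Subtarget &ST)
    : StackAlign(ST.Is64Bit ? 16 : 8) {}

// subtarget.h can now include frame_lowering.h and hold a FrameLowering
// member without creating a circular include.
int main() {
  Subtarget ST{true};
  FrameLowering FL(ST);
  return FL.StackAlign == 16 ? 0 : 1;
}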
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index ef61466..990f52a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2030,7 +2030,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
}
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
- .setCallee(CallingConv::C, RetTyABI, Callee, &Args, 0);
+ .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -2086,7 +2086,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain)
- .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
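These two hunks track an interface change elsewhere in the patch: setCallee now takes the argument list by value, so callers transfer ownership with std::move instead of passing &Args. A small sketch of the by-value builder style, toy types only:

#include <string>
#include <utility>
#include <vector>

struct Arg { std::string Name; };
using ArgList = std::vector<Arg>;

struct CallInfo {
  ArgList Args;
  // Taking the list by value plus std::move lets callers hand over the
  // buffer instead of lending a pointer the callee must not outlive.
  CallInfo &setArgs(ArgList A) {
    Args = std::move(A);
    return *this; // chainable, like the CLI builder above
  }
};

int main() {
  ArgList Args{{"x"}, {"y"}};
  CallInfo CI;
  CI.setArgs(std::move(Args)); // Args is left in a moved-from state
  return CI.Args.size() == 2 ? 0 : 1;
}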
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
index c775e9e..d0eec98 100644
--- a/lib/Target/Sparc/SparcJITInfo.cpp
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -213,7 +213,8 @@ extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
- assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction");
+ llvm_unreachable("FIXME: Implement SparcJITInfo::"
+ "replaceMachineCodeForFunction");
}
diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
index eb36d29..a308fc5 100644
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
+++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "SparcTargetMachine.h"
+#include "SparcSelectionDAGInfo.h"
using namespace llvm;
#define DEBUG_TYPE "sparc-selectiondag-info"
-SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
+SparcSelectionDAGInfo::SparcSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {
}
SparcSelectionDAGInfo::~SparcSelectionDAGInfo() {
diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h
index dcd4203..2346f41 100644
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.h
+++ b/lib/Target/Sparc/SparcSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class SparcTargetMachine;
class SparcSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit SparcSelectionDAGInfo(const SparcTargetMachine &TM);
+ explicit SparcSelectionDAGInfo(const DataLayout &DL);
~SparcSelectionDAGInfo();
};
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index e38fb02..eea0c8c 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -26,20 +26,44 @@ using namespace llvm;
void SparcSubtarget::anchor() { }
-SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64Bit) :
- SparcGenSubtargetInfo(TT, CPU, FS),
- IsV9(false),
- V8DeprecatedInsts(false),
- IsVIS(false),
- Is64Bit(is64Bit),
- HasHardQuad(false),
- UsePopc(false) {
+static std::string computeDataLayout(const SparcSubtarget &ST) {
+ // Sparc is big endian.
+ std::string Ret = "E-m:e";
+
+ // Some ABIs have 32-bit pointers.
+ if (!ST.is64Bit())
+ Ret += "-p:32:32";
+
+ // Alignments for 64 bit integers.
+ Ret += "-i64:64";
+
+ // On SparcV9, 128-bit floats are aligned to 128 bits; elsewhere only to 64.
+ // On SparcV9, registers can hold 64 or 32 bits; elsewhere only 32.
+ if (ST.is64Bit())
+ Ret += "-n32:64";
+ else
+ Ret += "-f128:64-n32";
+
+ if (ST.is64Bit())
+ Ret += "-S128";
+ else
+ Ret += "-S64";
+
+ return Ret;
+}
+
+SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ IsV9 = false;
+ V8DeprecatedInsts = false;
+ IsVIS = false;
+ HasHardQuad = false;
+ UsePopc = false;
// Determine default and user specified characteristics
std::string CPUName = CPU;
if (CPUName.empty())
- CPUName = (is64Bit) ? "v9" : "v8";
+ CPUName = (Is64Bit) ? "v9" : "v8";
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
@@ -47,8 +71,16 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
// Popc is a v9-only instruction.
if (!IsV9)
UsePopc = false;
+
+ return *this;
}
+SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, TargetMachine &TM,
+ bool is64Bit)
+ : SparcGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit),
+ DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
+ InstrInfo(*this), TLInfo(TM), TSInfo(DL), FrameLowering(*this) {}
int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
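initializeSubtargetDependencies exists for initializer-list sequencing: Is64Bit is set first, the helper parses the feature string and returns *this, and only then does computeDataLayout run, so DL is built from finalized flags before the members (such as TSInfo) that consume it. A compact sketch of the trick; Subtarget, HasFancyOps and the layout strings here are invented:

#include <iostream>
#include <string>

struct Subtarget {
  bool Is64Bit;
  bool HasFancyOps; // hypothetical feature flag, set by the helper below
  std::string DL;   // must be computed only after the flags are final

  // Stands in for ParseSubtargetFeatures plus defaulting; returns *this so
  // it can run inside the constructor's initializer list, before DL.
  Subtarget &initializeSubtargetDependencies(const std::string &FS) {
    HasFancyOps = (FS == "+fancy");
    return *this;
  }

  static std::string computeDataLayout(const Subtarget &ST) {
    return std::string("layout:") + (ST.Is64Bit ? "64" : "32") +
           (ST.HasFancyOps ? "+fancy" : "");
  }

  Subtarget(bool Is64, const std::string &FS)
      // Members initialize in declaration order: Is64Bit first, then the
      // helper fills the remaining flags as a side effect of DL's init.
      : Is64Bit(Is64),
        DL(computeDataLayout(initializeSubtargetDependencies(FS))) {}
};

int main() {
  Subtarget ST(true, "+fancy");
  std::cout << ST.DL << "\n"; // layout:64+fancy
}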
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 4025622..a335778 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -14,6 +14,13 @@
#ifndef SPARC_SUBTARGET_H
#define SPARC_SUBTARGET_H
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcISelLowering.h"
+#include "SparcJITInfo.h"
+#include "SparcSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -31,10 +38,26 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool Is64Bit;
bool HasHardQuad;
bool UsePopc;
+ const DataLayout DL; // Calculates type size & alignment
+ SparcInstrInfo InstrInfo;
+ SparcTargetLowering TLInfo;
+ SparcSelectionDAGInfo TSInfo;
+ SparcFrameLowering FrameLowering;
+ SparcJITInfo JITInfo;
public:
SparcSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool is64bit);
+ const std::string &FS, TargetMachine &TM, bool is64bit);
+
+ const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const SparcRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const SparcTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const SparcSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ SparcJITInfo *getJITInfo() { return &JITInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
@@ -47,6 +70,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
bool is64Bit() const { return Is64Bit; }
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 2469d93..0130fac 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -23,32 +23,6 @@ extern "C" void LLVMInitializeSparcTarget() {
RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
}
-static std::string computeDataLayout(const SparcSubtarget &ST) {
- // Sparc is big endian.
- std::string Ret = "E-m:e";
-
- // Some ABIs have 32bit pointers.
- if (!ST.is64Bit())
- Ret += "-p:32:32";
-
- // Alignments for 64 bit integers.
- Ret += "-i64:64";
-
- // On SparcV9 128 floats are aligned to 128 bits, on others only to 64.
- // On SparcV9 registers can hold 64 or 32 bits, on others only 32.
- if (ST.is64Bit())
- Ret += "-n32:64";
- else
- Ret += "-f128:64-n32";
-
- if (ST.is64Bit())
- Ret += "-S128";
- else
- Ret += "-S64";
-
- return Ret;
-}
-
/// SparcTargetMachine ctor - Create an ILP32 architecture model
///
SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
@@ -58,11 +32,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, is64bit),
- DL(computeDataLayout(Subtarget)),
- InstrInfo(Subtarget),
- TLInfo(*this), TSInfo(*this),
- FrameLowering(Subtarget) {
+ Subtarget(TT, CPU, FS, *this, is64bit) {
initAsmInfo();
}
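With computeDataLayout moved into SparcSubtarget.cpp, the target machine no longer derives the layout itself. For reference, the function only ever produces two strings; a freestanding re-run of the same logic (a copy for illustration, not a call into LLVM):

#include <iostream>
#include <string>

// Same logic as the moved computeDataLayout, parameterized on is64Bit only.
static std::string computeDataLayout(bool Is64Bit) {
  std::string Ret = "E-m:e";          // Sparc is big endian
  if (!Is64Bit)
    Ret += "-p:32:32";                // 32-bit pointers on 32-bit ABIs
  Ret += "-i64:64";                   // 64-bit integer alignment
  Ret += Is64Bit ? "-n32:64" : "-f128:64-n32";
  Ret += Is64Bit ? "-S128" : "-S64";  // stack alignment
  return Ret;
}

int main() {
  std::cout << computeDataLayout(false) << "\n";
  // E-m:e-p:32:32-i64:64-f128:64-n32-S64
  std::cout << computeDataLayout(true) << "\n";
  // E-m:e-i64:64-n32:64-S128
}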
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 7d04338..03b5137 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -14,50 +14,40 @@
#ifndef SPARCTARGETMACHINE_H
#define SPARCTARGETMACHINE_H
-#include "SparcFrameLowering.h"
-#include "SparcISelLowering.h"
#include "SparcInstrInfo.h"
-#include "SparcJITInfo.h"
-#include "SparcSelectionDAGInfo.h"
#include "SparcSubtarget.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
class SparcTargetMachine : public LLVMTargetMachine {
SparcSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- SparcInstrInfo InstrInfo;
- SparcTargetLowering TLInfo;
- SparcSelectionDAGInfo TSInfo;
- SparcFrameLowering FrameLowering;
- SparcJITInfo JITInfo;
public:
SparcTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool is64bit);
- const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ const SparcInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const TargetFrameLowering *getFrameLowering() const override {
+ return getSubtargetImpl()->getFrameLowering();
}
const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; }
const SparcRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
}
- const SparcTargetLowering* getTargetLowering() const override {
- return &TLInfo;
+ const SparcTargetLowering *getTargetLowering() const override {
+ return getSubtargetImpl()->getTargetLowering();
}
- const SparcSelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
- SparcJITInfo *getJITInfo() override {
- return &JITInfo;
+ SparcJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
- const DataLayout *getDataLayout() const override { return &DL; }
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
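After this header change the Sparc target machine owns nothing but the subtarget, and every accessor forwards to it. A toy sketch of the delegation shape, hypothetical names:

#include <iostream>

struct InstrInfo { const char *Name = "sparc-instrs"; };

struct Subtarget {
  InstrInfo II;
  const InstrInfo *getInstrInfo() const { return &II; }
};

struct TargetMachine {
  Subtarget ST; // the only state left in the target machine
  const Subtarget *getSubtargetImpl() const { return &ST; }
  // Accessors forward instead of owning duplicate objects.
  const InstrInfo *getInstrInfo() const {
    return getSubtargetImpl()->getInstrInfo();
  }
};

int main() {
  TargetMachine TM;
  std::cout << TM.getInstrInfo()->Name << "\n";
}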
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 71de64f..758be41 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -104,10 +104,6 @@ private:
MemOp Mem;
};
- SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
- : Kind(kind), StartLoc(startLoc), EndLoc(endLoc)
- {}
-
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible. Null MCExpr = 0.
if (!Expr)
@@ -119,40 +115,44 @@ private:
}
public:
+ SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
+ : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {}
+
// Create particular kinds of operand.
- static SystemZOperand *createInvalid(SMLoc StartLoc, SMLoc EndLoc) {
- return new SystemZOperand(KindInvalid, StartLoc, EndLoc);
+ static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc,
+ SMLoc EndLoc) {
+ return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
}
- static SystemZOperand *createToken(StringRef Str, SMLoc Loc) {
- SystemZOperand *Op = new SystemZOperand(KindToken, Loc, Loc);
+ static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
+ auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
Op->Token.Data = Str.data();
Op->Token.Length = Str.size();
return Op;
}
- static SystemZOperand *createReg(RegisterKind Kind, unsigned Num,
- SMLoc StartLoc, SMLoc EndLoc) {
- SystemZOperand *Op = new SystemZOperand(KindReg, StartLoc, EndLoc);
+ static std::unique_ptr<SystemZOperand>
+ createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
Op->Reg.Kind = Kind;
Op->Reg.Num = Num;
return Op;
}
- static SystemZOperand *createAccessReg(unsigned Num, SMLoc StartLoc,
- SMLoc EndLoc) {
- SystemZOperand *Op = new SystemZOperand(KindAccessReg, StartLoc, EndLoc);
+ static std::unique_ptr<SystemZOperand>
+ createAccessReg(unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindAccessReg, StartLoc, EndLoc);
Op->AccessReg = Num;
return Op;
}
- static SystemZOperand *createImm(const MCExpr *Expr, SMLoc StartLoc,
- SMLoc EndLoc) {
- SystemZOperand *Op = new SystemZOperand(KindImm, StartLoc, EndLoc);
+ static std::unique_ptr<SystemZOperand>
+ createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
Op->Imm = Expr;
return Op;
}
- static SystemZOperand *createMem(RegisterKind RegKind, unsigned Base,
- const MCExpr *Disp, unsigned Index,
- const MCExpr *Length, SMLoc StartLoc,
- SMLoc EndLoc) {
- SystemZOperand *Op = new SystemZOperand(KindMem, StartLoc, EndLoc);
+ static std::unique_ptr<SystemZOperand>
+ createMem(RegisterKind RegKind, unsigned Base, const MCExpr *Disp,
+ unsigned Index, const MCExpr *Length, SMLoc StartLoc,
+ SMLoc EndLoc) {
+ auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
Op->Mem.RegKind = RegKind;
Op->Mem.Base = Base;
Op->Mem.Index = Index;
@@ -313,21 +313,19 @@ private:
bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
bool IsAddress = false);
- OperandMatchResultTy
- parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- RegisterGroup Group, const unsigned *Regs, RegisterKind Kind);
+ OperandMatchResultTy parseRegister(OperandVector &Operands,
+ RegisterGroup Group, const unsigned *Regs,
+ RegisterKind Kind);
bool parseAddress(unsigned &Base, const MCExpr *&Disp,
unsigned &Index, const MCExpr *&Length,
const unsigned *Regs, RegisterKind RegKind);
- OperandMatchResultTy
- parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const unsigned *Regs, RegisterKind RegKind,
- MemoryKind MemKind);
+ OperandMatchResultTy parseAddress(OperandVector &Operands,
+ const unsigned *Regs, RegisterKind RegKind,
+ MemoryKind MemKind);
- bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic);
+ bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
public:
SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
@@ -343,87 +341,66 @@ public:
// Override MCTargetAsmParser.
bool ParseDirective(AsmToken DirectiveID) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- bool ParseInstruction(ParseInstructionInfo &Info,
- StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands)
- override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
// Used by the TableGen code to parse particular operand types.
- OperandMatchResultTy
- parseGR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseGR32(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
}
- OperandMatchResultTy
- parseGRH32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseGRH32(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
}
- OperandMatchResultTy
- parseGRX32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseGRX32(OperandVector &Operands) {
llvm_unreachable("GRX32 should only be used for pseudo instructions");
}
- OperandMatchResultTy
- parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseGR64(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
}
- OperandMatchResultTy
- parseGR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseGR128(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
}
- OperandMatchResultTy
- parseADDR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseADDR32(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
}
- OperandMatchResultTy
- parseADDR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseADDR64(OperandVector &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
}
- OperandMatchResultTy
- parseADDR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseADDR128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
- OperandMatchResultTy
- parseFP32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseFP32(OperandVector &Operands) {
return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
}
- OperandMatchResultTy
- parseFP64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseFP64(OperandVector &Operands) {
return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
}
- OperandMatchResultTy
- parseFP128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseFP128(OperandVector &Operands) {
return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
}
- OperandMatchResultTy
- parseBDAddr32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseBDAddr32(OperandVector &Operands) {
return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem);
}
- OperandMatchResultTy
- parseBDAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseBDAddr64(OperandVector &Operands) {
return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem);
}
- OperandMatchResultTy
- parseBDXAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) {
return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem);
}
- OperandMatchResultTy
- parseBDLAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) {
return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem);
}
- OperandMatchResultTy
- parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- OperandMatchResultTy
- parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- int64_t MinVal, int64_t MaxVal);
- OperandMatchResultTy
- parsePCRel16(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parseAccessReg(OperandVector &Operands);
+ OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
+ int64_t MaxVal);
+ OperandMatchResultTy parsePCRel16(OperandVector &Operands) {
return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1);
}
- OperandMatchResultTy
- parsePCRel32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ OperandMatchResultTy parsePCRel32(OperandVector &Operands) {
return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
}
};
@@ -497,9 +474,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
// Parse a register and add it to Operands. The other arguments are as above.
SystemZAsmParser::OperandMatchResultTy
-SystemZAsmParser::parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- RegisterGroup Group, const unsigned *Regs,
- RegisterKind Kind) {
+SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group,
+ const unsigned *Regs, RegisterKind Kind) {
if (Parser.getTok().isNot(AsmToken::Percent))
return MatchOperand_NoMatch;
@@ -566,9 +542,8 @@ bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp,
// Parse a memory operand and add it to Operands. The other arguments
// are as above.
SystemZAsmParser::OperandMatchResultTy
-SystemZAsmParser::parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const unsigned *Regs, RegisterKind RegKind,
- MemoryKind MemKind) {
+SystemZAsmParser::parseAddress(OperandVector &Operands, const unsigned *Regs,
+ RegisterKind RegKind, MemoryKind MemKind) {
SMLoc StartLoc = Parser.getTok().getLoc();
unsigned Base, Index;
const MCExpr *Disp;
@@ -622,9 +597,9 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return false;
}
-bool SystemZAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name, SMLoc NameLoc,
+ OperandVector &Operands) {
Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
// Read the remaining operands.
@@ -655,9 +630,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
return false;
}
-bool SystemZAsmParser::
-parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- StringRef Mnemonic) {
+bool SystemZAsmParser::parseOperand(OperandVector &Operands,
+ StringRef Mnemonic) {
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -700,11 +674,11 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return false;
}
-bool SystemZAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
unsigned MatchResult;
@@ -739,7 +713,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((SystemZOperand*)Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
@@ -753,8 +727,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
llvm_unreachable("Unexpected match type");
}
-SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
-parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseAccessReg(OperandVector &Operands) {
if (Parser.getTok().isNot(AsmToken::Percent))
return MatchOperand_NoMatch;
@@ -768,9 +742,9 @@ parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
-parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- int64_t MinVal, int64_t MaxVal) {
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
+ int64_t MaxVal) {
MCContext &Ctx = getContext();
MCStreamer &Out = getStreamer();
const MCExpr *Expr;
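The asm-parser hunks swap raw new for factories returning std::unique_ptr, with OperandVector (a vector of owned operands) replacing SmallVectorImpl<MCParsedAsmOperand*>; the unqualified make_unique in the diff is LLVM's own helper, as this predates C++14. A self-contained sketch of the ownership pattern in modern C++, toy Operand type:

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Operand {
  enum Kind { KindToken, KindReg };
  Kind K;
  std::string Text;    // valid when K == KindToken
  unsigned RegNum = 0; // valid when K == KindReg

  explicit Operand(Kind K) : K(K) {}

  // Factories hand back owning pointers, so a failed parse can simply
  // return and every operand created so far is freed automatically.
  static std::unique_ptr<Operand> createToken(std::string S) {
    auto Op = std::make_unique<Operand>(KindToken);
    Op->Text = std::move(S);
    return Op;
  }
  static std::unique_ptr<Operand> createReg(unsigned Num) {
    auto Op = std::make_unique<Operand>(KindReg);
    Op->RegNum = Num;
    return Op;
  }
};

// Stands in for OperandVector: the vector owns the operands.
using OperandVector = std::vector<std::unique_ptr<Operand>>;

int main() {
  OperandVector Ops;
  Ops.push_back(Operand::createToken("lr"));
  Ops.push_back(Operand::createReg(1));
  std::cout << Ops.size() << " operands, first: " << Ops[0]->Text << "\n";
} // unique_ptr frees everything; no manual delete loop as with raw pointers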
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index c4f641e..fb0d1d8 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -13,7 +13,7 @@ class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
//===----------------------------------------------------------------------===//
-// SVR4 return value calling convention
+// z/Linux return value calling convention
//===----------------------------------------------------------------------===//
def RetCC_SystemZ : CallingConv<[
// Promote i32 to i64 if it has an explicit extension type.
@@ -39,7 +39,7 @@ def RetCC_SystemZ : CallingConv<[
]>;
//===----------------------------------------------------------------------===//
-// SVR4 argument calling conventions
+// z/Linux argument calling conventions
//===----------------------------------------------------------------------===//
def CC_SystemZ : CallingConv<[
// Promote i32 to i64 if it has an explicit extension type.
@@ -63,3 +63,9 @@ def CC_SystemZ : CallingConv<[
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux callee-saved registers
+//===----------------------------------------------------------------------===//
+def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
+ (sequence "F%dD", 8, 15))>;
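CSR_SystemZ is the TableGen half of a wider change in this patch: rather than listing every call-clobbered register as a Def on call instructions (see the shrunken Defs list in SystemZInstrInfo.td further down), calls carry a register-mask operand derived from the call-preserved set, R6-R15 and F8-F15 here. A toy sketch of a preserved set as a bitmask, with an invented register numbering:

#include <cstdint>
#include <iostream>

// Invented numbering: GPRs r0..r15 occupy bits 0..15 of the mask.
static std::uint32_t preservedMask() {
  std::uint32_t Mask = 0;
  for (int R = 6; R <= 15; ++R) // r6-r15 callee-saved, echoing CSR_SystemZ
    Mask |= 1u << R;
  return Mask;
}

static bool isPreservedAcrossCall(int Reg) {
  return (preservedMask() >> Reg) & 1u;
}

int main() {
  std::cout << std::boolalpha
            << isPreservedAcrossCall(2) << '\n'  // false: argument register
            << isPreservedAcrossCall(8) << '\n'; // true: callee-saved
}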
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 65f3caf..055dbe9 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -10,8 +10,9 @@
#include "SystemZFrameLowering.h"
#include "SystemZCallingConv.h"
#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
#include "SystemZMachineFunctionInfo.h"
-#include "SystemZTargetMachine.h"
+#include "SystemZRegisterInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -44,11 +45,9 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
};
} // end anonymous namespace
-SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
- const SystemZSubtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
- -SystemZMC::CallFrameSize, 8),
- TM(tm), STI(sti) {
+SystemZFrameLowering::SystemZFrameLowering()
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+ -SystemZMC::CallFrameSize, 8) {
// Create a mapping from register number to save slot offset.
RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
@@ -108,9 +107,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// instruction, or an implicit one that comes between the explicit start
// and end registers.
static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
- const SystemZTargetMachine &TM,
unsigned GPR64, bool IsImplicit) {
- const SystemZRegisterInfo *RI = TM.getRegisterInfo();
+ const TargetRegisterInfo *RI = MBB.getParent()->getTarget().getRegisterInfo();
unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
if (!IsLive || !IsImplicit) {
@@ -176,8 +174,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
// Add the explicit register operands.
- addSavedGPR(MBB, MIB, TM, LowGPR, false);
- addSavedGPR(MBB, MIB, TM, HighGPR, false);
+ addSavedGPR(MBB, MIB, LowGPR, false);
+ addSavedGPR(MBB, MIB, HighGPR, false);
// Add the address.
MIB.addReg(SystemZ::R15D).addImm(StartOffset);
@@ -187,13 +185,13 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
unsigned Reg = CSI[I].getReg();
if (SystemZ::GR64BitRegClass.contains(Reg))
- addSavedGPR(MBB, MIB, TM, Reg, true);
+ addSavedGPR(MBB, MIB, Reg, true);
}
// ...likewise GPR varargs.
if (IsVarArg)
for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
- addSavedGPR(MBB, MIB, TM, SystemZ::ArgGPRs[I], true);
+ addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
}
// Save FPRs in the normal TargetInstrInfo way.
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 70e25fb..4d5fe6d 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -10,7 +10,6 @@
#ifndef SYSTEMZFRAMELOWERING_H
#define SYSTEMZFRAMELOWERING_H
-#include "SystemZSubtarget.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -21,13 +20,8 @@ class SystemZSubtarget;
class SystemZFrameLowering : public TargetFrameLowering {
IndexedMap<unsigned> RegSpillOffsets;
-protected:
- const SystemZTargetMachine &TM;
- const SystemZSubtarget &STI;
-
public:
- SystemZFrameLowering(const SystemZTargetMachine &tm,
- const SystemZSubtarget &sti);
+ SystemZFrameLowering();
// Override TargetFrameLowering.
bool isFPCloseToIncomingSP() const override { return false; }
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6fe1fb9..00c65f5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -80,9 +80,9 @@ static MachineOperand earlyUseOperand(MachineOperand Op) {
return Op;
}
-SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
- : TargetLowering(tm, new TargetLoweringObjectFileELF()),
- Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
+ : TargetLowering(tm, new TargetLoweringObjectFileELF()),
+ Subtarget(tm.getSubtarget<SystemZSubtarget>()) {
MVT PtrVT = getPointerTy();
// Set up the register classes.
@@ -673,11 +673,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
MachineRegisterInfo &MRI = MF.getRegInfo();
SystemZMachineFunctionInfo *FuncInfo =
MF.getInfo<SystemZMachineFunctionInfo>();
- auto *TFL = static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
+ auto *TFL = static_cast<const SystemZFrameLowering *>(
+ DAG.getTarget().getFrameLowering());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
unsigned NumFixedGPRs = 0;
@@ -815,7 +817,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
+ *DAG.getContext());
ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
// We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -911,6 +914,12 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(DAG.getRegister(RegsToPass[I].first,
RegsToPass[I].second.getValueType()));
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
// Glue the call to the argument copies, if any.
if (Glue.getNode())
Ops.push_back(Glue);
@@ -931,7 +940,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RetLocs;
- CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+ CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
+ *DAG.getContext());
RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
// Copy all of the result registers out of their specified physreg.
@@ -962,7 +972,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
// Assign locations to each returned value.
SmallVector<CCValAssign, 16> RetLocs;
- CCState RetCCInfo(CallConv, IsVarArg, MF, TM, RetLocs, *DAG.getContext());
+ CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
+ *DAG.getContext());
RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
// Quick exit for void returns
@@ -1786,8 +1797,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
const GlobalValue *GV = Node->getGlobal();
int64_t Offset = Node->getOffset();
EVT PtrVT = getPointerTy();
- Reloc::Model RM = TM.getRelocationModel();
- CodeModel::Model CM = TM.getCodeModel();
+ Reloc::Model RM = DAG.getTarget().getRelocationModel();
+ CodeModel::Model CM = DAG.getTarget().getCodeModel();
SDValue Result;
if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
@@ -1824,7 +1835,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
EVT PtrVT = getPointerTy();
- TLSModel::Model model = TM.getTLSModel(GV);
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
if (model != TLSModel::LocalExec)
llvm_unreachable("only local-exec TLS mode supported");
@@ -2287,9 +2298,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
// Use an addition if the operand is constant and either LAA(G) is
// available or the negative value is in the range of A(G)FHI.
int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
- if (isInt<32>(Value) || TM.getSubtargetImpl()->hasInterlockedAccess1())
+ if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
NegSrc2 = DAG.getConstant(Value, MemVT);
- } else if (TM.getSubtargetImpl()->hasInterlockedAccess1())
+ } else if (Subtarget.hasInterlockedAccess1())
// Use LAA(G) if available.
NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, MemVT),
Src2);
@@ -2602,7 +2613,8 @@ static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr *MI,
MachineBasicBlock *MBB) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
+ const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
+ MBB->getParent()->getTarget().getInstrInfo());
unsigned DestReg = MI->getOperand(0).getReg();
unsigned TrueReg = MI->getOperand(1).getReg();
@@ -2650,7 +2662,8 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
MachineBasicBlock *MBB,
unsigned StoreOpcode, unsigned STOCOpcode,
bool Invert) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
+ const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
+ MBB->getParent()->getTarget().getInstrInfo());
unsigned SrcReg = MI->getOperand(0).getReg();
MachineOperand Base = MI->getOperand(1);
@@ -2665,7 +2678,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
// Use STOCOpcode if possible. We could use different store patterns in
// order to avoid matching the index register, but the performance trade-offs
// might be more complicated in that case.
- if (STOCOpcode && !IndexReg && TM.getSubtargetImpl()->hasLoadStoreOnCond()) {
+ if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
if (Invert)
CCMask ^= CCValid;
BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
@@ -2717,8 +2730,9 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
unsigned BinOpcode,
unsigned BitSize,
bool Invert) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
bool IsSubWord = (BitSize < 32);
@@ -2840,8 +2854,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
unsigned CompareOpcode,
unsigned KeepOldMask,
unsigned BitSize) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
bool IsSubWord = (BitSize < 32);
@@ -2951,8 +2966,9 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
MachineBasicBlock *
SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
MachineBasicBlock *MBB) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
// Extract the operands. Base can be a register or a frame index.
@@ -3067,8 +3083,9 @@ MachineBasicBlock *
SystemZTargetLowering::emitExt128(MachineInstr *MI,
MachineBasicBlock *MBB,
bool ClearEven, unsigned SubReg) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -3098,8 +3115,9 @@ MachineBasicBlock *
SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
MachineBasicBlock *MBB,
unsigned Opcode) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -3267,8 +3285,9 @@ MachineBasicBlock *
SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
MachineBasicBlock *MBB,
unsigned Opcode) const {
- const SystemZInstrInfo *TII = TM.getInstrInfo();
MachineFunction &MF = *MBB->getParent();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI->getDebugLoc();
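A recurring edit in this file: the lowering class no longer caches a SystemZTargetMachine reference; each custom inserter re-derives what it needs from the machine function it is handed, and the DAG paths use DAG.getTarget(). A small sketch of looking state up through the function rather than storing it at construction, toy types:

#include <iostream>

struct Target { int PointerSize; };

struct MachineFunction {
  Target T;
  const Target &getTarget() const { return T; }
};

// The lowering object keeps no Target member; every hook re-derives it
// from the function it operates on, so one lowering instance can serve
// functions built for different target configurations.
struct Lowering {
  int pointerSize(const MachineFunction &MF) const {
    return MF.getTarget().PointerSize;
  }
};

int main() {
  MachineFunction MF32{{4}}, MF64{{8}};
  Lowering L;
  std::cout << L.pointerSize(MF32) << " " << L.pointerSize(MF64) << "\n";
}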
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index bceb25e..e21b050 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -198,7 +198,7 @@ class SystemZTargetMachine;
class SystemZTargetLowering : public TargetLowering {
public:
- explicit SystemZTargetLowering(SystemZTargetMachine &TM);
+ explicit SystemZTargetLowering(const TargetMachine &TM);
// Override TargetLowering.
MVT getScalarShiftAmountTy(EVT LHSTy) const override {
@@ -249,7 +249,6 @@ public:
private:
const SystemZSubtarget &Subtarget;
- const SystemZTargetMachine &TM;
// Implement LowerOperation for individual opcodes.
SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index a1e782c..e8841e1 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -133,6 +133,13 @@ def LEDBR : UnaryRRE<"ledb", 0xB344, fround, FP32, FP64>;
def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
+def LEDBRA : UnaryRRF4<"ledbra", 0xB344, FP32, FP64>,
+ Requires<[FeatureFPExtension]>;
+def LEXBRA : UnaryRRF4<"lexbra", 0xB346, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+def LDXBRA : UnaryRRF4<"ldxbra", 0xB345, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+
def : Pat<(f32 (fround FP128:$src)),
(EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
def : Pat<(f64 (fround FP128:$src)),
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index add675a..9f59a1c 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -511,34 +511,24 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
// to store. Other stored registers are added as implicit uses.
//
// Unary:
-// One register output operand and one input operand. The input
-// operand may be a register, immediate or memory.
+// One register output operand and one input operand.
//
// Binary:
-// One register output operand and two input operands. The first
-// input operand is always a register and the second may be a register,
-// immediate or memory.
-//
-// Shift:
-// One register output operand and two input operands. The first
-// input operand is a register and the second has the same form as
-// an address (although it isn't actually used to address memory).
+// One register output operand and two input operands.
//
// Compare:
-// Two input operands. The first operand is always a register,
-// the second may be a register, immediate or memory.
+// Two input operands and an implicit CC output operand.
//
// Ternary:
-// One register output operand and three register input operands.
+// One register output operand and three input operands.
//
// LoadAndOp:
-// One output operand and two input operands. The first input operand
-// is a register and the second is an address.
+// One output operand and two input operands, one of which is an address.
+// The instruction both reads from and writes to the address.
//
// CmpSwap:
-// One output operand and three input operands. The first two
-// operands are registers and the third is an address. The instruction
-// both reads from and writes to the address.
+// One output operand and three input operands, one of which is an address.
+// The instruction both reads from and writes to the address.
//
// RotateSelect:
// One output operand and five input operands. The first two operands
@@ -691,7 +681,7 @@ class CondStoreRSY<string mnemonic, bits<16> opcode,
class AsmCondStoreRSY<string mnemonic, bits<16> opcode,
RegisterOperand cls, bits<5> bytes,
AddressingMode mode = bdaddr20only>
- : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, uimm8zx4:$R3),
+ : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, imm32zx4:$R3),
mnemonic#"\t$R1, $BD2, $R3", []>,
Requires<[FeatureLoadStoreOnCond]> {
let mayStore = 1;
@@ -730,7 +720,7 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
- : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2),
+ : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2),
mnemonic#"r\t$R1, $R3, $R2", []> {
let OpKey = mnemonic ## cls1;
let OpType = "reg";
@@ -739,7 +729,7 @@ class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
- : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2, uimm8zx4:$R4),
+ : InstRRF<opcode, (outs cls1:$R1), (ins imm32zx4:$R3, cls2:$R2, imm32zx4:$R4),
mnemonic#"\t$R1, $R3, $R2, $R4", []>;
// These instructions are generated by if conversion. The old value of R1
@@ -757,7 +747,7 @@ class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
// mask is the third operand rather than being part of the mnemonic.
class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
- : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, uimm8zx4:$R3),
+ : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, imm32zx4:$R3),
mnemonic#"r\t$R1, $R2, $R3", []>,
Requires<[FeatureLoadStoreOnCond]> {
let Constraints = "$R1 = $R1src";
@@ -823,7 +813,7 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode,
class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
RegisterOperand cls, bits<5> bytes,
AddressingMode mode = bdaddr20only>
- : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, uimm8zx4:$R3),
+ : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, imm32zx4:$R3),
mnemonic#"\t$R1, $BD2, $R3", []>,
Requires<[FeatureLoadStoreOnCond]> {
let mayLoad = 1;
@@ -993,6 +983,33 @@ class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
let DisableEncoding = "$R1src";
}
+class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2),
+ mnemonic#"\t$R1, $BD2",
+ [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
+ let R3 = 0;
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls>
+ : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2",
+ [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
+
+multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+ SDPatternOperator operator, RegisterOperand cls> {
+ let NumOpsKey = mnemonic in {
+ let NumOpsValue = "3" in
+ def K : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+ Requires<[FeatureDistinctOps]>;
+ let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
+ }
+}
+
class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr12only>
@@ -1077,33 +1094,6 @@ multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
}
}
-class ShiftRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
- RegisterOperand cls>
- : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2),
- mnemonic#"\t$R1, $BD2",
- [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
- let R3 = 0;
- let Constraints = "$R1 = $R1src";
- let DisableEncoding = "$R1src";
-}
-
-class ShiftRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- RegisterOperand cls>
- : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2),
- mnemonic#"\t$R1, $R3, $BD2",
- [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
-
-multiclass ShiftRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
- SDPatternOperator operator, RegisterOperand cls> {
- let NumOpsKey = mnemonic in {
- let NumOpsValue = "3" in
- def K : ShiftRSY<mnemonic##"k", opcode2, null_frag, cls>,
- Requires<[FeatureDistinctOps]>;
- let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
- def "" : ShiftRS<mnemonic, opcode1, operator, cls>;
- }
-}
-
class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
@@ -1315,22 +1305,23 @@ multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRIEf<opcode, (outs cls1:$R1),
- (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5),
mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
- : InstRXY<opcode, (outs), (ins uimm8zx4:$R1, bdxaddr20only:$XBD2),
+ : InstRXY<opcode, (outs), (ins imm32zx4:$R1, bdxaddr20only:$XBD2),
mnemonic##"\t$R1, $XBD2",
- [(operator uimm8zx4:$R1, bdxaddr20only:$XBD2)]>;
+ [(operator imm32zx4:$R1, bdxaddr20only:$XBD2)]>;
class PrefetchRILPC<string mnemonic, bits<12> opcode,
SDPatternOperator operator>
- : InstRIL<opcode, (outs), (ins uimm8zx4:$R1, pcrel32:$I2),
+ : InstRIL<opcode, (outs), (ins imm32zx4:$R1, pcrel32:$I2),
mnemonic##"\t$R1, $I2",
- [(operator uimm8zx4:$R1, pcrel32:$I2)]> {
+ [(operator imm32zx4:$R1, pcrel32:$I2)]> {
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
// However, BDXs have two extra operands and are therefore 6 units more
// complex.
@@ -1450,7 +1441,8 @@ class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
// of registers.
class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
: Pseudo<(outs cls1:$R1),
- (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5),
[]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
@@ -1460,9 +1452,9 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
// the value of the PSW's 2-bit condition code field.
class SelectWrapper<RegisterOperand cls>
: Pseudo<(outs cls:$dst),
- (ins cls:$src1, cls:$src2, uimm8zx4:$valid, uimm8zx4:$cc),
+ (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
[(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
- uimm8zx4:$valid, uimm8zx4:$cc))]> {
+ imm32zx4:$valid, imm32zx4:$cc))]> {
let usesCustomInserter = 1;
// Although the instructions used by these nodes do not in themselves
// change CC, the insertion requires new blocks, and CC cannot be live
@@ -1476,14 +1468,14 @@ multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
SDPatternOperator load, AddressingMode mode> {
let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
def "" : Pseudo<(outs),
- (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc),
+ (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
[(store (z_select_ccmask cls:$new, (load mode:$addr),
- uimm8zx4:$valid, uimm8zx4:$cc),
+ imm32zx4:$valid, imm32zx4:$cc),
mode:$addr)]>;
def Inv : Pseudo<(outs),
- (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc),
+ (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
[(store (z_select_ccmask (load mode:$addr), cls:$new,
- uimm8zx4:$valid, uimm8zx4:$cc),
+ imm32zx4:$valid, imm32zx4:$cc),
mode:$addr)]>;
}
}
@@ -1611,6 +1603,7 @@ class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
// An alias of a RotateSelectRIEf, but with different register sizes.
class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
: Alias<6, (outs cls1:$R1),
- (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> {
+ (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+ imm32zx6:$I5), []> {
let Constraints = "$R1 = $R1src";
}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 6a18b2d..f58ab47 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -40,9 +40,9 @@ static bool isHighReg(unsigned int Reg) {
// Pin the vtable to this file.
void SystemZInstrInfo::anchor() {}
-SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
+SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
: SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
- RI(tm), TM(tm) {
+ RI(), STI(sti) {
}
// MI is a 128-bit load or store. Split it into two 64-bit loads or stores,
@@ -488,7 +488,7 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
if (Value == 0 &&
!IsLogical &&
- removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+ removeIPMBasedCompare(Compare, SrcReg, MRI, &RI))
return true;
return false;
}
@@ -505,7 +505,7 @@ static unsigned getConditionalMove(unsigned Opcode) {
bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const {
unsigned Opcode = MI->getOpcode();
- if (TM.getSubtargetImpl()->hasLoadStoreOnCond() &&
+ if (STI.hasLoadStoreOnCond() &&
getConditionalMove(Opcode))
return true;
return false;
@@ -537,7 +537,7 @@ PredicateInstruction(MachineInstr *MI,
unsigned CCMask = Pred[1].getImm();
assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
unsigned Opcode = MI->getOpcode();
- if (TM.getSubtargetImpl()->hasLoadStoreOnCond()) {
+ if (STI.hasLoadStoreOnCond()) {
if (unsigned CondOpcode = getConditionalMove(Opcode)) {
MI->setDesc(get(CondOpcode));
MachineInstrBuilder(*MI->getParent()->getParent(), MI)
@@ -685,7 +685,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// We prefer to keep the two-operand form where possible both
// because it tends to be shorter and because some instructions
// have memory forms that can be used during spilling.
- if (TM.getSubtargetImpl()->hasDistinctOps()) {
+ if (STI.hasDistinctOps()) {
MachineOperand &Dest = MI->getOperand(0);
MachineOperand &Src = MI->getOperand(1);
unsigned DestReg = Dest.getReg();
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 09aee5d..83009cb 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -110,9 +110,10 @@ struct Branch {
};
} // end namespace SystemZII
+class SystemZSubtarget;
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
- SystemZTargetMachine &TM;
+ SystemZSubtarget &STI;
void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
@@ -130,7 +131,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
virtual void anchor();
public:
- explicit SystemZInstrInfo(SystemZTargetMachine &TM);
+ explicit SystemZInstrInfo(SystemZSubtarget &STI);
// Override TargetInstrInfo.
unsigned isLoadFromStackSlot(const MachineInstr *MI,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index e70df92..f4951ad 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -63,11 +63,11 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1,
brtarget32:$I2), "jg$R1\t$I2", []>;
}
- def AsmBRC : InstRI<0xA74, (outs), (ins uimm8zx4:$R1, brtarget16:$I2),
+ def AsmBRC : InstRI<0xA74, (outs), (ins imm32zx4:$R1, brtarget16:$I2),
"brc\t$R1, $I2", []>;
- def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2),
+ def AsmBRCL : InstRIL<0xC04, (outs), (ins imm32zx4:$R1, brtarget32:$I2),
"brcl\t$R1, $I2", []>;
- def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2),
+ def AsmBCR : InstRR<0x07, (outs), (ins imm32zx4:$R1, GR64:$R2),
"bcr\t$R1, $R2", []>;
}
@@ -109,7 +109,7 @@ multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
}
let isCodeGenOnly = 1 in
defm C : CompareBranches<cond4, "$M3", "">;
-defm AsmC : CompareBranches<uimm8zx4, "", "$M3, ">;
+defm AsmC : CompareBranches<imm32zx4, "", "$M3, ">;
// Define AsmParser mnemonics for each general condition-code mask
// (integer or floating-point)
@@ -233,9 +233,7 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store,
// Call instructions
//===----------------------------------------------------------------------===//
-// The definitions here are for the call-clobbered registers.
-let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
- F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC] in {
+let isCall = 1, Defs = [R14D, CC] in {
def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
[(z_call pcrel32:$I2)]>;
def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
@@ -855,7 +853,7 @@ let Defs = [CC] in {
}
// AND to memory
- defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>;
+ defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>;
// Block AND.
let mayLoad = 1, mayStore = 1 in
@@ -912,7 +910,7 @@ let Defs = [CC] in {
}
// OR to memory
- defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>;
+ defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>;
// Block OR.
let mayLoad = 1, mayStore = 1 in
@@ -952,7 +950,7 @@ let Defs = [CC] in {
}
// XOR to memory
- defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>;
+ defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>;
// Block XOR.
let mayLoad = 1, mayStore = 1 in
@@ -1015,26 +1013,26 @@ def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>;
// Shift left.
let neverHasSideEffects = 1 in {
- defm SLL : ShiftRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
- def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64>;
+ defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+ def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
}
// Logical shift right.
let neverHasSideEffects = 1 in {
- defm SRL : ShiftRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
- def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64>;
+ defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+ def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
}
// Arithmetic shift right.
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
- defm SRA : ShiftRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
- def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64>;
+ defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
+ def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
}
// Rotate left.
let neverHasSideEffects = 1 in {
- def RLL : ShiftRSY<"rll", 0xEB1D, rotl, GR32>;
- def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64>;
+ def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>;
+ def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
}
// Rotate second operand left and insert selected bits into first operand.
@@ -1403,15 +1401,15 @@ def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
// Optimize sign-extended 1/0 selects to -1/0 selects. This is important
// for vector legalization.
-def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, uimm8zx4:$cc)),
+def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
(i32 31)),
(i32 31)),
- (Select32 (LHI -1), (LHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
-def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid,
- uimm8zx4:$cc)))),
+ (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
+ imm32zx4:$cc)))),
(i32 63)),
(i32 63)),
- (Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
+ (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
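
The rewritten patterns rest on a small arithmetic identity: shifting a 1/0 value into the sign bit and arithmetic-shifting it back produces the all-ones/all-zero mask that `Select32`/`Select64` can materialize directly with `LHI -1`/`LHI 0`. A self-contained check of that identity (a sketch, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// (x << 31) >> 31 for x in {0,1} yields -1 or 0. The left shift is done
// on an unsigned value to avoid signed-overflow UB in plain C++.
int32_t maskFromBool(int32_t x) {
  return static_cast<int32_t>(static_cast<uint32_t>(x) << 31) >> 31;
}

int main() {
  assert(maskFromBool(1) == -1);
  assert(maskFromBool(0) == 0);
}
```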
// Peepholes for turning scalar operations into block operations.
defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 3ad146c..7be81dc 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -202,21 +202,6 @@ def S32Imm : ImmediateAsmOperand<"S32Imm">;
def U32Imm : ImmediateAsmOperand<"U32Imm">;
//===----------------------------------------------------------------------===//
-// 8-bit immediates
-//===----------------------------------------------------------------------===//
-
-def uimm8zx4 : Immediate<i8, [{
- return isUInt<4>(N->getZExtValue());
-}], NOOP_SDNodeXForm, "U4Imm">;
-
-def uimm8zx6 : Immediate<i8, [{
- return isUInt<6>(N->getZExtValue());
-}], NOOP_SDNodeXForm, "U6Imm">;
-
-def simm8 : Immediate<i8, [{}], SIMM8, "S8Imm">;
-def uimm8 : Immediate<i8, [{}], UIMM8, "U8Imm">;
-
-//===----------------------------------------------------------------------===//
// i32 immediates
//===----------------------------------------------------------------------===//
@@ -241,6 +226,14 @@ def imm32lh16c : Immediate<i32, [{
}], LH16, "U16Imm">;
// Short immediates
+def imm32zx4 : Immediate<i32, [{
+ return isUInt<4>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U4Imm">;
+
+def imm32zx6 : Immediate<i32, [{
+ return isUInt<6>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U6Imm">;
+
def imm32sx8 : Immediate<i32, [{
return isInt<8>(N->getSExtValue());
}], SIMM8, "S8Imm">;
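
The new i32-typed operands keep the same bit-width predicates as the old i8 ones; only the SelectionDAG type changes. For reference, `isUInt<N>` (from llvm/Support/MathExtras.h) boils down to the following, shown here as a standalone sketch:

```cpp
#include <cstdint>

// Standalone equivalent of llvm::isUInt<N>: true iff V fits in N bits
// as an unsigned value. imm32zx4 accepts 0..15 (condition-code masks),
// imm32zx6 accepts 0..63 (shift/rotate amounts).
template <unsigned N> bool fitsUnsigned(uint64_t V) {
  static_assert(N < 64, "sketch only covers N < 64");
  return V < (UINT64_C(1) << N);
}
```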
@@ -470,13 +463,13 @@ def AccessReg : AsmOperandClass {
let Name = "AccessReg";
let ParserMethod = "parseAccessReg";
}
-def access_reg : Immediate<i8, [{ return N->getZExtValue() < 16; }],
+def access_reg : Immediate<i32, [{ return N->getZExtValue() < 16; }],
NOOP_SDNodeXForm, "AccessReg"> {
let ParserMatchClass = AccessReg;
}
// A 4-bit condition-code mask.
-def cond4 : PatLeaf<(i8 imm), [{ return (N->getZExtValue() < 16); }]>,
- Operand<i8> {
+def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>,
+ Operand<i32> {
let PrintMethod = "printCond4Operand";
}
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index a391961..c70e662 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -19,14 +19,14 @@ def SDT_ZICmp : SDTypeProfile<0, 3,
[SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>]>;
def SDT_ZBRCCMask : SDTypeProfile<0, 3,
- [SDTCisVT<0, i8>,
- SDTCisVT<1, i8>,
+ [SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>,
SDTCisVT<2, OtherVT>]>;
def SDT_ZSelectCCMask : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
- SDTCisVT<3, i8>,
- SDTCisVT<4, i8>]>;
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
def SDT_ZWrapPtr : SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -37,7 +37,7 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2,
def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
def SDT_ZExtractAccess : SDTypeProfile<1, 1,
[SDTCisVT<0, i32>,
- SDTCisVT<1, i8>]>;
+ SDTCisVT<1, i32>]>;
def SDT_ZGR128Binary32 : SDTypeProfile<1, 2,
[SDTCisVT<0, untyped>,
SDTCisVT<1, untyped>,
@@ -77,7 +77,7 @@ def SDT_ZString : SDTypeProfile<1, 3,
SDTCisVT<3, i32>]>;
def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
def SDT_ZPrefetch : SDTypeProfile<0, 2,
- [SDTCisVT<0, i8>,
+ [SDTCisVT<0, i32>,
SDTCisPtrTy<1>]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index c0f94ec..e307f8a 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -101,15 +101,15 @@ multiclass CondStores64<Instruction insn, Instruction insninv,
SDPatternOperator store, SDPatternOperator load,
AddressingMode mode> {
def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
- uimm8zx4:$valid, uimm8zx4:$cc),
+ imm32zx4:$valid, imm32zx4:$cc),
mode:$addr),
(insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
- uimm8zx4:$valid, uimm8zx4:$cc)>;
+ imm32zx4:$valid, imm32zx4:$cc)>;
def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
- uimm8zx4:$valid, uimm8zx4:$cc),
+ imm32zx4:$valid, imm32zx4:$cc),
mode:$addr),
(insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
- uimm8zx4:$valid, uimm8zx4:$cc)>;
+ imm32zx4:$valid, imm32zx4:$cc)>;
}
// Try to use MVC instruction INSN for a load of type LOAD followed by a store
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index a04d703..f03bcc4 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -7,31 +7,29 @@
//
//===----------------------------------------------------------------------===//
+#include "SystemZInstrInfo.h"
#include "SystemZRegisterInfo.h"
-#include "SystemZTargetMachine.h"
+#include "SystemZSubtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "SystemZGenRegisterInfo.inc"
-SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm)
- : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {}
+SystemZRegisterInfo::SystemZRegisterInfo()
+ : SystemZGenRegisterInfo(SystemZ::R14D) {}
-const MCPhysReg*
+const MCPhysReg *
SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- static const MCPhysReg CalleeSavedRegs[] = {
- SystemZ::R6D, SystemZ::R7D, SystemZ::R8D, SystemZ::R9D,
- SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D,
- SystemZ::R14D, SystemZ::R15D,
- SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
- SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D,
- 0
- };
-
- return CalleeSavedRegs;
+ return CSR_SystemZ_SaveList;
+}
+
+const uint32_t *
+SystemZRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ return CSR_SystemZ_RegMask;
}
BitVector
@@ -63,7 +61,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineBasicBlock &MBB = *MI->getParent();
MachineFunction &MF = *MBB.getParent();
- auto *TII = static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
+ auto *TII =
+ static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
DebugLoc DL = MI->getDebugLoc();
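
Replacing the hand-written array with `CSR_SystemZ_SaveList`/`CSR_SystemZ_RegMask` moves the callee-saved set into TableGen, which also emits the regmask form attached to call instructions. A regmask is simply one bit per physical register; a sketch of the membership test, under the assumption that a set bit means "preserved":

```cpp
#include <cstdint>

// Sketch: a call-preserved mask is a packed bit array indexed by
// physical register number. Bit set means the register survives the call.
inline bool isPreservedAcrossCall(const uint32_t *Mask, unsigned PhysReg) {
  return (Mask[PhysReg / 32] >> (PhysReg % 32)) & 1u;
}
```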
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index e236f71..9bffa46 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -29,15 +29,9 @@ inline unsigned odd128(bool Is32bit) {
}
} // end namespace SystemZ
-class SystemZSubtarget;
-class SystemZInstrInfo;
-
struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
-private:
- SystemZTargetMachine &TM;
-
public:
- SystemZRegisterInfo(SystemZTargetMachine &tm);
+ SystemZRegisterInfo();
// Override TargetRegisterInfo.h.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
@@ -51,6 +45,7 @@ public:
}
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const
override;
+ const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 93d7c83..47ac20d 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -119,6 +119,29 @@ defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>;
// Floating-point registers
//===----------------------------------------------------------------------===//
+// Maps FPR register numbers to their DWARF encoding.
+class DwarfMapping<int id> { int Id = id; }
+
+def F0Dwarf : DwarfMapping<16>;
+def F2Dwarf : DwarfMapping<17>;
+def F4Dwarf : DwarfMapping<18>;
+def F6Dwarf : DwarfMapping<19>;
+
+def F1Dwarf : DwarfMapping<20>;
+def F3Dwarf : DwarfMapping<21>;
+def F5Dwarf : DwarfMapping<22>;
+def F7Dwarf : DwarfMapping<23>;
+
+def F8Dwarf : DwarfMapping<24>;
+def F10Dwarf : DwarfMapping<25>;
+def F12Dwarf : DwarfMapping<26>;
+def F14Dwarf : DwarfMapping<27>;
+
+def F9Dwarf : DwarfMapping<28>;
+def F11Dwarf : DwarfMapping<29>;
+def F13Dwarf : DwarfMapping<30>;
+def F15Dwarf : DwarfMapping<31>;
+
// Lower 32 bits of one of the 16 64-bit floating-point registers
class FPR32<bits<16> num, string n> : SystemZReg<n> {
let HWEncoding = num;
@@ -142,7 +165,7 @@ class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
foreach I = 0-15 in {
def F#I#S : FPR32<I, "f"#I>;
def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
- DwarfRegNum<[!add(I, 16)]>;
+ DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
}
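
The previous `!add(I, 16)` assumed a linear mapping, but the SystemZ ABI interleaves FPR DWARF numbers; the `DwarfMapping` defs let the `foreach` look each register up by name instead. The resulting mapping, written out as a plain table derived from the defs above:

```cpp
// fpr index -> DWARF register number, per the DwarfMapping defs:
// even/odd registers are interleaved within each half.
static const int FPRDwarf[16] = {
    16, 20, 17, 21, 18, 22, 19, 23,  // f0..f7
    24, 28, 25, 29, 26, 30, 27, 31}; // f8..f15
```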
foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 97abee3..a3cba64 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -18,10 +18,8 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-selectiondag-info"
-SystemZSelectionDAGInfo::
-SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+SystemZSelectionDAGInfo::SystemZSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 79e7fab..e9de146 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class SystemZTargetMachine;
class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit SystemZSelectionDAGInfo(const SystemZTargetMachine &TM);
+ explicit SystemZSelectionDAGInfo(const DataLayout &DL);
~SystemZSelectionDAGInfo();
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index a011157..e160bc8 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -20,16 +20,11 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "SystemZGenSubtargetInfo.inc"
-// Pin the vtabel to this file.
+// Pin the vtable to this file.
void SystemZSubtarget::anchor() {}
-SystemZSubtarget::SystemZSubtarget(const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
- HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasFastSerialization(false), HasInterlockedAccess1(false),
- TargetTriple(TT) {
+SystemZSubtarget &
+SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
std::string CPUName = CPU;
if (CPUName.empty())
CPUName = "generic";
@@ -37,11 +32,26 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT,
if (CPUName == "generic")
CPUName = sys::getHostCPUName();
#endif
-
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
+ return *this;
}
+SystemZSubtarget::SystemZSubtarget(const std::string &TT,
+ const std::string &CPU,
+ const std::string &FS,
+ const TargetMachine &TM)
+ : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
+ HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
+ TargetTriple(TT),
+ // Make sure that global data has at least 16 bits of alignment by
+ // default, so that we can refer to it using LARL. We don't have any
+ // special requirements for stack variables though.
+ DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+ TSInfo(DL), FrameLowering() {}
+
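
Having `initializeSubtargetDependencies` return `*this` is the standard trick for sequencing work before dependent members are constructed: member initializers run in declaration order, so routing the `InstrInfo` initializer through the helper guarantees the feature bits are parsed first. A minimal sketch of the idiom with hypothetical names:

```cpp
#include <string>

// Hypothetical miniature of the pattern: Parsed must be filled in
// before Cached is constructed, and both are plain members.
struct Mini {
  bool Parsed = false;
  int Cached;

  Mini &init(const std::string &FS) {
    Parsed = !FS.empty(); // stands in for ParseSubtargetFeatures()
    return *this;
  }
  // Parsed is initialized first (declaration order), then init() runs
  // inside Cached's initializer, before Cached itself is set.
  explicit Mini(const std::string &FS)
      : Cached(init(FS).Parsed ? 1 : 0) {}
};
```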
// Return true if GV binds locally under reloc model RM.
static bool bindsLocally(const GlobalValue *GV, Reloc::Model RM) {
// For non-PIC, all symbols bind locally.
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index ffca2d8..4e8c710 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -14,6 +14,12 @@
#ifndef SYSTEMZSUBTARGET_H
#define SYSTEMZSUBTARGET_H
+#include "SystemZFrameLowering.h"
+#include "SystemZISelLowering.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -37,10 +43,26 @@ protected:
private:
Triple TargetTriple;
-
+ const DataLayout DL;
+ SystemZInstrInfo InstrInfo;
+ SystemZTargetLowering TLInfo;
+ SystemZSelectionDAGInfo TSInfo;
+ SystemZFrameLowering FrameLowering;
+
+ SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS);
public:
SystemZSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS);
+ const std::string &FS, const TargetMachine &TM);
+
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const SystemZRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const SystemZTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
// This is important for reducing register pressure in vector code.
bool useAA() const override { return true; }
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 4c9ce29..0122e99 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -22,17 +22,10 @@ extern "C" void LLVMInitializeSystemZTarget() {
SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
- Reloc::Model RM,
- CodeModel::Model CM,
+ Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS),
- // Make sure that global data has at least 16 bits of alignment by default,
- // so that we can refer to it using LARL. We don't have any special
- // requirements for stack variables though.
- DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"),
- InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(*this, Subtarget) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
@@ -65,7 +58,8 @@ bool SystemZPassConfig::addInstSelector() {
}
bool SystemZPassConfig::addPreSched2() {
- if (getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
+ if (getOptLevel() != CodeGenOpt::None &&
+ getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
addPass(&IfConverterID);
return true;
}
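
The extra `getOptLevel()` check follows the usual convention for optional codegen passes: anything that exists purely for performance is skipped at -O0 so debug builds stay fast and predictable. As a sketch of the gate:

```cpp
enum class OptLevel { None, Less, Default, Aggressive };

// If-conversion is an optimization, so it needs both an opt level
// above None and hardware support for conditional load/store.
bool shouldAddIfConverter(OptLevel OL, bool HasLoadStoreOnCond) {
  return OL != OptLevel::None && HasLoadStoreOnCond;
}
```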
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 1db717b..ded07e9 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -15,25 +15,15 @@
#ifndef SYSTEMZTARGETMACHINE_H
#define SYSTEMZTARGETMACHINE_H
-#include "SystemZFrameLowering.h"
-#include "SystemZISelLowering.h"
-#include "SystemZInstrInfo.h"
-#include "SystemZRegisterInfo.h"
-#include "SystemZSelectionDAGInfo.h"
#include "SystemZSubtarget.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
+class TargetFrameLowering;
+
class SystemZTargetMachine : public LLVMTargetMachine {
SystemZSubtarget Subtarget;
- const DataLayout DL;
- SystemZInstrInfo InstrInfo;
- SystemZTargetLowering TLInfo;
- SystemZSelectionDAGInfo TSInfo;
- SystemZFrameLowering FrameLowering;
public:
SystemZTargetMachine(const Target &T, StringRef TT, StringRef CPU,
@@ -43,25 +33,25 @@ public:
// Override TargetMachine.
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
}
const SystemZInstrInfo *getInstrInfo() const override {
- return &InstrInfo;
+ return getSubtargetImpl()->getInstrInfo();
}
const SystemZSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
const DataLayout *getDataLayout() const override {
- return &DL;
+ return getSubtargetImpl()->getDataLayout();
}
const SystemZRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
}
const SystemZTargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
// Override LLVMTargetMachine
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 8365f64..95c8cb6 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -88,8 +88,8 @@ CodeModel::Model TargetMachine::getCodeModel() const {
}
/// Get the IR-specified TLS model for GV.
-static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) {
- switch (Var->getThreadLocalMode()) {
+static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
+ switch (GV->getThreadLocalMode()) {
case GlobalVariable::NotThreadLocal:
llvm_unreachable("getSelectedTLSModel for non-TLS variable");
break;
@@ -127,13 +127,10 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
Model = TLSModel::InitialExec;
}
- const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
- if (Var) {
- // If the user specified a more specific model, use that.
- TLSModel::Model SelectedModel = getSelectedTLSModel(Var);
- if (SelectedModel > Model)
- return SelectedModel;
- }
+ // If the user specified a more specific model, use that.
+ TLSModel::Model SelectedModel = getSelectedTLSModel(GV);
+ if (SelectedModel > Model)
+ return SelectedModel;
return Model;
}
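
Dropping the `dyn_cast` works because `getThreadLocalMode` lives on `GlobalValue`, not just `GlobalVariable`. The surrounding logic depends on `TLSModel`'s enumerators being ordered from most general to most specific, so a plain `>` picks the stronger model; a sketch:

```cpp
// TLSModel::Model orders models weakest-to-strongest, so the
// user-requested model wins only when it is more specific than what
// the reloc-model and visibility analysis inferred.
enum Model { GeneralDynamic, LocalDynamic, InitialExec, LocalExec };

Model pickTLSModel(Model Inferred, Model UserSpecified) {
  return UserSpecified > Inferred ? UserSpecified : Inferred;
}
```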
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index 3ca13da..87b6b66 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -39,10 +39,23 @@ bool TargetSubtargetInfo::useMachineScheduler() const {
return enableMachineScheduler();
}
+bool TargetSubtargetInfo::enableAtomicExpandLoadLinked() const {
+ return true;
+}
+
bool TargetSubtargetInfo::enableMachineScheduler() const {
return false;
}
+bool TargetSubtargetInfo::enableRALocalReassignment(
+ CodeGenOpt::Level OptLevel) const {
+ return true;
+}
+
+bool TargetSubtargetInfo::enablePostMachineScheduler() const {
+ return false;
+}
+
bool TargetSubtargetInfo::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 0d0a9ca..e2c4be7 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,6 +12,7 @@ x86_codegen_TBLGEN_TABLES := \
x86_codegen_SRC_FILES := \
X86AsmPrinter.cpp \
+ X86AtomicExpandPass.cpp \
X86CodeEmitter.cpp \
X86FastISel.cpp \
X86FixupLEAs.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index f3e6b3f..a365f62 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -36,8 +37,8 @@ bool IsStackReg(unsigned Reg) {
}
std::string FuncName(unsigned AccessSize, bool IsWrite) {
- return std::string("__sanitizer_sanitize_") + (IsWrite ? "store" : "load") +
- (utostr(AccessSize));
+ return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
+ utostr(AccessSize);
}
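
The rename points the instrumentation at compiler-rt's real ASan entry points. For reference, a standalone version of the naming scheme (a sketch of the same string construction):

```cpp
#include <string>

// Builds names like __asan_report_load4 / __asan_report_store8,
// matching the functions exported by compiler-rt's ASan runtime.
std::string asanReportName(unsigned AccessSize, bool IsWrite) {
  return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
         std::to_string(AccessSize);
}
```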
class X86AddressSanitizer : public X86AsmInstrumentation {
@@ -47,47 +48,55 @@ public:
// X86AsmInstrumentation implementation:
virtual void InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override {
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) override {
InstrumentMOV(Inst, Operands, Ctx, MII, Out);
}
// Should be implemented differently in x86_32 and x86_64 subclasses.
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) = 0;
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) = 0;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) = 0;
- void InstrumentMemOperand(MCParsedAsmOperand *Op, unsigned AccessSize,
+ void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize,
bool IsWrite, MCContext &Ctx, MCStreamer &Out);
- void InstrumentMOV(const MCInst &Inst,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
Out.EmitInstruction(Inst, STI);
}
+ void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
protected:
const MCSubtargetInfo &STI;
};
void X86AddressSanitizer::InstrumentMemOperand(
- MCParsedAsmOperand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
MCStreamer &Out) {
- assert(Op && Op->isMem() && "Op should be a memory operand.");
+ assert(Op.isMem() && "Op should be a memory operand.");
assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
"AccessSize should be a power of two, less or equal than 16.");
- X86Operand *MemOp = static_cast<X86Operand *>(Op);
+ X86Operand &MemOp = static_cast<X86Operand &>(Op);
// FIXME: get rid of this limitation.
- if (IsStackReg(MemOp->getMemBaseReg()) || IsStackReg(MemOp->getMemIndexReg()))
+ if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg()))
return;
- InstrumentMemOperandImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+ // FIXME: take into account load/store alignment.
+ if (AccessSize < 8)
+ InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+ else
+ InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
}
void X86AddressSanitizer::InstrumentMOV(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) {
// Access size in bytes.
unsigned AccessSize = 0;
@@ -124,107 +133,351 @@ void X86AddressSanitizer::InstrumentMOV(
const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
- MCParsedAsmOperand *Op = Operands[Ix];
- if (Op && Op->isMem())
+ assert(Operands[Ix]);
+ MCParsedAsmOperand &Op = *Operands[Ix];
+ if (Op.isMem())
InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
}
}
class X86AddressSanitizer32 : public X86AddressSanitizer {
public:
+ static const long kShadowOffset = 0x20000000;
+
X86AddressSanitizer32(const MCSubtargetInfo &STI)
: X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer32() {}
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) override;
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+
+ private:
+ void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+ bool IsWrite, unsigned AddressReg) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP)
+ .addReg(X86::ESP).addImm(-16));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg));
+
+ const std::string& Fn = FuncName(AccessSize, IsWrite);
+ MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+ }
};
-void X86AddressSanitizer32::InstrumentMemOperandImpl(
- X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+void X86AddressSanitizer32::InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
MCStreamer &Out) {
- // FIXME: emit .cfi directives for correct stack unwinding.
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
{
MCInst Inst;
Inst.setOpcode(X86::LEA32r);
Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+
+ {
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::CreateReg(X86::CL));
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(Out,
+ MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL));
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX)
+ .addReg(X86::EDX).addImm(7));
+
+ switch (AccessSize) {
+ case 1:
+ break;
+ case 2: {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::EDX));
+
+ const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
+ break;
}
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX)
+ .addReg(X86::EDX).addImm(3));
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+}
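
The small-access path emitted above is the classic ASan shadow check: a shadow byte of zero means the whole 8-byte granule is addressable, while a nonzero byte k means only the first k bytes are. Written out in C++ as a sketch of the semantics (with kShadowOffset as in X86AddressSanitizer32; this is not the emitted code itself):

```cpp
#include <cstdint>

bool isPoisoned(uintptr_t Addr, unsigned AccessSize) {
  const uintptr_t kShadowOffset = 0x20000000; // 32-bit shadow base
  int8_t Shadow = *reinterpret_cast<int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false; // entire granule addressable: the JE fast path above
  // Partially addressable granule: bytes [0, Shadow) are valid, so the
  // access is bad when its last byte's offset reaches the shadow value.
  return int8_t((Addr & 7) + AccessSize - 1) >= Shadow;
}
```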
+
+void X86AddressSanitizer32::InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+
{
- const std::string Func = FuncName(AccessSize, IsWrite);
- const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
- const MCSymbolRefExpr *FuncExpr =
- MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::EAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
}
- EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::CreateImm(0));
+ EmitInstruction(Out, Inst);
+ }
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+ EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
}
class X86AddressSanitizer64 : public X86AddressSanitizer {
public:
+ static const long kShadowOffset = 0x7fff8000;
+
X86AddressSanitizer64(const MCSubtargetInfo &STI)
: X86AddressSanitizer(STI) {}
virtual ~X86AddressSanitizer64() {}
- virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
- bool IsWrite, MCContext &Ctx,
- MCStreamer &Out) override;
-};
+ virtual void InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
+ virtual void InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) override;
-void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op,
- unsigned AccessSize,
- bool IsWrite,
- MCContext &Ctx,
- MCStreamer &Out) {
- // FIXME: emit .cfi directives for correct stack unwinding.
-
- // Set %rsp below current red zone (128 bytes wide) using LEA instruction to
- // preserve flags.
- {
+private:
+ void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
Inst.addOperand(MCOperand::CreateReg(X86::RSP));
- const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx);
+ const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
std::unique_ptr<X86Operand> Op(
X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
}
+
+ void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
+ bool IsWrite) {
+ EmitInstruction(Out, MCInstBuilder(X86::CLD));
+ EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+ EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP)
+ .addReg(X86::RSP).addImm(-16));
+
+ const std::string& Fn = FuncName(AccessSize, IsWrite);
+ MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+ const MCSymbolRefExpr *FnExpr =
+ MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+ }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmallImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ EmitAdjustRSP(Ctx, Out, -128);
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX));
EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
{
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
Inst.addOperand(MCOperand::CreateReg(X86::RDI));
- Op->addMemOperands(Inst, 5);
+ Op.addMemOperands(Inst, 5);
EmitInstruction(Out, Inst);
}
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+ .addReg(X86::RAX).addImm(3));
{
- const std::string Func = FuncName(AccessSize, IsWrite);
- const MCSymbol *FuncSym = Ctx.GetOrCreateSymbol(StringRef(Func));
- const MCSymbolRefExpr *FuncExpr =
- MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
- EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
+ MCInst Inst;
+ Inst.setOpcode(X86::MOV8rm);
+ Inst.addOperand(MCOperand::CreateReg(X86::AL));
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+
+ EmitInstruction(Out,
+ MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL));
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI));
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(7));
+
+ switch (AccessSize) {
+ case 1:
+ break;
+ case 2: {
+ MCInst Inst;
+ Inst.setOpcode(X86::LEA32r);
+ Inst.addOperand(MCOperand::CreateReg(X86::ECX));
+
+ const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+ std::unique_ptr<X86Operand> Op(
+ X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+ Op->addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ break;
}
+ case 4:
+ EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX)
+ .addReg(X86::ECX).addImm(3));
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+
+ EmitInstruction(
+ Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL));
+ EmitInstruction(
+ Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX));
+ EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF64));
EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+ EmitAdjustRSP(Ctx, Out, 128);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLargeImpl(
+ X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+ MCStreamer &Out) {
+ EmitAdjustRSP(Ctx, Out, -128);
+ EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+ EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
- // Restore old %rsp value.
{
MCInst Inst;
Inst.setOpcode(X86::LEA64r);
- Inst.addOperand(MCOperand::CreateReg(X86::RSP));
-
- const MCExpr *Disp = MCConstantExpr::Create(128, Ctx);
+ Inst.addOperand(MCOperand::CreateReg(X86::RAX));
+ Op.addMemOperands(Inst, 5);
+ EmitInstruction(Out, Inst);
+ }
+ EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
+ .addReg(X86::RAX).addImm(3));
+ {
+ MCInst Inst;
+ switch (AccessSize) {
+ case 8:
+ Inst.setOpcode(X86::CMP8mi);
+ break;
+ case 16:
+ Inst.setOpcode(X86::CMP16mi);
+ break;
+ default:
+ assert(false && "Incorrect access size");
+ break;
+ }
+ const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
std::unique_ptr<X86Operand> Op(
- X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+ X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
Op->addMemOperands(Inst, 5);
+ Inst.addOperand(MCOperand::CreateImm(0));
EmitInstruction(Out, Inst);
}
+
+ MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+ const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+ EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+ EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+ EmitLabel(Out, DoneSym);
+
+ EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+ EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+ EmitAdjustRSP(Ctx, Out, 128);
}

} // End anonymous namespace
@@ -233,8 +486,8 @@ X86AsmInstrumentation::X86AsmInstrumentation() {}
X86AsmInstrumentation::~X86AsmInstrumentation() {}
void X86AsmInstrumentation::InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {}
+ const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+ const MCInstrInfo &MII, MCStreamer &Out) {}
X86AsmInstrumentation *
CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 0369b14..1bc3c09 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -12,6 +12,8 @@
#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
namespace llvm {
class MCContext;
@@ -35,10 +37,9 @@ public:
// Instruments Inst. Should be called just before the original
// instruction is sent to Out.
virtual void InstrumentInstruction(
- const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out);
+ const MCInst &Inst,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
protected:
friend X86AsmInstrumentation *
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d3e695e..f0765ed 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -235,6 +235,7 @@ private:
IES_RSHIFT,
IES_PLUS,
IES_MINUS,
+ IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
IES_LBRAC,
@@ -372,6 +373,7 @@ private:
State = IES_ERROR;
break;
case IES_PLUS:
+ case IES_NOT:
case IES_MULTIPLY:
case IES_DIVIDE:
case IES_LPAREN:
@@ -401,6 +403,19 @@ private:
}
PrevState = CurrState;
}
+ void onNot() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_PLUS:
+ case IES_NOT:
+ State = IES_NOT;
+ break;
+ }
+ PrevState = CurrState;
+ }
void onRegister(unsigned Reg) {
IntelExprState CurrState = State;
switch (State) {
@@ -438,6 +453,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
State = IES_INTEGER;
Sym = SymRef;
SymName = SymRefName;
@@ -453,6 +469,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
case IES_OR:
case IES_AND:
case IES_LSHIFT:
@@ -476,11 +493,22 @@ private:
PrevState == IES_OR || PrevState == IES_AND ||
PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
CurrState == IES_MINUS) {
// Unary minus. No need to pop the minus operand because it was never
// pushed.
IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+ } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+ PrevState == IES_OR || PrevState == IES_AND ||
+ PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+ PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
+ CurrState == IES_NOT) {
+ // Unary not. No need to pop the not operand because it was never
+ // pushed.
+ IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
} else {
IC.pushOperand(IC_IMM, TmpInt);
}
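
Because `~` is handled as a unary operator, there is nothing on the operand stack to pop; the state machine simply complements the literal and pushes it as an immediate. The folding itself is just:

```cpp
#include <cassert>
#include <cstdint>

// Equivalent of the IC.pushOperand(IC_IMM, ~TmpInt) line above: a '~'
// in an Intel-syntax immediate is folded to the complemented constant
// at parse time.
int64_t foldUnaryNot(int64_t TmpInt) { return ~TmpInt; }

int main() {
  assert(foldUnaryNot(0) == -1);
  assert(foldUnaryNot(0x0F) == ~INT64_C(0x0F));
}
```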
@@ -561,6 +589,7 @@ private:
break;
case IES_PLUS:
case IES_MINUS:
+ case IES_NOT:
case IES_OR:
case IES_AND:
case IES_LSHIFT:
@@ -568,13 +597,14 @@ private:
case IES_MULTIPLY:
case IES_DIVIDE:
case IES_LPAREN:
- // FIXME: We don't handle this type of unary minus, yet.
+ // FIXME: We don't handle this type of unary minus or not, yet.
if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
PrevState == IES_OR || PrevState == IES_AND ||
PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC) &&
- CurrState == IES_MINUS) {
+ PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+ PrevState == IES_NOT) &&
+ (CurrState == IES_MINUS || CurrState == IES_NOT)) {
State = IES_ERROR;
break;
}
@@ -618,52 +648,52 @@ private:
return Error(L, Msg, Ranges, MatchingInlineAsm);
}
- X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
+ std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
Error(Loc, Msg);
return nullptr;
}
- X86Operand *DefaultMemSIOperand(SMLoc Loc);
- X86Operand *DefaultMemDIOperand(SMLoc Loc);
- X86Operand *ParseOperand();
- X86Operand *ParseATTOperand();
- X86Operand *ParseIntelOperand();
- X86Operand *ParseIntelOffsetOfOperator();
+ std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+ std::unique_ptr<X86Operand> ParseOperand();
+ std::unique_ptr<X86Operand> ParseATTOperand();
+ std::unique_ptr<X86Operand> ParseIntelOperand();
+ std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
- X86Operand *ParseIntelOperator(unsigned OpKind);
- X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
- X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc,
- unsigned Size);
+ std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ std::unique_ptr<X86Operand>
+ ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ std::unique_ptr<X86Operand>
+ ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
- X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp, unsigned Size);
+ std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg,
+ SMLoc Start,
+ int64_t ImmDisp,
+ unsigned Size);
bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End);
- X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
- X86Operand *CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc Start, SMLoc End,
- unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info);
+ std::unique_ptr<X86Operand>
+ CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
- bool processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ bool processInstruction(MCInst &Inst, const OperandVector &Ops);
/// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
/// instrumentation around Inst.
- void EmitInstruction(MCInst &Inst,
- SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCStreamer &Out);
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
+ OperandVector &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo,
bool MatchingInlineAsm) override;
/// doSrcDstMatch - Returns true if operands are matching in their
@@ -674,8 +704,8 @@ private:
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
/// \return \c true if no parsing errors occurred, \c false otherwise.
- bool HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const MCParsedAsmOperand &Op);
+ bool HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op);
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
@@ -725,9 +755,8 @@ public:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
- bool
- ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
};
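
Switching the parser interfaces to `OperandVector` (a SmallVector of `std::unique_ptr<MCParsedAsmOperand>`) is what lets the later hunks drop the manual `delete &Op` calls: assigning a new operand into the vector destroys the old one. A minimal sketch of that ownership model with hypothetical types:

```cpp
#include <memory>
#include <vector>

struct Operand { virtual ~Operand() = default; };
struct RegOperand : Operand {
  unsigned Reg;
  explicit RegOperand(unsigned R) : Reg(R) {}
};

// Replacing an element releases the previous operand automatically:
// no explicit delete, and no leak on early returns.
void replaceWithReg(std::vector<std::unique_ptr<Operand>> &Ops, unsigned Reg) {
  Ops.back() = std::make_unique<RegOperand>(Reg);
}
```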
@@ -908,7 +937,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
-X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
unsigned basereg =
is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
@@ -916,7 +945,7 @@ X86Operand *X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
/*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
}
-X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
unsigned basereg =
is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
@@ -924,7 +953,7 @@ X86Operand *X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
/*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0);
}
-X86Operand *X86AsmParser::ParseOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
if (isParsingIntelSyntax())
return ParseIntelOperand();
return ParseATTOperand();
@@ -946,12 +975,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
return Size;
}
-X86Operand *
-X86AsmParser::CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc Start, SMLoc End,
- unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info){
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+ unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+ InlineAsmIdentifierInfo &Info) {
// If this is not a VarDecl then assume it is a FuncDecl or some other label
// reference. We need an 'r' constraint here, so we need to create register
// operand to ensure proper matching. Just pick a GPR based on the size of
@@ -1064,7 +1091,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
break;
- switch (getLexer().getKind()) {
+ AsmToken::TokenKind TK = getLexer().getKind();
+ switch (TK) {
default: {
if (SM.isValidEndState()) {
Done = true;
@@ -1076,13 +1104,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
Done = true;
break;
}
+ case AsmToken::String:
case AsmToken::Identifier: {
// This could be a register or a symbolic displacement.
unsigned TmpReg;
const MCExpr *Val;
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
- if(!ParseRegister(TmpReg, IdentLoc, End)) {
+ if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
SM.onRegister(TmpReg);
UpdateLocLex = false;
break;
@@ -1142,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
case AsmToken::Plus: SM.onPlus(); break;
case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Tilde: SM.onNot(); break;
case AsmToken::Star: SM.onStar(); break;
case AsmToken::Slash: SM.onDivide(); break;
case AsmToken::Pipe: SM.onOr(); break;
@@ -1164,9 +1194,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return false;
}
-X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp,
- unsigned Size) {
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+ int64_t ImmDisp, unsigned Size) {
const AsmToken &Tok = Parser.getTok();
SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
if (getLexer().isNot(AsmToken::LBrac))
@@ -1270,9 +1300,9 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
}
/// \brief Parse intel style segment override.
-X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
- SMLoc Start,
- unsigned Size) {
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+ unsigned Size) {
assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
const AsmToken &Tok = Parser.getTok(); // Eat colon.
if (Tok.isNot(AsmToken::Colon))
@@ -1321,8 +1351,9 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
}
/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
- unsigned Size) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
+ SMLoc Start,
+ unsigned Size) {
const AsmToken &Tok = Parser.getTok();
SMLoc End;
@@ -1425,7 +1456,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
/// Parse the 'offset' operator. This operator is used to specify the
/// location rather than the content of a variable.
-X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
const AsmToken &Tok = Parser.getTok();
SMLoc OffsetOfLoc = Tok.getLoc();
Parser.Lex(); // Eat offset.
@@ -1462,7 +1493,7 @@ enum IntelOperatorKind {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
const AsmToken &Tok = Parser.getTok();
SMLoc TypeLoc = Tok.getLoc();
Parser.Lex(); // Eat operator.
@@ -1495,7 +1526,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
return X86Operand::CreateImm(Imm, Start, End);
}
-X86Operand *X86AsmParser::ParseIntelOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
@@ -1523,7 +1554,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
// Immediate.
if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
- getLexer().is(AsmToken::LParen)) {
+ getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) {
AsmToken StartTok = Tok;
IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
/*AddImmPrefix=*/false);
@@ -1577,7 +1608,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
}
-X86Operand *X86AsmParser::ParseATTOperand() {
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
switch (getLexer().getKind()) {
default:
// Parse a memory operand with no segment register.
@@ -1613,9 +1644,8 @@ X86Operand *X86AsmParser::ParseATTOperand() {
}
}
-bool
-X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- const MCParsedAsmOperand &Op) {
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+ const MCParsedAsmOperand &Op) {
if(STI.getFeatureBits() & X86::FeatureAVX512) {
if (getLexer().is(AsmToken::LCurly)) {
// Eat "{" and mark the current place.
@@ -1653,8 +1683,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
} else {
// Parse mask register {%k1}
Operands.push_back(X86Operand::CreateToken("{", consumedToken));
- if (X86Operand *Op = ParseOperand()) {
- Operands.push_back(Op);
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
if (!getLexer().is(AsmToken::RCurly))
return !ErrorAndEatStatement(getLexer().getLoc(),
"Expected } at this point");
@@ -1682,7 +1712,8 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
/// has already been parsed if present.
-X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+ SMLoc MemStart) {
// We have to disambiguate a parenthesized expression "(4+5)" from the start
// of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
@@ -1845,9 +1876,8 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
MemStart, MemEnd);
}
-bool X86AsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
InstInfo = &Info;
StringRef PatchedName = Name;
@@ -1940,9 +1970,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
// Read the operands.
while(1) {
- if (X86Operand *Op = ParseOperand()) {
- Operands.push_back(Op);
- if (!HandleAVX512Operand(Operands, *Op))
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ Operands.push_back(std::move(Op));
+ if (!HandleAVX512Operand(Operands, *Operands.back()))
return true;
} else {
Parser.eatToEndOfStatement();
@@ -1973,27 +2003,25 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
// documented form in various unofficial manuals, so a lot of code uses it.
if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.back();
+ X86Operand &Op = (X86Operand &)*Operands.back();
if (Op.isMem() && Op.Mem.SegReg == 0 &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
SMLoc Loc = Op.getEndLoc();
Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- delete &Op;
}
}
// Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+ X86Operand &Op = (X86Operand &)*Operands[1];
if (Op.isMem() && Op.Mem.SegReg == 0 &&
isa<MCConstantExpr>(Op.Mem.Disp) &&
cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
SMLoc Loc = Op.getEndLoc();
- Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
- delete &Op;
+ Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
}
}
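[Editorial aside: a concrete illustration of the two rewrites above; illustrative, not taken from the patch's tests:]

    // out %al, (%dx)   is rewritten to   outb %al, %dx
    // in  (%dx), %al   is rewritten to   inb  %dx, %al
    // The (%dx) operand has no displacement, segment, or index, so it is
    // really just the DX port register spelled as a memory operand.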
@@ -2060,8 +2088,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.push_back(DefaultMemSIOperand(NameLoc));
}
} else if (Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
if (!doSrcDstMatch(Op, Op2))
return Error(Op.getStartLoc(),
"mismatching source and destination index registers");
@@ -2076,10 +2104,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
(Name == "smov" || Name == "smovb" || Name == "smovw" ||
Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
if (Operands.size() == 1) {
- if (Name == "movsd") {
- delete Operands.back();
+ if (Name == "movsd")
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
- }
if (isParsingIntelSyntax()) {
Operands.push_back(DefaultMemDIOperand(NameLoc));
Operands.push_back(DefaultMemSIOperand(NameLoc));
@@ -2088,8 +2114,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.push_back(DefaultMemDIOperand(NameLoc));
}
} else if (Operands.size() == 3) {
- X86Operand &Op = *(X86Operand*)Operands.begin()[1];
- X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+ X86Operand &Op = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
if (!doSrcDstMatch(Op, Op2))
return Error(Op.getStartLoc(),
"mismatching source and destination index registers");
@@ -2105,31 +2131,26 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
Operands.size() == 3) {
if (isParsingIntelSyntax()) {
// Intel syntax
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[2];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
Operands.pop_back();
- }
} else {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
- delete Operands[1];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
Operands.erase(Operands.begin() + 1);
- }
}
}
// Transforms "int $3" into "int3" as a size optimization. We can't write an
// instalias with an immediate operand yet.
if (Name == "int" && Operands.size() == 2) {
- X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
- if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
- cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
- delete Operands[1];
+ X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+ if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+ cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) {
Operands.erase(Operands.begin() + 1);
- static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+ static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
}
}
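[Editorial aside: the size win is concrete, since the dedicated breakpoint opcode is one byte shorter than the generic form:]

    // int $3  ->  CD 03   (two bytes, generic "int imm8")
    // int3    ->  CC      (one byte)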
@@ -2175,9 +2196,7 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
}
-bool X86AsmParser::
-processInstruction(MCInst &Inst,
- const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
switch (Inst.getOpcode()) {
default: return false;
case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
@@ -2258,51 +2277,47 @@ processInstruction(MCInst &Inst,
static const char *getSubtargetFeatureName(unsigned Val);
-void X86AsmParser::EmitInstruction(
- MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
- MCStreamer &Out) {
+void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+ MCStreamer &Out) {
Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
Out);
Out.EmitInstruction(Inst, STI);
}
-bool X86AsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
- X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
- assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+ X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+ assert(Op.isToken() && "Leading operand should always be a mnemonic!");
ArrayRef<SMRange> EmptyRanges = None;
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
// Also, MatchInstructionImpl should actually *do* the EmitInstruction
// call.
- if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
- Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
- Op->getToken() == "finit" || Op->getToken() == "fsave" ||
- Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
+ if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" ||
+ Op.getToken() == "fstsww" || Op.getToken() == "fstcww" ||
+ Op.getToken() == "finit" || Op.getToken() == "fsave" ||
+ Op.getToken() == "fstenv" || Op.getToken() == "fclex") {
MCInst Inst;
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
EmitInstruction(Inst, Operands, Out);
- const char *Repl =
- StringSwitch<const char*>(Op->getToken())
- .Case("finit", "fninit")
- .Case("fsave", "fnsave")
- .Case("fstcw", "fnstcw")
- .Case("fstcww", "fnstcw")
- .Case("fstenv", "fnstenv")
- .Case("fstsw", "fnstsw")
- .Case("fstsww", "fnstsw")
- .Case("fclex", "fnclex")
- .Default(nullptr);
+ const char *Repl = StringSwitch<const char *>(Op.getToken())
+ .Case("finit", "fninit")
+ .Case("fsave", "fnsave")
+ .Case("fstcw", "fnstcw")
+ .Case("fstcww", "fnstcw")
+ .Case("fstenv", "fnstenv")
+ .Case("fstsw", "fnstsw")
+ .Case("fstsww", "fnstsw")
+ .Case("fclex", "fnclex")
+ .Default(nullptr);
assert(Repl && "Unknown wait-prefixed instruction");
- delete Operands[0];
Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
}
@@ -2355,11 +2370,11 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// following hack.
// Change the operand to point to a temporary token.
- StringRef Base = Op->getToken();
+ StringRef Base = Op.getToken();
SmallString<16> Tmp;
Tmp += Base;
Tmp += ' ';
- Op->setTokenValue(Tmp.str());
+ Op.setTokenValue(Tmp.str());
// If this instruction starts with an 'f', then it is a floating point stack
// instruction. These come in up to three forms for 32-bit, 64-bit, and
@@ -2400,7 +2415,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
ErrorInfoMissingFeature = ErrorInfoIgnore;
// Restore the old token.
- Op->setTokenValue(Base);
+ Op.setTokenValue(Base);
// If exactly one matched, then we treat that as a successful match (and the
// instruction will already have been filled in correctly, since the failing
@@ -2450,8 +2465,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
(Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
if (!WasOriginallyInvalidOperand) {
- ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
- Op->getLocRange();
+ ArrayRef<SMRange> Ranges =
+ MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
Ranges, MatchingInlineAsm);
}
@@ -2462,10 +2477,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "too few operands for instruction",
EmptyRanges, MatchingInlineAsm);
- X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
- if (Operand->getStartLoc().isValid()) {
- SMRange OperandRange = Operand->getLocRange();
- return Error(Operand->getStartLoc(), "invalid operand for instruction",
+ X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+ if (Operand.getStartLoc().isValid()) {
+ SMRange OperandRange = Operand.getLocRange();
+ return Error(Operand.getStartLoc(), "invalid operand for instruction",
OperandRange, MatchingInlineAsm);
}
}
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index de3be38..1bbfc11 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -13,6 +13,7 @@
#include "X86AsmParserCommon.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
namespace llvm {
@@ -410,20 +411,19 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
}
- static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+ static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
- X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
+ auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
return Res;
}
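[Editorial aside: the llvm/ADT/STLExtras.h include added above supplies llvm::make_unique, LLVM's pre-C++14 stand-in for std::make_unique. A minimal sketch of the non-array overload; the real header adds array overloads and SFINAE guards:]

    template <class T, class... Args>
    std::unique_ptr<T> make_unique(Args &&... args) {
      return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }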
- static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
- bool AddressOf = false,
- SMLoc OffsetOfLoc = SMLoc(),
- StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
- X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand>
+ CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
@@ -432,17 +432,18 @@ struct X86Operand : public MCParsedAsmOperand {
return Res;
}
- static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
- X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+ SMLoc StartLoc, SMLoc EndLoc) {
+ auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
return Res;
}
/// Create an absolute memory operand.
- static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ static std::unique_ptr<X86Operand>
+ CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0,
+ StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
@@ -456,12 +457,11 @@ struct X86Operand : public MCParsedAsmOperand {
}
/// Create a generalized memory operand.
- static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0,
- StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
+ static std::unique_ptr<X86Operand>
+ CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, StringRef SymName = StringRef(),
+ void *OpDecl = nullptr) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -469,7 +469,7 @@ struct X86Operand : public MCParsedAsmOperand {
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
"Invalid scale!");
- X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index c54fbc1..a09767e 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen)
set(sources
X86AsmPrinter.cpp
+ X86AtomicExpandPass.cpp
X86CodeEmitter.cpp
X86FastISel.cpp
X86FloatingPoint.cpp
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 804606d..55587d4 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1620,7 +1620,8 @@ static int readVVVV(struct InternalInstruction* insn) {
int vvvv;
if (insn->vectorExtensionType == TYPE_EVEX)
- vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
else if (insn->vectorExtensionType == TYPE_VEX_3B)
vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
else if (insn->vectorExtensionType == TYPE_VEX_2B)
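[Editorial aside: the fix reflects that EVEX encodes the vvvv register specifier in five bits, not four. This is my reading of the encoding, not text from the patch:]

    // vectorExtensionPrefix[2], bits 6:3 -> low four bits of vvvv (inverted)
    // vectorExtensionPrefix[3], bit 3    -> fifth bit, V' (inverted)
    // Without folding V' in, registers 16-31 decoded as registers 0-15.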
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index bf30a8e..23bca0d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -73,11 +73,12 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
- StringRef CPU;
+ const StringRef CPU;
bool HasNopl;
+ const uint64_t MaxNopLength;
public:
X86AsmBackend(const Target &T, StringRef _CPU)
- : MCAsmBackend(), CPU(_CPU) {
+ : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
@@ -331,7 +332,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// 15 is the longest single nop instruction. Emit as many 15-byte nops as
// needed, then emit a nop of the remaining length.
do {
- const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
for (uint8_t i = 0; i < Prefixes; i++)
OW->Write8(0x66);
@@ -365,6 +366,17 @@ public:
}
};
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+ ELF::EM_X86_64);
+ }
+};
+
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
@@ -717,11 +729,10 @@ public:
};
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
- bool SupportsCU;
public:
DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, bool SupportsCU)
- : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
+ StringRef CPU)
+ : DarwinX86AsmBackend(T, MRI, CPU, false) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -732,20 +743,16 @@ public:
/// \brief Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
- return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+ return generateCompactUnwindEncodingImpl(Instrs);
}
};
class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
- bool SupportsCU;
const MachO::CPUSubTypeX86 Subtype;
public:
DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- StringRef CPU, bool SupportsCU,
- MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
- Subtype(st) {
- }
+ StringRef CPU, MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
@@ -788,7 +795,7 @@ public:
/// \brief Generate the compact unwind encoding for the CFI instructions.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
- return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+ return generateCompactUnwindEncodingImpl(Instrs);
}
};
@@ -801,9 +808,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
Triple TheTriple(TT);
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, CPU,
- TheTriple.isMacOSX() &&
- !TheTriple.isMacOSXVersionLT(10, 7));
+ return new DarwinX86_32AsmBackend(T, MRI, CPU);
if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, false, CPU);
@@ -823,14 +828,15 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
.Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
.Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, CPU,
- TheTriple.isMacOSX() &&
- !TheTriple.isMacOSXVersionLT(10, 7), CS);
+ return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
}
if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+ if (TheTriple.getEnvironment() == Triple::GNUX32)
+ return new ELFX86_X32AsmBackend(T, OSABI, CPU);
return new ELFX86_64AsmBackend(T, OSABI, CPU);
}
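[Editorial aside: with this change a triple such as x86_64-unknown-linux-gnux32 (the x32 ILP32 ABI) gets an object writer emitting 32-bit ELF containers with the EM_X86_64 machine type, which is what the x32 psABI specifies; previously such triples fell through to the 64-bit ELF backend. For example, "llvm-mc -triple x86_64-linux-gnux32 -filetype=obj" should now produce a 32-bit ELF object.]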
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 39480ea..83b2777 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -74,8 +74,9 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// FIXME: this should not depend on the target OS version, but on the ld64
// version in use. From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified
- // FDE relocs may be used.
- DwarfFDESymbolsUseAbsDiff = T.isMacOSX() && !T.isMacOSXVersionLT(10, 6);
+ // FDE relocs may be used. We also use them for the iOS simulator.
+ DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
+ || T.isiOS();
UseIntegratedAssembler = true;
}
@@ -142,8 +143,11 @@ getNonexecutableStackSection(MCContext &Ctx) const {
void X86MCAsmInfoMicrosoft::anchor() { }
X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
- if (Triple.getArch() == Triple::x86_64)
+ if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
+ PointerSize = 8;
+ ExceptionsType = ExceptionHandling::WinEH;
+ }
AssemblerDialect = AsmWriterFlavor;
@@ -157,17 +161,18 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
void X86MCAsmInfoGNUCOFF::anchor() { }
X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+ assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
PointerSize = 8;
+ ExceptionsType = ExceptionHandling::WinEH;
+ } else {
+ ExceptionsType = ExceptionHandling::DwarfCFI;
}
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
- // Exceptions handling
- ExceptionsType = ExceptionHandling::DwarfCFI;
-
UseIntegratedAssembler = true;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index e63036c..5e29e5c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -197,14 +197,13 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
}
}
-unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
- Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::x86_64)
+unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) {
+ if (TT.getArch() == Triple::x86_64)
return DWARFFlavour::X86_64;
- if (TheTriple.isOSDarwin())
+ if (TT.isOSDarwin())
return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
- if (TheTriple.isOSCygMing())
+ if (TT.isOSCygMing())
// Unsupported for now; just a quick fallback.
return DWARFFlavour::X86_32_Generic;
return DWARFFlavour::X86_32_Generic;
@@ -251,8 +250,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitX86MCRegisterInfo(X, RA,
- X86_MC::getDwarfRegFlavour(TT, false),
- X86_MC::getDwarfRegFlavour(TT, true),
+ X86_MC::getDwarfRegFlavour(TheTriple, false),
+ X86_MC::getDwarfRegFlavour(TheTriple, true),
RA);
X86_MC::InitLLVM2SEHRegisterMapping(X);
return X;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 8fe40fd..ebe74cf 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -28,6 +28,7 @@ class MCSubtargetInfo;
class MCRelocationInfo;
class MCStreamer;
class Target;
+class Triple;
class StringRef;
class raw_ostream;
@@ -64,7 +65,7 @@ namespace X86_MC {
void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
- unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+ unsigned getDwarfRegFlavour(Triple TT, bool isEH);
void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index c62fd0a..7fa4180 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -19,12 +19,12 @@ public:
raw_ostream &OS)
: MCWinCOFFStreamer(C, AB, *CE, OS) { }
- void EmitWin64EHHandlerData() override;
+ void EmitWinEHHandlerData() override;
void FinishImpl() override;
};
-void X86WinCOFFStreamer::EmitWin64EHHandlerData() {
- MCStreamer::EmitWin64EHHandlerData();
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+ MCStreamer::EmitWinEHHandlerData();
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section!
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 64e8ea8..d5522ed 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -24,6 +24,10 @@ class ImmutablePass;
class JITCodeEmitter;
class X86TargetMachine;
+/// createX86AtomicExpandPass - This pass expands atomic operations that cannot
+/// be handled natively in terms of a loop using cmpxchg.
+FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM);
+
/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
///
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 6912b57..93f516a 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -168,6 +168,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+ "INC and DEC instructions are slower than ADD and SUB">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -228,7 +230,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM,
FeaturePCLMUL, FeatureAES,
FeatureCallRegIndirect,
FeaturePRFCHW,
- FeatureSlowLEA,
+ FeatureSlowLEA, FeatureSlowIncDec,
FeatureSlowBTMem, FeatureFastUAMem]>;
// "Arrandale" along with corei3 and corei5
def : ProcessorModel<"corei7", SandyBridgeModel,
@@ -271,7 +273,8 @@ def : ProcessorModel<"knl", HaswellModel,
FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+ FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+ FeatureSlowIncDec]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp
new file mode 100644
index 0000000..61eefbb
--- /dev/null
+++ b/lib/Target/X86/X86AtomicExpandPass.cpp
@@ -0,0 +1,287 @@
+//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions which
+// cannot be implemented as a single instruction with cmpxchg-based loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-atomic-expand"
+
+namespace {
+ class X86AtomicExpandPass : public FunctionPass {
+ const X86TargetMachine *TM;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit X86AtomicExpandPass(const X86TargetMachine *TM)
+ : FunctionPass(ID), TM(TM) {}
+
+ bool runOnFunction(Function &F) override;
+ bool expandAtomicInsts(Function &F);
+
+ bool needsCmpXchgNb(Type *MemType);
+
+ /// There are four kinds of atomic operations. Two never need expanding:
+ /// cmpxchg is what we expand the others *to*, and loads are easily handled
+ /// by ISelLowering. Atomicrmw and store can need expanding in some
+ /// circumstances.
+ bool shouldExpand(Instruction *Inst);
+
+ /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
+ /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
+ bool shouldExpandStore(StoreInst *SI);
+
+ /// Only some atomicrmw instructions need expanding -- some operations
+ /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
+ /// limited support but can't return the previous value; some (e.g. add)
+ /// have complete support in the instruction set.
+ ///
+ /// Also, naturally, 128-bit operations always need to be expanded.
+ bool shouldExpandAtomicRMW(AtomicRMWInst *AI);
+
+ bool expandAtomicRMW(AtomicRMWInst *AI);
+ bool expandAtomicStore(StoreInst *SI);
+ };
+}
+
+char X86AtomicExpandPass::ID = 0;
+
+FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) {
+ return new X86AtomicExpandPass(TM);
+}
+
+bool X86AtomicExpandPass::runOnFunction(Function &F) {
+ SmallVector<Instruction *, 1> AtomicInsts;
+
+ // Changing control-flow while iterating through it is a bad idea, so gather a
+ // list of all atomic instructions before we start.
+ for (BasicBlock &BB : F)
+ for (Instruction &Inst : BB) {
+ if (isa<AtomicRMWInst>(&Inst) ||
+ (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
+ AtomicInsts.push_back(&Inst);
+ }
+
+ bool MadeChange = false;
+ for (Instruction *Inst : AtomicInsts) {
+ if (!shouldExpand(Inst))
+ continue;
+
+ if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+ MadeChange |= expandAtomicRMW(AI);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ MadeChange |= expandAtomicStore(SI);
+
+ assert(MadeChange && "Atomic inst not expanded when it should be?");
+ Inst->eraseFromParent();
+ }
+
+ return MadeChange;
+}
+
+/// Returns true if operations on the given type will need to use either
+/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the
+/// native width, and the instructions are available (otherwise we leave them
+/// alone to become __sync_fetch_and_... calls).
+bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) {
+ const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+ if (!Subtarget.hasCmpxchg16b())
+ return false;
+
+ unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64;
+
+ unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+ if (OpWidth == CmpXchgNbWidth)
+ return true;
+
+ return false;
+}
+
+
+bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) {
+ const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+
+ if (needsCmpXchgNb(AI->getType()))
+ return true;
+
+ if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth)
+ return false;
+
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+ switch (Op) {
+ default:
+ llvm_unreachable("Unknown atomic operation");
+ case AtomicRMWInst::Xchg:
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ // It's better to use xadd, xsub or xchg for these in all cases.
+ return false;
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Xor:
+ // If the atomicrmw's result isn't actually used, we can just add a "lock"
+ // prefix to a normal instruction for these operations.
+ return !AI->use_empty();
+ case AtomicRMWInst::Nand:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // These always require a non-trivial set of data operations on x86. We must
+ // use a cmpxchg loop.
+ return true;
+ }
+}
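[Editorial aside: an example of the use_empty() case above; illustrative IR and assembly, mine:]

    // atomicrmw or i32* %p, i32 7 monotonic   ; result unused
    //   => lock orl $7, (%rdi)                ; no cmpxchg loop needed
    // If the old value is used, no single x86 instruction returns it for
    // or/and/xor, so the cmpxchg-loop expansion below is required.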
+
+bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) {
+ if (needsCmpXchgNb(SI->getValueOperand()->getType()))
+ return true;
+
+ return false;
+}
+
+bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) {
+ if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
+ return shouldExpandAtomicRMW(AI);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return shouldExpandStore(SI);
+ return false;
+}
+
+/// Emit IR to implement the given atomicrmw operation on values in registers,
+/// returning the new value.
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
+ Value *Loaded, Value *Inc) {
+ Value *NewVal;
+ switch (Op) {
+ case AtomicRMWInst::Xchg:
+ return Inc;
+ case AtomicRMWInst::Add:
+ return Builder.CreateAdd(Loaded, Inc, "new");
+ case AtomicRMWInst::Sub:
+ return Builder.CreateSub(Loaded, Inc, "new");
+ case AtomicRMWInst::And:
+ return Builder.CreateAnd(Loaded, Inc, "new");
+ case AtomicRMWInst::Nand:
+ return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
+ case AtomicRMWInst::Or:
+ return Builder.CreateOr(Loaded, Inc, "new");
+ case AtomicRMWInst::Xor:
+ return Builder.CreateXor(Loaded, Inc, "new");
+ case AtomicRMWInst::Max:
+ NewVal = Builder.CreateICmpSGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::Min:
+ NewVal = Builder.CreateICmpSLE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMax:
+ NewVal = Builder.CreateICmpUGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMin:
+ NewVal = Builder.CreateICmpULE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ default:
+ break;
+ }
+ llvm_unreachable("Unknown atomic op");
+}
+
+bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
+ AtomicOrdering Order =
+ AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
+ Value *Addr = AI->getPointerOperand();
+ BasicBlock *BB = AI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // %init_loaded = load atomic iN* %addr
+ // br label %loop
+ // loop:
+ // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+ // %new = some_op iN %loaded, %incr
+ // %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+ // %new_loaded = extractvalue { iN, i1 } %pair, 0
+ // %success = extractvalue { iN, i1 } %pair, 1
+ // br i1 %success, label %atomicrmw.end, label %loop
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // This grabs the DebugLoc from AI.
+ IRBuilder<> Builder(AI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we want a load. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ LoadInst *InitLoaded = Builder.CreateLoad(Addr);
+ // Note: setAlignment takes a byte count, so convert from the bit width.
+ InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+ Loaded->addIncoming(InitLoaded, BB);
+
+ Value *NewVal =
+ performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+ Value *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, Order,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+ Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+ Loaded->addIncoming(NewLoaded, LoopBB);
+
+ Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
+ Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ AI->replaceAllUsesWith(NewLoaded);
+
+ return true;
+}
+
+bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
+ // An atomic store might need cmpxchg16b (or cmpxchg8b on 32-bit x86) to
+ // execute. Express this in terms of the usual expansion to "atomicrmw xchg".
+ IRBuilder<> Builder(SI);
+ AtomicOrdering Order =
+ SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
+ AtomicRMWInst *AI =
+ Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+ SI->getValueOperand(), Order);
+
+ // Now we have an appropriate swap instruction, lower it as usual.
+ if (shouldExpandAtomicRMW(AI)) {
+ expandAtomicRMW(AI);
+ AI->eraseFromParent();
+ return true;
+ }
+
+ return true;
+}
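[Editorial aside: putting the two expansions together for a hypothetical 128-bit atomic store on x86-64:]

    // store atomic i128 %v, i128* %p unordered, align 16
    //   step 1 (this function):   atomicrmw xchg i128* %p, i128 %v monotonic
    //   step 2 (expandAtomicRMW): a cmpxchg loop, which ISel can lower to
    //                             lock cmpxchg16b.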
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 76718d0..a3ae7ee 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -1113,9 +1113,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::INLINEASM:
// We allow inline assembler nodes with empty bodies - they can
// implicitly define registers, which is ok for JIT.
- if (MI.getOperand(0).getSymbolName()[0])
+ if (MI.getOperand(0).getSymbolName()[0]) {
+ DebugLoc DL = MI.getDebugLoc();
+ DL.print(MI.getParent()->getParent()->getFunction()->getContext(),
+ llvm::errs());
report_fatal_error("JIT does not support inline asm!");
+ }
break;
+ case TargetOpcode::DBG_VALUE:
case TargetOpcode::CFI_INSTRUCTION:
break;
case TargetOpcode::GC_LABEL:
@@ -1126,6 +1131,16 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
break;
+
+ case X86::SEH_PushReg:
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_StackAlloc:
+ case X86::SEH_SetFrame:
+ case X86::SEH_PushFrame:
+ case X86::SEH_EndPrologue:
+ break;
+
case X86::MOVPC32r: {
// This emits the "call" portion of this pseudo instruction.
MCE.emitByte(BaseOpcode);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 56bcfa3..ce554ba 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -16,10 +16,12 @@
#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -78,12 +80,14 @@ public:
private:
bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
- bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+ bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
+ unsigned &ResultReg);
bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
- bool Aligned = false);
- bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM,
- bool Aligned = false);
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
+ bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO = nullptr, bool Aligned = false);
bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
unsigned &ResultReg);
@@ -107,6 +111,12 @@ private:
bool X86SelectDivRem(const Instruction *I);
+ bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+ bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
bool X86SelectSelect(const Instruction *I);
bool X86SelectTrunc(const Instruction *I);
@@ -147,10 +157,182 @@ private:
bool TryEmitSmallMemcpy(X86AddressMode DestAM,
X86AddressMode SrcAM, uint64_t Len);
+
+ bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond);
};
} // end anonymous namespace.
+static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) {
+ // If both operands are the same, then try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (CI->getOperand(0) != CI->getOperand(1))
+ return Predicate;
+
+ switch (Predicate) {
+ default: llvm_unreachable("Invalid predicate!");
+ case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break;
+
+ case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break;
+ }
+
+ return Predicate;
+}
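[Editorial aside: worked examples of the folding this helper performs, mine:]

    // icmp ult %x, %x  ->  FCMP_FALSE  (a value is never less than itself)
    // fcmp oeq %x, %x  ->  FCMP_ORD    (true exactly when %x is not NaN)
    // fcmp une %x, %x  ->  FCMP_UNO    (true exactly when %x is NaN)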
+
+static std::pair<X86::CondCode, bool>
+getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: // fall-through
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+ unsigned CC;
+ bool NeedSwap = false;
+
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
+ switch (Predicate) {
+ default: llvm_unreachable("Unexpected predicate");
+ case CmpInst::FCMP_OEQ: CC = 0; break;
+ case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLT: CC = 1; break;
+ case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLE: CC = 2; break;
+ case CmpInst::FCMP_UNO: CC = 3; break;
+ case CmpInst::FCMP_UNE: CC = 4; break;
+ case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGE: CC = 5; break;
+ case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGT: CC = 6; break;
+ case CmpInst::FCMP_ORD: CC = 7; break;
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_ONE: CC = 8; break; // Not a single SSE predicate.
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ X86::CondCode TmpCC;
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (II->getParent() != I->getParent())
+ return false;
+
+ // Make sure nothing is in the way
+ BasicBlock::const_iterator Start = I;
+ BasicBlock::const_iterator End = II;
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
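[Editorial aside: the shape of code this targets, sketched in IR; the names are illustrative:]

    // %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
    // %sum = extractvalue { i32, i1 } %res, 0
    // %ovf = extractvalue { i32, i1 } %res, 1
    // br i1 %ovf, label %trap, label %cont
    //
    // With CC folded to COND_O, the branch becomes a single jo after the add
    // instead of materializing %ovf with seto and testing it again.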
+
bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
@@ -180,7 +362,7 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
- unsigned &ResultReg) {
+ MachineMemOperand *MMO, unsigned &ResultReg) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
@@ -228,8 +410,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
}
ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc), ResultReg), AM);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+ addFullAddress(MIB, AM);
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
@@ -237,9 +422,9 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
/// and a displacement offset, or a GlobalAddress,
/// i.e. V. Return true if it is possible.
-bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
- const X86AddressMode &AM, bool Aligned) {
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
// Get opcode and regclass of the output for the given store instruction.
unsigned Opc = 0;
switch (VT.getSimpleVT().SimpleTy) {
@@ -249,7 +434,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
+ TII.get(X86::AND8ri), AndResult)
+ .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
ValReg = AndResult;
}
// FALLTHROUGH, handling i1 as i8.
@@ -288,13 +474,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
break;
}
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc)), AM).addReg(ValReg);
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
+
return true;
}
bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
- const X86AddressMode &AM, bool Aligned) {
+ const X86AddressMode &AM,
+ MachineMemOperand *MMO, bool Aligned) {
// Handle 'null' like i32/i64 0.
if (isa<ConstantPointerNull>(Val))
Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
@@ -317,10 +508,12 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
}
if (Opc) {
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
- DbgLoc, TII.get(Opc)), AM)
- .addImm(Signed ? (uint64_t) CI->getSExtValue() :
- CI->getZExtValue());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+ addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+ : CI->getZExtValue());
+ if (MMO)
+ MIB->addMemOperand(*FuncInfo.MF, MMO);
return true;
}
}
@@ -329,7 +522,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
if (ValReg == 0)
return false;
- return X86FastEmitStore(VT, ValReg, AM, Aligned);
+ bool ValKill = hasTrivialKill(Val);
+ return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
}
/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -355,17 +549,8 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
return false;
// Can't handle TLS yet.
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->isThreadLocal())
- return false;
-
- // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
- // it works...).
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- if (const GlobalVariable *GVar =
- dyn_cast_or_null<GlobalVariable>(GA->getAliasee()))
- if (GVar->isThreadLocal())
- return false;
+ if (GV->isThreadLocal())
+ return false;
// RIP-relative addresses can't have additional register operands, so if
// we've already folded stuff into the addressing mode, just force the
@@ -696,7 +881,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
- // Can't handle DbgLocLImport.
+ // Can't handle DLL Import.
if (GV->hasDLLImportStorageClass())
return false;
@@ -749,19 +934,24 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (S->isAtomic())
return false;
- unsigned SABIAlignment =
- DL.getABITypeAlignment(S->getValueOperand()->getType());
- bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
+ const Value *Val = S->getValueOperand();
+ const Value *Ptr = S->getPointerOperand();
MVT VT;
- if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+ if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
+ unsigned Alignment = S->getAlignment();
+ unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = ABIAlignment;
+ bool Aligned = Alignment >= ABIAlignment;
+
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(1), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
- return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned);
+ return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}
/// X86SelectRet - Select and emit code to implement ret instructions.
@@ -896,25 +1086,29 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
/// X86SelectLoad - Select and emit code to implement load instructions.
///
-bool X86FastISel::X86SelectLoad(const Instruction *I) {
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ const LoadInst *LI = cast<LoadInst>(I);
+
// Atomic loads need special handling.
- if (cast<LoadInst>(I)->isAtomic())
+ if (LI->isAtomic())
return false;
MVT VT;
- if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+ if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
return false;
+ const Value *Ptr = LI->getPointerOperand();
+
X86AddressMode AM;
- if (!X86SelectAddress(I->getOperand(0), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
unsigned ResultReg = 0;
- if (X86FastEmitLoad(VT, AM, ResultReg)) {
- UpdateValueMap(I, ResultReg);
- return true;
- }
- return false;
+ if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
@@ -994,73 +1188,89 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
- unsigned ResultReg = createResultReg(&X86::GR8RegClass);
- unsigned SetCCOpc;
- bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
- switch (CI->getPredicate()) {
- case CmpInst::FCMP_OEQ: {
- if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: {
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+ ResultReg);
+ ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
return false;
+ break;
+ }
+ case CmpInst::FCMP_TRUE: {
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+ ResultReg).addImm(1);
+ break;
+ }
+ }
- unsigned EReg = createResultReg(&X86::GR8RegClass);
- unsigned NPReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETEr), EReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::SETNPr), NPReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ if (ResultReg) {
UpdateValueMap(I, ResultReg);
return true;
}
- case CmpInst::FCMP_UNE: {
- if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+ if (RHSC && RHSC->isNullValue())
+ RHS = LHS;
+ }
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static unsigned SETFOpcTable[2][3] = {
+ { X86::SETEr, X86::SETNPr, X86::AND8rr },
+ { X86::SETNEr, X86::SETPr, X86::OR8rr }
+ };
+ unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+ case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+ }
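[Editorial aside: why two setcc instructions are needed here: after ucomiss/ucomisd, an equal compare sets ZF, but an unordered compare sets ZF, PF and CF together, so ZF alone cannot separate "equal" from "unordered". Hence, illustratively:]

    // fcmp oeq:  sete %al;  setnp %cl;  andb %cl, %al   # ZF && !PF
    // fcmp une:  setne %al; setp %cl;   orb %cl, %al    # !ZF || PF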
+
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ if (SETFOpc) {
+ if (!X86FastEmitCompare(LHS, RHS, VT))
return false;
- unsigned NEReg = createResultReg(&X86::GR8RegClass);
- unsigned PReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETNEr), NEReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETPr), PReg);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::OR8rr),ResultReg)
- .addReg(PReg).addReg(NEReg);
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+ ResultReg).addReg(FlagReg1).addReg(FlagReg2);
UpdateValueMap(I, ResultReg);
return true;
}
- case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
- case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
- case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
- case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
- case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
- case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
- case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
- case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
- case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
- case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
- case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
- case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
-
- case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
- case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
- case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
- case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
- case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
- case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
- case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
- case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
- case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
- case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
- default:
- return false;
- }
- const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+ unsigned Opc = X86::getSETFromCond(CC);
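+ // e.g. (sketch): FCMP_OLT maps to {X86::COND_A, /*SwapArgs=*/true}, since
+ // the FP compare only sets the unsigned-style flags (CF/ZF/PF).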
+
if (SwapArgs)
- std::swap(Op0, Op1);
+ std::swap(LHS, RHS);
- // Emit a compare of Op0/Op1.
- if (!X86FastEmitCompare(Op0, Op1, VT))
+ // Emit a compare of LHS/RHS.
+ if (!X86FastEmitCompare(LHS, RHS, VT))
return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SetCCOpc), ResultReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1126,73 +1336,88 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Fold the common case of a conditional branch with a comparison
// in the same block (values defined on other blocks may not have
// initialized registers).
+ X86::CondCode CC;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true;
+ case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true;
+ }
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+ // 0.0. We don't have to materialize a zero constant for this case and
+ // can just use %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
+ }
+
// Try to take advantage of fallthrough opportunities.
- CmpInst::Predicate Predicate = CI->getPredicate();
if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
std::swap(TrueMBB, FalseMBB);
Predicate = CmpInst::getInversePredicate(Predicate);
}
- bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
- unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
-
+ // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+ // code check. Instead two branch instructions are required to check all
+ // the flags. First we change the predicate to a supported condition code,
+ // which will be used for the first branch. Later on we will emit the
+ // second branch.
+ bool NeedExtraBranch = false;
switch (Predicate) {
+ default: break;
case CmpInst::FCMP_OEQ:
- std::swap(TrueMBB, FalseMBB);
- Predicate = CmpInst::FCMP_UNE;
- // FALL THROUGH
- case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
- case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
- case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break;
- case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break;
- case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
- case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break;
- case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
- case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break;
- case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break;
- case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
- case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
-
- case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
- case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
- case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
- case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
- case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
- case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
- case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break;
- case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
- case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break;
- case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
- default:
- return false;
+ std::swap(TrueMBB, FalseMBB); // fall-through
+ case CmpInst::FCMP_UNE:
+ NeedExtraBranch = true;
+ Predicate = CmpInst::FCMP_ONE;
+ break;
}
- const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ bool SwapArgs;
+ unsigned BranchOpc;
+ std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ BranchOpc = X86::GetCondBranchFromCond(CC);
if (SwapArgs)
- std::swap(Op0, Op1);
+ std::swap(CmpLHS, CmpRHS);
// Emit a compare of the LHS and RHS, setting the flags.
- if (!X86FastEmitCompare(Op0, Op1, VT))
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
.addMBB(TrueMBB);
- if (Predicate == CmpInst::FCMP_UNE) {
- // X86 requires a second branch to handle UNE (and OEQ,
- // which is mapped to UNE above).
+ // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+ // to UNE above).
+ if (NeedExtraBranch) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4))
.addMBB(TrueMBB);
}
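+ // Sketch for FCMP_UNE (and FCMP_OEQ, after the swap above):
+ //   jne TrueMBB    ; not equal
+ //   jp  TrueMBB    ; unordered also counts as "not equal"
+ //   jmp FalseMBB   ; emitted by FastEmitBranch below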
+ // Obtain the branch weight and add the TrueMBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+
+ // Emit an unconditional branch to the FalseMBB; FastEmitBranch obtains
+ // the branch weight and adds it to the successor list.
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
@@ -1224,10 +1449,32 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
.addMBB(TrueMBB);
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
return true;
}
}
+ } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(BI->getCondition());
+ if (TmpReg == 0)
+ return false;
+
+ unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB, DbgLoc);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
+ return true;
}
// Otherwise do a clumsy setcc and re-test it.
@@ -1241,7 +1488,11 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
.addMBB(TrueMBB);
FastEmitBranch(FalseMBB, DbgLoc);
- FuncInfo.MBB->addSuccessor(TrueMBB);
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
return true;
}
@@ -1478,50 +1729,319 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
return true;
}
-bool X86FastISel::X86SelectSelect(const Instruction *I) {
- MVT VT;
- if (!isTypeLegal(I->getType(), VT))
+/// \brief Emit a conditional move instruction (if the subtarget supports one)
+/// to lower the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+ // Check if the subtarget supports these instructions.
+ if (!Subtarget->hasCMov())
return false;
- // We only use cmov here, if we don't have a cmov instruction bail.
- if (!Subtarget->hasCMov()) return false;
+ // FIXME: Add support for i8.
+ if (RetVT < MVT::i16 || RetVT > MVT::i64)
+ return false;
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- if (VT == MVT::i16) {
- Opc = X86::CMOVE16rr;
- RC = &X86::GR16RegClass;
- } else if (VT == MVT::i32) {
- Opc = X86::CMOVE32rr;
- RC = &X86::GR32RegClass;
- } else if (VT == MVT::i64) {
- Opc = X86::CMOVE64rr;
- RC = &X86::GR64RegClass;
- } else {
+ const Value *Cond = I->getOperand(0);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ bool NeedTest = true;
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+ static unsigned SETFOpcTable[2][3] = {
+ { X86::SETNPr, X86::SETEr , X86::TEST8rr },
+ { X86::SETPr, X86::SETNEr, X86::OR8rr }
+ };
+ unsigned *SETFOpc = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_OEQ:
+ SETFOpc = &SETFOpcTable[0][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ case CmpInst::FCMP_UNE:
+ SETFOpc = &SETFOpcTable[1][0];
+ Predicate = CmpInst::ICMP_NE;
+ break;
+ }
+
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+ return false;
+
+ if (SETFOpc) {
+ unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+ unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+ FlagReg1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+ FlagReg2);
+ auto const &II = TII.get(SETFOpc[2]);
+ if (II.getNumDefs()) {
+ unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ } else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(FlagReg2).addReg(FlagReg1);
+ }
+ }
+ NeedTest = false;
+ } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned TmpReg = getRegForValue(Cond);
+ if (TmpReg == 0)
+ return false;
+
+ NeedTest = false;
+ }
+
+ if (NeedTest) {
+ // Selects operate on i1, however, CondReg is 8 bits wide and may contain
+ // garbage. Indeed, only the least significant bit is supposed to be
+ // accurate. If we read more than the lsb, we may see non-zero values
+ // where the lsb is zero. Therefore, we have to truncate CondReg to i1 for
+ // the select. This is achieved by performing TEST against 1.
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+ unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+ LHSReg, LHSIsKill);
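+ // Illustrative: for "select i1 %c, i64 %a, i64 %b" this boils down to
+ //   testb   $1, %cond_reg    ; unless the flags are already set above
+ //   cmovneq %a_reg, %b_reg   ; result = %c ? %a : %b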
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+/// \brief Emit SSE instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available.
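+///
+/// Conceptually (sketch): Result = (Mask & LHS) | (~Mask & RHS), where Mask
+/// is all-ones if the comparison is true and all-zeros otherwise.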
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+ if (!CI || (CI->getParent() != I->getParent()))
return false;
+
+ if (I->getType() != CI->getOperand(0)->getType() ||
+ !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+ (Subtarget->hasSSE2() && RetVT == MVT::f64)))
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+ // We don't have to materialize a zero constant for this case and can just use
+ // %x again on the RHS.
+ if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+ const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+ if (CmpRHSC && CmpRHSC->isNullValue())
+ CmpRHS = CmpLHS;
}
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
- if (Op0Reg == 0) return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
- if (Op1Reg == 0) return false;
- unsigned Op2Reg = getRegForValue(I->getOperand(2));
- if (Op2Reg == 0) return false;
-
- // Selects operate on i1, however, Op0Reg is 8 bits width and may contain
- // garbage. Indeed, only the less significant bit is supposed to be accurate.
- // If we read more than the lsb, we may see non-zero values whereas lsb
- // is zero. Therefore, we have to truncate Op0Reg to i1 for the select.
- // This is achieved by performing TEST against 1.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(Op0Reg).addImm(1);
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Op1Reg).addReg(Op2Reg);
+ unsigned CC;
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+ if (CC > 7)
+ return false;
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ static unsigned OpcTable[2][2][4] = {
+ { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
+ { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
+ { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
+ { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
+ };
+
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned *Opc = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
+ case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned CmpLHSReg = getRegForValue(CmpLHS);
+ bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+ unsigned CmpRHSReg = getRegForValue(CmpRHS);
+ bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+ if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+ unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+ LHSReg, LHSIsKill);
+ unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+ RHSReg, RHSIsKill);
+ unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+ AndReg, /*IsKill=*/true);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+ // These are pseudo CMOV instructions and will be later expanded into
+ // control-flow.
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default: return false;
+ case MVT::i8: Opc = X86::CMOV_GR8; break;
+ case MVT::i16: Opc = X86::CMOV_GR16; break;
+ case MVT::i32: Opc = X86::CMOV_GR32; break;
+ case MVT::f32: Opc = X86::CMOV_FR32; break;
+ case MVT::f64: Opc = X86::CMOV_FR64; break;
+ }
+
+ const Value *Cond = I->getOperand(0);
+ X86::CondCode CC = X86::COND_NE;
+
+ // Optimize conditions coming from a compare if both instructions are in the
+ // same basic block (values defined in other basic blocks may not have
+ // initialized registers).
+ const auto *CI = dyn_cast<CmpInst>(Cond);
+ if (CI && (CI->getParent() == I->getParent())) {
+ bool NeedSwap;
+ std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ if (CC > X86::LAST_VALID_COND)
+ return false;
+
+ const Value *CmpLHS = CI->getOperand(0);
+ const Value *CmpRHS = CI->getOperand(1);
+
+ if (NeedSwap)
+ std::swap(CmpLHS, CmpRHS);
+
+ EVT CmpVT = TLI.getValueType(CmpLHS->getType());
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
+ return false;
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (CondReg == 0)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+ .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ }
+
+ const Value *LHS = I->getOperand(1);
+ const Value *RHS = I->getOperand(2);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (!LHSReg || !RHSReg)
+ return false;
+
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+ unsigned ResultReg =
+ FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
UpdateValueMap(I, ResultReg);
return true;
}
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ // Check if we can fold the select.
+ if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ const Value *Opnd = nullptr;
+ switch (Predicate) {
+ default: break;
+ case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+ case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
+ }
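+ // e.g. (sketch): "select i1 (fcmp false ...), %a, %b" is simply %b, and
+ // the FCMP_TRUE case is simply %a.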
+ // No need for a select anymore - this is an unconditional move.
+ if (Opnd) {
+ unsigned OpReg = getRegForValue(Opnd);
+ if (OpReg == 0)
+ return false;
+ bool OpIsKill = hasTrivialKill(Opnd);
+ const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(OpReg, getKillRegState(OpIsKill));
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ // First try to use real conditional move instructions.
+ if (X86FastEmitCMoveSelect(RetVT, I))
+ return true;
+
+ // Try to use a sequence of SSE instructions to simulate a conditional move.
+ if (X86FastEmitSSESelect(RetVT, I))
+ return true;
+
+ // Fall-back to pseudo conditional move instructions, which will be later
+ // converted to control-flow.
+ if (X86FastEmitPseudoSelect(RetVT, I))
+ return true;
+
+ return false;
+}
+
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
// fpext from float to double.
if (X86ScalarSSEf64 &&
@@ -1633,8 +2153,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
}
unsigned Reg;
- bool RV = X86FastEmitLoad(VT, SrcAM, Reg);
- RV &= X86FastEmitStore(VT, Reg, DestAM);
+ bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+ RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
@@ -1646,10 +2166,74 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
return true;
}
+static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
+ switch (I.getIntrinsicID()) {
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
// FIXME: Handle more intrinsics.
switch (I.getIntrinsicID()) {
default: return false;
+ case Intrinsic::frameaddress: {
+ Type *RetTy = I.getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Opc;
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Invalid result type for frameaddress.");
+ case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+ case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+ }
+
+ // This needs to be set before we call getFrameRegister, otherwise we get
+ // the wrong frame register.
+ MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+ unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+ (FrameReg == X86::EBP && VT == MVT::i32)) &&
+ "Invalid Frame Register!");
+
+ // Always make a copy of the frame register to a vreg first, so that we
+ // never directly reference the frame register (the
+ // TwoAddressInstructionPass doesn't like that).
+ unsigned SrcReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+ // Now repeatedly load from the frame address to walk up the frame chain:
+ // movq (%rbp), %rax
+ // movq (%rax), %rax
+ // movq (%rax), %rax
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = createResultReg(RC);
+ addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), DestReg), SrcReg);
+ SrcReg = DestReg;
+ }
+
+ UpdateValueMap(&I, SrcReg);
+ return true;
+ }
case Intrinsic::memcpy: {
const MemCpyInst &MCI = cast<MemCpyInst>(I);
// Don't handle volatile or variable length memcpys.
@@ -1726,52 +2310,233 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
return true;
}
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::uadd_with_overflow: {
- // FIXME: Should fold immediates.
+ case Intrinsic::sqrt: {
+ if (!Subtarget->hasSSE1())
+ return false;
- // Replace "add with overflow" intrinsics with an "add" instruction followed
- // by a seto/setc instruction.
- const Function *Callee = I.getCalledFunction();
- Type *RetTy =
- cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+ Type *RetTy = I.getCalledFunction()->getReturnType();
MVT VT;
if (!isTypeLegal(RetTy, VT))
return false;
- const Value *Op1 = I.getArgOperand(0);
- const Value *Op2 = I.getArgOperand(1);
- unsigned Reg1 = getRegForValue(Op1);
- unsigned Reg2 = getRegForValue(Op2);
+ // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT
+ // is not generated by FastISel yet.
+ // FIXME: Update this code once tablegen can handle it.
+ static const unsigned SqrtOpc[2][2] = {
+ {X86::SQRTSSr, X86::VSQRTSSr},
+ {X86::SQRTSDr, X86::VSQRTSDr}
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
+ default: return false;
+ case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+ case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+ }
+
+ const Value *SrcVal = I.getArgOperand(0);
+ unsigned SrcReg = getRegForValue(SrcVal);
- if (Reg1 == 0 || Reg2 == 0)
- // FIXME: Handle values *not* in registers.
+ if (SrcReg == 0)
return false;
- unsigned OpC = 0;
- if (VT == MVT::i32)
- OpC = X86::ADD32rr;
- else if (VT == MVT::i64)
- OpC = X86::ADD64rr;
- else
+ unsigned ImplicitDefReg = 0;
+ if (HasAVX) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+ }
+
+ unsigned ResultReg = createResultReg(RC);
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+ ResultReg);
+
+ if (ImplicitDefReg)
+ MIB.addReg(ImplicitDefReg);
+
+ MIB.addReg(SrcReg);
+
+ UpdateValueMap(&I, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ // This implements the basic lowering of the xalu with overflow intrinsics
+ // into add/sub/mul followed by either seto or setb.
+ const Function *Callee = I.getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+ Type *CondTy = Ty->getTypeAtIndex(1);
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT < MVT::i8 || VT > MVT::i64)
+ return false;
+
+ const Value *LHS = I.getArgOperand(0);
+ const Value *RHS = I.getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(I))
+ std::swap(LHS, RHS);
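+ // e.g. (sketch): uadd.with.overflow(i32 4, %x) becomes
+ // uadd.with.overflow(i32 %x, 4), so the immediate form below can be used.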
+
+ unsigned BaseOpc, CondOpc;
+ switch (I.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+ case Intrinsic::uadd_with_overflow:
+ BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+ case Intrinsic::ssub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+ case Intrinsic::usub_with_overflow:
+ BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+ case Intrinsic::smul_with_overflow:
+ BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+ case Intrinsic::umul_with_overflow:
+ BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+ }
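+ // Sketch for @llvm.uadd.with.overflow.i32(%a, %b): roughly
+ //   addl %rhs_reg, %lhs_reg   ; the result value
+ //   setb %cl                  ; carry flag = unsigned overflow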
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (LHSReg == 0)
return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
- // The call to CreateRegs builds two sequential registers, to store the
- // both the returned values.
- unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpC), ResultReg)
- .addReg(Reg1).addReg(Reg2);
+ unsigned ResultReg = 0;
+ // Check if we have an immediate version.
+ if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
+ ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+ C->getZExtValue());
+ }
- unsigned Opc = X86::SETBr;
- if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
- Opc = X86::SETOr;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
- ResultReg + 1);
+ unsigned RHSReg;
+ bool RHSIsKill;
+ if (!ResultReg) {
+ RHSReg = getRegForValue(RHS);
+ if (RHSReg == 0)
+ return false;
+ RHSIsKill = hasTrivialKill(RHS);
+ ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill);
+ }
+
+ // FastISel doesn't have a pattern for every X86::MUL*r and X86::IMUL*r, so
+ // emit them manually.
+ if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+ static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+ // First copy the first operand into RAX, which is an implicit input to
+ // the X86::MUL*r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+ } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+ static const unsigned MULOpc[] =
+ { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+ if (VT == MVT::i8) {
+ // Copy the first operand into AL, which is an implicit input to the
+ // X86::IMUL8r instruction.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), X86::AL)
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+ RHSIsKill);
+ } else
+ ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+ TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ }
+
+ if (!ResultReg)
+ return false;
+
+ unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+ assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+ ResultReg2);
UpdateValueMap(&I, ResultReg, 2);
return true;
}
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64: {
+ bool IsInputDouble;
+ switch (I.getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ IsInputDouble = false;
+ break;
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ if (!Subtarget->hasSSE2())
+ return false;
+ IsInputDouble = true;
+ break;
+ }
+
+ Type *RetTy = I.getCalledFunction()->getReturnType();
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ static const unsigned CvtOpc[2][2][2] = {
+ { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
+ { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
+ { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
+ { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
+ };
+ bool HasAVX = Subtarget->hasAVX();
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected result type.");
+ case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+ case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+ }
+
+ // Check if we can fold insertelement instructions into the convert.
+ const Value *Op = I.getArgOperand(0);
+ while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+ const Value *Index = IE->getOperand(2);
+ if (!isa<ConstantInt>(Index))
+ break;
+ unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+ if (Idx == 0) {
+ Op = IE->getOperand(1);
+ break;
+ }
+ Op = IE->getOperand(0);
+ }
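+ // e.g. (sketch): for "%v = insertelement <4 x float> undef, float %x, i32 0"
+ // fed into cvttss2si, we convert %x directly and skip building the vector.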
+
+ unsigned Reg = getRegForValue(Op);
+ if (Reg == 0)
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(Reg);
+
+ UpdateValueMap(&I, ResultReg);
+ return true;
+ }
}
}
@@ -1794,31 +2559,43 @@ bool X86FastISel::FastLowerArguments() {
return false;
// Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
- unsigned Idx = 1;
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- if (Idx > 6)
- return false;
-
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
F->getAttributes().hasAttribute(Idx, Attribute::Nest))
return false;
- Type *ArgTy = I->getType();
+ Type *ArgTy = Arg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
EVT ArgVT = TLI.getValueType(ArgTy);
if (!ArgVT.isSimple()) return false;
switch (ArgVT.getSimpleVT().SimpleTy) {
+ default: return false;
case MVT::i32:
case MVT::i64:
+ ++GPRCnt;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (!Subtarget->hasSSE1())
+ return false;
+ ++FPRCnt;
break;
- default:
- return false;
}
+
+ if (GPRCnt > 6)
+ return false;
+
+ if (FPRCnt > 8)
+ return false;
}
static const MCPhysReg GPR32ArgRegs[] = {
@@ -1827,24 +2604,33 @@ bool X86FastISel::FastLowerArguments() {
static const MCPhysReg GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
};
+ static const MCPhysReg XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
- Idx = 0;
- const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
- const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
- const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
- unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(Arg.getType());
+ const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+ unsigned SrcReg;
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+ case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+ case MVT::f32: // fall-through
+ case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+ }
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY),
- ResultReg).addReg(DstReg, getKillRegState(true));
- UpdateValueMap(I, ResultReg);
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ UpdateValueMap(&Arg, ResultReg);
}
return true;
}
@@ -2147,7 +2933,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (!X86FastEmitStore(ArgVT, ArgVal, AM))
return false;
} else {
- if (!X86FastEmitStore(ArgVT, Arg, AM))
+ if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM))
return false;
}
}
@@ -2430,7 +3216,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
return 0;
}
- // Materialize addresses with LEA instructions.
+ // Materialize addresses with LEA/MOV instructions.
if (isa<GlobalValue>(C)) {
X86AddressMode AM;
if (X86SelectAddress(C, AM)) {
@@ -2440,10 +3226,19 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
- Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ if (TM.getRelocationModel() == Reloc::Static &&
+ TLI.getPointerTy() == MVT::i64) {
+ // The displacement may be more than 32 bits away, so we need to use
+ // an instruction with a 64-bit immediate.
+ Opc = X86::MOV64ri;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C));
+ } else {
+ Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+ addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
+ }
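+ // e.g. (sketch): with Reloc::Static on x86-64 this emits
+ //   movabsq $sym, %reg   ; 64-bit absolute immediate
+ // instead of an LEA of the symbol address.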
return ResultReg;
}
return 0;
@@ -2544,8 +3339,9 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
+ const Value *Ptr = LI->getPointerOperand();
X86AddressMode AM;
- if (!X86SelectAddress(LI->getOperand(0), AM))
+ if (!X86SelectAddress(Ptr, AM))
return false;
const X86InstrInfo &XII = (const X86InstrInfo&)TII;
@@ -2553,13 +3349,18 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
unsigned Size = DL.getTypeAllocSize(LI->getType());
unsigned Alignment = LI->getAlignment();
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = DL.getABITypeAlignment(LI->getType());
+
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result =
XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
- if (!Result) return false;
+ if (!Result)
+ return false;
+ Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
MI->eraseFromParent();
return true;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 6c5b86f..4be766a 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -32,86 +32,89 @@ using namespace llvm;
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
- class FixupLEAPass : public MachineFunctionPass {
- enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
- /// \brief Loop over all of the instructions in the basic block
- /// replacing applicable instructions with LEA instructions,
- /// where appropriate.
- bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+class FixupLEAPass : public MachineFunctionPass {
+ enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+ static char ID;
+ /// \brief Loop over all of the instructions in the basic block
+ /// replacing applicable instructions with LEA instructions,
+ /// where appropriate.
+ bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- const char *getPassName() const override { return "X86 Atom LEA Fixup";}
+ const char *getPassName() const override { return "X86 Atom LEA Fixup"; }
- /// \brief Given a machine register, look for the instruction
- /// which writes it in the current basic block. If found,
- /// try to replace it with an equivalent LEA instruction.
- /// If replacement succeeds, then also process the the newly created
- /// instruction.
- void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a machine register, look for the instruction
+ /// which writes it in the current basic block. If found,
+ /// try to replace it with an equivalent LEA instruction.
+ /// If replacement succeeds, then also process the newly created
+ /// instruction.
+ void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Given a memory access or LEA instruction
- /// whose address mode uses a base and/or index register, look for
- /// an opportunity to replace the instruction which sets the base or index
- /// register with an equivalent LEA instruction.
- void processInstruction(MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a memory access or LEA instruction
+ /// whose address mode uses a base and/or index register, look for
+ /// an opportunity to replace the instruction which sets the base or index
+ /// register with an equivalent LEA instruction.
+ void processInstruction(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Given a LEA instruction which is unprofitable
- /// on Silvermont try to replace it with an equivalent ADD instruction
- void processInstructionForSLM(MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on Silvermont try to replace it with an equivalent ADD instruction
+ void processInstructionForSLM(MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief Determine if an instruction references a machine register
- /// and, if so, whether it reads or writes the register.
- RegUsageState usesRegister(MachineOperand& p,
- MachineBasicBlock::iterator I);
+ /// \brief Determine if an instruction references a machine register
+ /// and, if so, whether it reads or writes the register.
+ RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
- /// \brief Step backwards through a basic block, looking
- /// for an instruction which writes a register within
- /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
- MachineBasicBlock::iterator searchBackwards(MachineOperand& p,
- MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI);
+ /// \brief Step backwards through a basic block, looking
+ /// for an instruction which writes a register within
+ /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+ MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI);
- /// \brief if an instruction can be converted to an
- /// equivalent LEA, insert the new instruction into the basic block
- /// and return a pointer to it. Otherwise, return zero.
- MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI,
- MachineBasicBlock::iterator &MBBI) const;
+ /// \brief if an instruction can be converted to an
+ /// equivalent LEA, insert the new instruction into the basic block
+ /// and return a pointer to it. Otherwise, return zero.
+ MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI) const;
- public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+public:
+ FixupLEAPass() : MachineFunctionPass(ID) {}
- /// \brief Loop over all of the basic blocks,
- /// replacing instructions by equivalent LEA instructions
- /// if needed and when possible.
- bool runOnMachineFunction(MachineFunction &MF) override;
+ /// \brief Loop over all of the basic blocks,
+ /// replacing instructions by equivalent LEA instructions
+ /// if needed and when possible.
+ bool runOnMachineFunction(MachineFunction &MF) override;
- private:
- MachineFunction *MF;
- const TargetMachine *TM;
- const X86InstrInfo *TII; // Machine instruction info.
-
- };
- char FixupLEAPass::ID = 0;
+private:
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const X86InstrInfo *TII; // Machine instruction info.
+};
+char FixupLEAPass::ID = 0;
}
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
- MachineInstr* MI = MBBI;
- MachineInstr* NewMI;
+ MachineInstr *MI = MBBI;
+ MachineInstr *NewMI;
switch (MI->getOpcode()) {
case X86::MOV32rr:
case X86::MOV64rr: {
- const MachineOperand& Src = MI->getOperand(1);
- const MachineOperand& Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
+ const MachineOperand &Dest = MI->getOperand(0);
NewMI = BuildMI(*MF, MI->getDebugLoc(),
- TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0);
- MFI->insert(MBBI, NewMI); // Insert the new inst
+ TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r
+ : X86::LEA64r))
+ .addOperand(Dest)
+ .addOperand(Src)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0);
+ MFI->insert(MBBI, NewMI); // Insert the new inst
return NewMI;
}
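+ // e.g. (illustrative): "movq %rsi, %rdi" becomes "leaq (%rsi), %rdi",
+ // which Atom can issue on its address-generation unit.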
case X86::ADD64ri32:
@@ -144,17 +147,16 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
return TII->convertToThreeAddress(MFI, MBBI, nullptr);
}
-FunctionPass *llvm::createX86FixupLEAs() {
- return new FixupLEAPass();
-}
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
TM = &Func.getTarget();
const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
if (!ST.LEAusesAG() && !ST.slowLEA())
return false;
- TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo());
+ TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo());
DEBUG(dbgs() << "Start X86FixupLEAs\n";);
// Process all basic blocks.
@@ -165,14 +167,14 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
return true;
}
-FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
- MachineBasicBlock::iterator I) {
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
RegUsageState RegUsage = RU_NotUsed;
- MachineInstr* MI = I;
+ MachineInstr *MI = I;
for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
- MachineOperand& opnd = MI->getOperand(i);
- if (opnd.isReg() && opnd.getReg() == p.getReg()){
+ MachineOperand &opnd = MI->getOperand(i);
+ if (opnd.isReg() && opnd.getReg() == p.getReg()) {
if (opnd.isDef())
return RU_Write;
RegUsage = RU_Read;
@@ -185,23 +187,22 @@ FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
/// block, return a reference to the previous instruction in the block,
/// wrapping around to the last instruction of the block if the block
/// branches to itself.
-static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
if (I == MFI->begin()) {
if (MFI->isPredecessor(MFI)) {
I = --MFI->end();
return true;
- }
- else
+ } else
return false;
}
--I;
return true;
}
-MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
- MachineBasicBlock::iterator& I,
- MachineFunction::iterator MFI) {
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+ MachineFunction::iterator MFI) {
int InstrDistance = 1;
MachineBasicBlock::iterator CurInst;
static const int INSTR_DISTANCE_THRESHOLD = 5;
@@ -209,12 +210,12 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
CurInst = I;
bool Found;
Found = getPreviousInstr(CurInst, MFI);
- while( Found && I != CurInst) {
+ while (Found && I != CurInst) {
if (CurInst->isCall() || CurInst->isInlineAsm())
break;
if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
break; // too far back to make a difference
- if (usesRegister(p, CurInst) == RU_Write){
+ if (usesRegister(p, CurInst) == RU_Write) {
return CurInst;
}
InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
@@ -223,32 +224,32 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
return nullptr;
}
-void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
// Process a load, store, or LEA instruction.
MachineInstr *MI = I;
int opcode = MI->getOpcode();
- const MCInstrDesc& Desc = MI->getDesc();
+ const MCInstrDesc &Desc = MI->getDesc();
int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
if (AddrOffset >= 0) {
AddrOffset += X86II::getOperandBias(Desc);
- MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
+ MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
if (p.isReg() && p.getReg() != X86::ESP) {
seekLEAFixup(p, I, MFI);
}
- MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
if (q.isReg() && q.getReg() != X86::ESP) {
seekLEAFixup(q, I, MFI);
}
}
}
-void FixupLEAPass::seekLEAFixup(MachineOperand& p,
- MachineBasicBlock::iterator& I,
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+ MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
if (MBI) {
- MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
+ MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
if (NewMI) {
++NumLEAs;
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
@@ -256,7 +257,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
MFI->erase(MBI);
MachineBasicBlock::iterator J =
- static_cast<MachineBasicBlock::iterator> (NewMI);
+ static_cast<MachineBasicBlock::iterator>(NewMI);
processInstruction(J, MFI);
}
}
@@ -299,7 +300,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
}
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
- MachineInstr *NewMI = 0;
+ MachineInstr *NewMI = nullptr;
const MachineOperand &Dst = MI->getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 4c1374f..8c029a8 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -45,7 +46,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineModuleInfo &MMI = MF.getMMI();
- const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
RegInfo->needsStackRealignment(MF) ||
@@ -305,65 +306,25 @@ static bool isEAXLiveIn(MachineFunction &MF) {
return false;
}
-void X86FrameLowering::emitCalleeSavedFrameMoves(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned FramePtr) const {
+void
+X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
if (CSI.empty()) return;
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- bool HasFP = hasFP(MF);
-
- // Calculate amount of bytes used for return address storing.
- int stackGrowth = -RegInfo->getSlotSize();
-
- // FIXME: This is dirty hack. The code itself is pretty mess right now.
- // It should be rewritten from scratch and generalized sometimes.
-
- // Determine maximum offset (minimum due to stack growth).
- int64_t MaxOffset = 0;
- for (std::vector<CalleeSavedInfo>::const_iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I)
- MaxOffset = std::min(MaxOffset,
- MFI->getObjectOffset(I->getFrameIdx()));
-
// Calculate offsets.
- int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
for (std::vector<CalleeSavedInfo>::const_iterator
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
- Offset = MaxOffset - Offset + saveAreaOffset;
-
- // Don't output a new machine move if we're re-saving the frame
- // pointer. This happens when the PrologEpilogInserter has inserted an extra
- // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
- // generates one when frame pointers are used. If we generate a "machine
- // move" for this extra "PUSH", the linker will lose track of the fact that
- // the frame pointer should have the value of the first "PUSH" when it's
- // trying to unwind.
- //
- // FIXME: This looks inelegant. It's possibly correct, but it's covering up
- // another bug. I.e., one where we generate a prolog like this:
- //
- // pushl %ebp
- // movl %esp, %ebp
- // pushl %ebp
- // pushl %esi
- // ...
- //
- // The immediate re-push of EBP is unnecessary. At the least, it's an
- // optimization bug. EBP can be used as a scratch register in certain
- // cases, but probably not when we have a frame pointer.
- if (HasFP && FramePtr == Reg)
- continue;
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex =
@@ -395,23 +356,107 @@ static bool usesTheStack(const MachineFunction &MF) {
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
/// generate the exception handling frames.
+
+/*
+ Here's a gist of what gets emitted:
+
+ ; Establish frame pointer, if needed
+ [if needs FP]
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ .seh_pushreg %rbp
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
+ ; Spill general-purpose registers
+ [for all callee-saved GPRs]
+ pushq %<reg>
+ [if not needs FP]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ .seh_pushreg %<reg>
+
+ ; If the required stack alignment > default stack alignment
+ ; rsp needs to be re-aligned. This creates a "re-alignment gap"
+ ; of unknown size in the stack frame.
+ [if stack needs re-alignment]
+ and $MASK, %rsp
+
+ ; Allocate space for locals
+ [if target is Windows and allocated space > 4096 bytes]
+ ; Windows needs special care for allocations larger
+ ; than one page.
+ mov $NNN, %rax
+ call ___chkstk_ms/___chkstk
+ sub %rax, %rsp
+ [else]
+ sub $NNN, %rsp
+
+ [if needs FP]
+ .seh_stackalloc (size of XMM spill slots)
+ .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+ [else]
+ .seh_stackalloc NNN
+
+ ; Spill XMMs
+ ; Note that, while only the Windows 64 ABI specifies XMMs as callee-preserved,
+ ; they may get spilled on any platform if the current function
+ ; calls @llvm.eh.unwind.init
+ [if needs FP]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, -MMM(%rbp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+ ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+ [else]
+ [for all callee-saved XMM registers]
+ movaps %<xmm reg>, KKK(%rsp)
+ [for all callee-saved XMM registers]
+ .seh_savexmm %<xmm reg>, KKK
+
+ .seh_endprologue
+
+ [if needs base pointer]
+ mov %rsp, %rbx
+
+ ; Emit CFI info
+ [if needs FP]
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rbp)
+ [else]
+ .cfi_def_cfa_offset (offset from RETADDR)
+ [for all callee-saved registers]
+ .cfi_offset %<reg>, (offset from %rsp)
+
+ Notes:
+ - .seh directives are emitted only for Windows 64 ABI
+ - .cfi directives are emitted for all other ABIs
+ - for 32-bit code, substitute %e?? registers for %r??
+*/
+
void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- bool needsFrameMoves = MMI.hasDebugInfo() ||
- Fn->needsUnwindTableEntry();
uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
bool HasFP = hasFP(MF);
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
bool IsLP64 = STI.isTarget64BitLP64();
bool IsWin64 = STI.isTargetWin64();
+ bool IsWinEH =
+ MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
+ ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64.
+ bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
+ bool NeedsDwarfCFI =
+ !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
unsigned SlotSize = RegInfo->getSlotSize();
@@ -509,7 +554,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addReg(FramePtr, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
- if (needsFrameMoves) {
+ if (NeedsDwarfCFI) {
// Mark the place where EBP/RBP was saved.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
@@ -527,13 +572,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
}
+ if (NeedsWinEH) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(FramePtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
// Update EBP with the new base value.
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
- if (needsFrameMoves) {
+ if (NeedsDwarfCFI) {
// Mark effective beginning of when frame pointer becomes valid.
// Define the current CFA to use the EBP/RBP register.
unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
@@ -543,9 +594,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
}
- // Mark the FramePtr as live-in in every block except the entry.
- for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
- I != E; ++I)
+ // Mark the FramePtr as live-in in every block.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
I->addLiveIn(FramePtr);
} else {
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
@@ -559,10 +609,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
- MBBI->setFlag(MachineInstr::FrameSetup);
+ unsigned Reg = MBBI->getOperand(0).getReg();
++MBBI;
- if (!HasFP && needsFrameMoves) {
+ if (!HasFP && NeedsDwarfCFI) {
// Mark callee-saved push instruction.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
@@ -572,16 +622,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.addCFIIndex(CFIIndex);
StackOffset += stackGrowth;
}
+
+ if (NeedsWinEH) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+ MachineInstr::FrameSetup);
+ }
}
// Realign stack after we pushed callee-saved registers (so that we'll be
// able to calculate their offsets from the frame pointer).
-
- // NOTE: We push the registers before realigning the stack, so
- // vector callee-saved (xmm) registers may be saved w/o proper
- // alignment in this way. However, currently these regs are saved in
- // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
- // this shouldn't be a problem.
if (RegInfo->needsStackRealignment(MF)) {
assert(HasFP && "There should be a frame pointer if stack is realigned.");
MachineInstr *MI =
@@ -680,23 +729,88 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
- } else if (NumBytes)
+ } else if (NumBytes) {
emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
UseLEA, TII, *RegInfo);
+ }
+
+ int SEHFrameOffset = 0;
+ if (NeedsWinEH) {
+ if (HasFP) {
+ // We need to set the frame base offset low enough that all saved
+ // register offsets are positive relative to it, but we can't
+ // just use NumBytes, because the .seh_setframe offset must be <= 240.
+ // So we pretend to have allocated only enough space to spill the
+ // non-volatile registers.
+ // We don't care about the rest of the stack allocation, because the
+ // unwinder will restore SP to (BP - SEHFrameOffset).
+ for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+ int offset = MFI->getObjectOffset(Info.getFrameIdx());
+ SEHFrameOffset = std::max(SEHFrameOffset, abs(offset));
+ }
+ SEHFrameOffset += SEHFrameOffset % 16; // round up to a 16-byte boundary
+
+ // This only needs to account for XMM spill slots; GPR slots
+ // are covered by the .seh_pushreg directives emitted above.
+ unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
+ if (Size) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(Size)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(SEHFrameOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ // SP will be the base register for restoring XMMs
+ if (NumBytes) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+
+ // Skip the rest of the register-spilling code.
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
+ // Emit SEH info for non-GPRs
+ if (NeedsWinEH) {
+ for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
+ unsigned Reg = Info.getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+ assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
+
+ int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
+ Offset += SEHFrameOffset;
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+ .addImm(Reg)
+ .addImm(Offset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
// If we need a base pointer, set it up here. It's whatever the value
// of the stack pointer is at this point. Any variable size objects
// will be allocated after this, so we can still use the base pointer
// to reference locals.
if (RegInfo->hasBasePointer(MF)) {
- // Update the frame pointer with the current stack pointer.
+ // Update the base pointer with the current stack pointer.
unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
}
- if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
+ if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
// Mark end of stack pointer adjustment.
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
@@ -711,7 +825,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Emit DWARF info specifying the offsets of the callee-saved registers.
if (PushedRegs)
- emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL);
}
}
@@ -719,12 +833,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
assert(MBBI != MBB.end() && "Returning block has no instructions");
unsigned RetOpcode = MBBI->getOpcode();
DebugLoc DL = MBBI->getDebugLoc();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
bool IsLP64 = STI.isTarget64BitLP64();
bool UseLEA = STI.useLeaForSP();
@@ -969,46 +1085,97 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return getFrameIndexOffset(MF, FI);
}
-bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- if (CSI.empty())
- return false;
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned CalleeSavedFrameSize = 0;
+ int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
- MachineFunction &MF = *MBB.getParent();
+ if (hasFP(MF)) {
+ // emitPrologue always spills the frame register first.
+ SpillSlotOffset -= SlotSize;
+ MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+ // Since emitPrologue and emitEpilogue will handle spilling and restoring
+ // the frame register, we can delete it from the CSI list and not have to
+ // worry about avoiding it later.
+ unsigned FPReg = RegInfo->getFrameRegister(MF);
+ for (unsigned i = 0; i < CSI.size(); ++i) {
+ if (CSI[i].getReg() == FPReg) {
+ CSI.erase(CSI.begin() + i);
+ break;
+ }
+ }
+ }
+
+ // Assign slots for GPRs. This increases the frame size.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+ continue;
- unsigned SlotSize = STI.is64Bit() ? 8 : 4;
- unsigned FPReg = TRI->getFrameRegister(MF);
- unsigned CalleeFrameSize = 0;
+ SpillSlotOffset -= SlotSize;
+ CalleeSavedFrameSize += SlotSize;
+
+ int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+
+ // Assign slots for XMMs.
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i - 1].getReg();
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ continue;
+
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+ // ensure alignment
+ SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment();
+ // spill into slot
+ SpillSlotOffset -= RC->getSize();
+ int SlotIndex =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+ CSI[i - 1].setFrameIdx(SlotIndex);
+ MFI->ensureMaxAlignment(RC->getAlignment());
+ }
+
+ return true;
+}
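
As a sketch of the layout this produces, take a hypothetical 64-bit function
that saves %rbp, %rsi, %rdi and %xmm6, assuming getOffsetOfLocalArea() == -8
and a zero tail-call return address delta:

    SpillSlotOffset   fixed spill object
    -16               frame register (%rbp); dropped from CSI afterwards
    -24               %rsi   (CalleeSavedFrameSize becomes 16)
    -32               %rdi
    -48               %xmm6, 16 bytes (offset re-aligned to 16 first)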
+bool X86FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
// Push GPRs. It increases frame size.
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- if (!X86::GR64RegClass.contains(Reg) &&
- !X86::GR32RegClass.contains(Reg))
+ unsigned Reg = CSI[i - 1].getReg();
+
+ if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
- if (Reg == FPReg)
- // X86RegisterInfo::emitPrologue will handle spilling of frame register.
- continue;
- CalleeFrameSize += SlotSize;
+
BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
}
- X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
-
// Make XMM regs spilled. X86 does not have ability of push/pop XMM.
// It can be done by spilling XMMs to stack frame.
- // Note that only Win64 ABI might spill XMMs.
for (unsigned i = CSI.size(); i != 0; --i) {
unsigned Reg = CSI[i-1].getReg();
if (X86::GR64RegClass.contains(Reg) ||
@@ -1017,8 +1184,12 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
- RC, TRI);
+
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+ TRI);
+ --MI;
+ MI->setFlag(MachineInstr::FrameSetup);
+ ++MI;
}
return true;
@@ -1035,6 +1206,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
// Reload XMMs from stack frame.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
@@ -1042,22 +1214,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (X86::GR64RegClass.contains(Reg) ||
X86::GR32RegClass.contains(Reg))
continue;
+
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
- RC, TRI);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
}
// POP GPRs.
- unsigned FPReg = TRI->getFrameRegister(MF);
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
if (!X86::GR64RegClass.contains(Reg) &&
!X86::GR32RegClass.contains(Reg))
continue;
- if (Reg == FPReg)
- // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
- continue;
+
BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
}
return true;
@@ -1065,9 +1234,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
void
X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
+ RegScavenger *RS) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ const X86RegisterInfo *RegInfo =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
unsigned SlotSize = RegInfo->getSlotSize();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1087,22 +1257,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
TailCallReturnAddrDelta - SlotSize, true);
}
- if (hasFP(MF)) {
- assert((TailCallReturnAddrDelta <= 0) &&
- "The Delta should always be zero or negative");
- const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
-
- // Create a frame entry for the EBP register that must be saved.
- int FrameIdx = MFI->CreateFixedObject(SlotSize,
- -(int)SlotSize +
- TFI.getOffsetOfLocalArea() +
- TailCallReturnAddrDelta,
- true);
- assert(FrameIdx == MFI->getObjectIndexBegin() &&
- "Slot for EBP register must be last in order to be found!");
- (void)FrameIdx;
- }
-
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
@@ -1160,8 +1314,9 @@ void
X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
MachineBasicBlock &prologueMBB = MF.front();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
uint64_t StackSize;
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
unsigned TlsReg, TlsOffset;
DebugLoc DL;
@@ -1368,9 +1523,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
/// temp0 = sp - MaxStack
/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
- const X86InstrInfo &TII = *TM.getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+ const unsigned SlotSize =
+ static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo())
+ ->getSlotSize();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
const bool Is64Bit = STI.is64Bit();
DebugLoc DL;
// HiPE-specific values
@@ -1499,12 +1657,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- const X86InstrInfo &TII = *TM.getInstrInfo();
- const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const X86RegisterInfo &RegInfo =
+ *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
unsigned StackPtr = RegInfo.getStackRegister();
bool reseveCallFrame = hasReservedCallFrame(MF);
int Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool IsLP64 = STI.isTarget64BitLP64();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
@@ -1522,7 +1682,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ unsigned StackAlign =
+ MF.getTarget().getFrameLowering()->getStackAlignment();
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
MachineInstr *New = nullptr;
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 208bb8b..5ad3d4d 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,6 @@
#ifndef X86_FRAMELOWERING_H
#define X86_FRAMELOWERING_H
-#include "X86Subtarget.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -23,19 +22,13 @@ class MCSymbol;
class X86TargetMachine;
class X86FrameLowering : public TargetFrameLowering {
- const X86TargetMachine &TM;
- const X86Subtarget &STI;
public:
- explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti)
- : TargetFrameLowering(StackGrowsDown,
- sti.getStackAlignment(),
- (sti.is64Bit() ? -8 : -4)),
- TM(tm), STI(sti) {
- }
+ explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
+ : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- unsigned FramePtr) const;
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL) const;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -49,6 +42,11 @@ public:
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 74386d3..ba2f5f6 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2126,38 +2126,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return getGlobalBaseReg();
- case X86ISD::ATOMOR64_DAG:
- case X86ISD::ATOMXOR64_DAG:
- case X86ISD::ATOMADD64_DAG:
- case X86ISD::ATOMSUB64_DAG:
- case X86ISD::ATOMNAND64_DAG:
- case X86ISD::ATOMAND64_DAG:
- case X86ISD::ATOMMAX64_DAG:
- case X86ISD::ATOMMIN64_DAG:
- case X86ISD::ATOMUMAX64_DAG:
- case X86ISD::ATOMUMIN64_DAG:
- case X86ISD::ATOMSWAP64_DAG: {
- unsigned Opc;
- switch (Opcode) {
- default: llvm_unreachable("Impossible opcode");
- case X86ISD::ATOMOR64_DAG: Opc = X86::ATOMOR6432; break;
- case X86ISD::ATOMXOR64_DAG: Opc = X86::ATOMXOR6432; break;
- case X86ISD::ATOMADD64_DAG: Opc = X86::ATOMADD6432; break;
- case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
- case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
- case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
- case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break;
- case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break;
- case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
- case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
- case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
- }
- SDNode *RetVal = SelectAtomic64(Node, Opc);
- if (RetVal)
- return RetVal;
- break;
- }
-
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cbaf44e..5ccff20 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -44,11 +44,13 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
+#include <numeric>
#include <cctype>
using namespace llvm;
@@ -56,6 +58,17 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+ "x86-experimental-vector-widening-legalization", cl::init(false),
+ cl::desc("Enable an experimental vector type legalization through widening "
+ "rather than promotion."),
+ cl::Hidden);
+
+static cl::opt<bool> ExperimentalVectorShuffleLowering(
+ "x86-experimental-vector-shuffle-lowering", cl::init(false),
+ cl::desc("Enable an experimental vector shuffle lowering code path."),
+ cl::Hidden);
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
@@ -178,29 +191,28 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- bool is64Bit = Subtarget->is64Bit();
-
- if (Subtarget->isTargetMacho()) {
- if (is64Bit)
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getArch() == Triple::x86_64)
return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
- if (Subtarget->isTargetLinux())
+ if (TT.isOSLinux())
return new X86LinuxTargetObjectFile();
- if (Subtarget->isTargetELF())
+ if (TT.isOSBinFormatELF())
return new TargetLoweringObjectFileELF();
- if (Subtarget->isTargetKnownWindowsMSVC())
+ if (TT.isKnownWindowsMSVCEnvironment())
return new X86WindowsTargetObjectFile();
- if (Subtarget->isTargetCOFF())
+ if (TT.isOSBinFormatCOFF())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
}
+// FIXME: This should stop caching the target machine as soon as
+// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
- : TargetLowering(TM, createTLOF(TM)) {
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -443,7 +455,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::BR_CC , MVT::i16, Expand);
setOperationAction(ISD::BR_CC , MVT::i32, Expand);
setOperationAction(ISD::BR_CC , MVT::i64, Expand);
- setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::i64, Expand);
if (Subtarget->is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
@@ -497,6 +515,14 @@ void X86TargetLowering::resetOperationActions() {
}
}
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+ setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
+ setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+ }
+
if (Subtarget->hasPOPCNT()) {
setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
} else {
@@ -575,34 +601,18 @@ void X86TargetLowering::resetOperationActions() {
// Expand certain atomics
for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
- setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
- if (!Subtarget->is64Bit()) {
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
- }
-
if (Subtarget->hasCmpxchg16b()) {
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
- if (!Subtarget->isTargetDarwin() &&
- !Subtarget->isTargetELF() &&
- !Subtarget->isTargetCygMing()) {
+ if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
@@ -861,6 +871,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction(VT,
@@ -1433,6 +1444,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::OR, MVT::v16i32, Legal);
setOperationAction(ISD::XOR, MVT::v16i32, Legal);
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
+ }
+
// Custom lower several nodes.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1563,6 +1579,7 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1585,6 +1602,16 @@ void X86TargetLowering::resetOperationActions() {
setPrefFunctionAlignment(4); // 2^4 bytes.
}
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (ExperimentalVectorWideningLegalization &&
+ VT.getVectorNumElements() != 1 &&
+ VT.getVectorElementType().getSimpleVT() != MVT::i1)
+ return TypeWidenVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -1725,7 +1752,7 @@ const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
- assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
@@ -1824,7 +1851,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
@@ -1844,7 +1871,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
@@ -2016,7 +2043,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -2166,8 +2193,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
- getTargetMachine().Options.GuaranteedTailCallOpt);
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
+ CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
@@ -2224,7 +2251,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -2388,7 +2415,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
TotalNumXMMRegs = 0;
if (IsWin64) {
- const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2587,7 +2614,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
@@ -2602,7 +2629,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
IsTailCallConvention(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
@@ -2649,7 +2676,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2840,7 +2867,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
}
- if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
@@ -2864,7 +2891,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// has hidden or protected visibility, or if it is static or local, then
// we don't need to use the PLT - we can directly call it.
if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
@@ -2906,7 +2933,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// On ELF targets, in either X86-64 or X86-32 mode, direct calls to
// external symbols should go through the PLT.
if (Subtarget->isTargetELF() &&
- getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
OpFlags = X86II::MO_PLT;
} else if (Subtarget->isPICStyleStubAny() &&
(!Subtarget->getTargetTriple().isMacOSX() ||
@@ -2945,7 +2972,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2969,7 +2996,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
- getTargetMachine().Options.GuaranteedTailCallOpt))
+ DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
@@ -3140,7 +3167,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
- if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+ if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return false;
@@ -3152,7 +3179,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3181,7 +3208,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ DAG.getTarget(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3202,7 +3229,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ DAG.getTarget(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -3216,12 +3243,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ DAG.getTarget(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ DAG.getTarget(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -3248,7 +3275,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ DAG.getTarget(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (IsCalleeWin64)
@@ -3265,7 +3292,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII =
- ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3288,12 +3315,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!Subtarget->is64Bit() &&
((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
- getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs =
- (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+ (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3417,7 +3444,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -3967,14 +3994,22 @@ static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
unsigned CorrectPosV1 = 0;
unsigned CorrectPosV2 = 0;
- for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+ for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
+ if (Mask[i] == -1) {
+ ++CorrectPosV1;
+ ++CorrectPosV2;
+ continue;
+ }
+
if (Mask[i] == i)
++CorrectPosV1;
else if (Mask[i] == i + 4)
++CorrectPosV2;
+ }
if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
- // We have 3 elements from one vector, and one from another.
+ // We have 3 elements (undefs count as elements from any vector) from one
+ // vector, and one from another.
return true;
return false;
@@ -4823,19 +4858,6 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
return true;
}
-/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same.
-static bool isSplatVector(SDNode *N) {
- if (N->getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- SDValue SplatValue = N->getOperand(0);
- for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
- if (N->getOperand(i) != SplatValue)
- return false;
- return true;
-}
-
/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
@@ -5744,18 +5766,22 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
return SDValue();
case ISD::BUILD_VECTOR: {
- // The BUILD_VECTOR node must be a splat.
- if (!isSplatVector(Op.getNode()))
+ auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
+ BitVector UndefElements;
+ SDValue Splat = BVOp->getSplatValue(&UndefElements);
+
+ // We need a splat of a single value to use broadcast, and it doesn't
+ // make any sense if the value is only in one element of the vector.
+ if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
return SDValue();
- Ld = Op.getOperand(0);
+ Ld = Splat;
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
- Ld.getOpcode() == ISD::ConstantFP);
+ Ld.getOpcode() == ISD::ConstantFP);
- // The suspected load node has several users. Make sure that all
- // of its users are from the BUILD_VECTOR node.
- // Constants may have multiple users.
- if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
break;
}
@@ -6042,6 +6068,433 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, dl, VT, Select);
}
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function checks whether the input build_vector \p N implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+ SelectionDAG &DAG,
+ unsigned BaseIdx, unsigned LastIdx,
+ SDValue &V0, SDValue &V1) {
+ EVT VT = N->getValueType(0);
+
+ assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+ assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+ "Invalid Vector in input!");
+
+ bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+ bool CanFold = true;
+ unsigned ExpectedVExtractIdx = BaseIdx;
+ unsigned NumElts = LastIdx - BaseIdx;
+ V0 = DAG.getUNDEF(VT);
+ V1 = DAG.getUNDEF(VT);
+
+ // Check if N implements a horizontal binop.
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+ SDValue Op = N->getOperand(i + BaseIdx);
+
+ // Skip UNDEFs.
+ if (Op->getOpcode() == ISD::UNDEF) {
+ // Update the expected vector extract index.
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ ExpectedVExtractIdx += 2;
+ continue;
+ }
+
+ CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+ if (!CanFold)
+ break;
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+ CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0) == Op1.getOperand(0) &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ isa<ConstantSDNode>(Op1.getOperand(1)));
+ if (!CanFold)
+ break;
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+ if (i * 2 < NumElts) {
+ if (V0.getOpcode() == ISD::UNDEF)
+ V0 = Op0.getOperand(0);
+ } else {
+ if (V1.getOpcode() == ISD::UNDEF)
+ V1 = Op0.getOperand(0);
+ if (i * 2 == NumElts)
+ ExpectedVExtractIdx = BaseIdx;
+ }
+
+ SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+ if (I0 == ExpectedVExtractIdx)
+ CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+ else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+ // Try to match the following dag sequence:
+ // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+ CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+ } else
+ CanFold = false;
+
+ ExpectedVExtractIdx += 2;
+ }
+
+ return CanFold;
+}
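
For illustration, a v4f32 build_vector that this matcher accepts as a
horizontal FADD over the full range [0, 4) might look like this (sketch;
%A and %B are arbitrary v4f32 values):

    (build_vector (fadd (extractelt %A, 0), (extractelt %A, 1)),
                  (fadd (extractelt %A, 2), (extractelt %A, 3)),
                  (fadd (extractelt %B, 0), (extractelt %B, 1)),
                  (fadd (extractelt %B, 2), (extractelt %B, 3)))

This matches with V0 = %A and V1 = %B; note that ExpectedVExtractIdx resets
to BaseIdx halfway through, which is why the extracts from %B restart at
element 0.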
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
+/// the two new horizontal binops.
+/// When Mode is set, the first horizontal binop dag node takes as input
+/// the lower 128 bits of V0 and the upper 128 bits of V0. The second
+/// horizontal binop dag node takes as input the lower 128 bits of V1
+/// and the upper 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V0_HI
+/// HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
+/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
+/// Example:
+/// HADD V0_LO, V1_LO
+/// HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128 bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+ SDLoc DL, SelectionDAG &DAG,
+ unsigned X86Opcode, bool Mode,
+ bool isUndefLO, bool isUndefHI) {
+ EVT VT = V0.getValueType();
+ assert(VT.is256BitVector() && VT == V1.getValueType() &&
+ "Invalid nodes in input!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
+ SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
+ SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
+ SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
+ EVT NewVT = V0_LO.getValueType();
+
+ SDValue LO = DAG.getUNDEF(NewVT);
+ SDValue HI = DAG.getUNDEF(NewVT);
+
+ if (Mode) {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+ if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+ } else {
+ // Don't emit a horizontal binop if the result is expected to be UNDEF.
+ if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
+ V1_LO->getOpcode() != ISD::UNDEF))
+ LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+ if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
+ V1_HI->getOpcode() != ISD::UNDEF))
+ HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// \brief Try to fold a build_vector that performs an 'addsub' into the
+/// sequence of 'vadd + vsub + blendi'.
+static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(BV);
+ EVT VT = BV->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+ // Don't try to emit a VSELECT that cannot be lowered into a blend.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding two integer/float elements.
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting two integer/float elements.
+ unsigned ExpectedOpcode = ISD::FSUB;
+ unsigned NextExpectedOpcode = ISD::FADD;
+ bool AddFound = false;
+ bool SubFound = false;
+
+ for (unsigned i = 0, e = NumElts; i != e; i++) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ continue;
+ }
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ExpectedOpcode)
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return SDValue();
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (I0 != i)
+ return SDValue();
+
+ // We found a valid add/sub node. Update the information accordingly.
+ if (i & 1)
+ AddFound = true;
+ else
+ SubFound = true;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.getOpcode() == ISD::UNDEF)
+ InVec0 = Op0.getOperand(0);
+ if (InVec1.getOpcode() == ISD::UNDEF)
+ InVec1 = Op1.getOperand(0);
+
+ // Make sure that the operands of each add/sub node always
+ // come from the same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (ExpectedOpcode == ISD::FSUB)
+ return SDValue();
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return SDValue();
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return SDValue();
+
+ // Update the pair of expected opcodes.
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ }
+
+ // Don't try to fold this build_vector into a VSELECT if it has
+ // too many UNDEF operands.
+ if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+ InVec1.getOpcode() != ISD::UNDEF) {
+ // Emit a sequence of vector add and sub followed by a VSELECT.
+ // The new VSELECT will be lowered into a BLENDI.
+ // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
+ // and emit a single ADDSUB instruction.
+ SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
+ SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+
+ // Construct the VSELECT mask.
+ EVT MaskVT = VT.changeVectorElementTypeToInteger();
+ EVT SVT = MaskVT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ SmallVector<SDValue, 8> Ops;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
+ APInt::getAllOnesValue(SVTBits);
+ SDValue Constant = DAG.getConstant(Value, SVT);
+ Ops.push_back(Constant);
+ }
+
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
+ return DAG.getSelect(DL, VT, Mask, Sub, Add);
+ }
+
+ return SDValue();
+}
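
A sketch of the v4f32 pattern this recognizes, and of what it emits, with
hypothetical inputs %A and %B:

    (build_vector (fsub (extractelt %A, 0), (extractelt %B, 0)),
                  (fadd (extractelt %A, 1), (extractelt %B, 1)),
                  (fsub (extractelt %A, 2), (extractelt %B, 2)),
                  (fadd (extractelt %A, 3), (extractelt %B, 3)))
      => (vselect <-1, 0, -1, 0>, (fsub %A, %B), (fadd %A, %B))

The mask selects the FSUB result for even elements and the FADD result for
odd elements, which ISel can then fold into a single ADDSUBPS.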
+
+static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+ SDValue InVec0, InVec1;
+
+ // Try to match an ADDSUB.
+ if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+ SDValue Value = matchAddSub(BV, DAG, Subtarget);
+ if (Value.getNode())
+ return Value;
+ }
+
+ // Try to match horizontal ADD/SUB.
+ unsigned NumUndefsLO = 0;
+ unsigned NumUndefsHI = 0;
+ unsigned Half = NumElts/2;
+
+ // Count the number of UNDEF operands in the build_vector in input.
+ for (unsigned i = 0, e = Half; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsLO++;
+
+ for (unsigned i = Half, e = NumElts; i != e; ++i)
+ if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
+ NumUndefsHI++;
+
+ // Early exit if this is either a build_vector of all UNDEFs or if all but
+ // one of the operands are UNDEF.
+ if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+ return SDValue();
+
+ if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
+ // Try to match an SSE3 float HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
+ // Try to match an SSSE3 integer HADD/HSUB.
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+ }
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+ // Try to match an AVX horizontal add/sub of packed single/double
+ // precision floating point values from 256-bit vectors.
+ SDValue InVec2, InVec3;
+ if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+ if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+ } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+ // Try to match an AVX2 horizontal add/sub of signed integers.
+ SDValue InVec2, InVec3;
+ unsigned X86Opcode;
+ bool CanFold = true;
+
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+ isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+ ((InVec0.getOpcode() == ISD::UNDEF ||
+ InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
+ ((InVec1.getOpcode() == ISD::UNDEF ||
+ InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
+ X86Opcode = X86ISD::HSUB;
+ else
+ CanFold = false;
+
+ if (CanFold) {
+ // Fold this build_vector into a single horizontal add/sub.
+ // Do this only if the target has AVX2.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+
+ // Do not try to expand this build_vector into a pair of horizontal
+ // add/sub if we can emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into a pair of horizontal binops followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+ isUndefLO, isUndefHI);
+ }
+ }
+
+ if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+ VT == MVT::v16i16) && Subtarget->hasAVX()) {
+ unsigned X86Opcode;
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HADD;
+ else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::HSUB;
+ else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHADD;
+ else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+ X86Opcode = X86ISD::FHSUB;
+ else
+ return SDValue();
+
+ // Don't try to expand this build_vector into a pair of horizontal add/sub
+ // if we can simply emit a pair of scalar add/sub.
+ if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+ return SDValue();
+
+ // Convert this build_vector into two horizontal add/sub nodes followed by
+ // a concat vector.
+ bool isUndefLO = NumUndefsLO == Half;
+ bool isUndefHI = NumUndefsHI == Half;
+ return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+ isUndefLO, isUndefHI);
+ }
+
+ return SDValue();
+}
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -6429,38 +6882,1160 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
-// Try to lower a shuffle node into a simple blend instruction.
-static SDValue
-LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- MVT VT = SVOp->getSimpleValueType(0);
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
+/// an in-place shuffle are 'no-op's.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != i)
+ return false;
+ return true;
+}
+
+/// \brief Helper function to classify a mask as a single-input mask.
+///
+/// This isn't a generic single-input test because in the vector shuffle
+/// lowering we canonicalize single inputs to be the first input operand. This
+/// means we can more quickly test for a single input by only checking whether
+/// an input from the second operand exists. We also assume that the size of
+/// the mask corresponds to the size of the input vectors, which isn't true in
+/// the fully general case.
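+///
+/// For example, in a v4 mask the values 0-3 select from the first input and
+/// 4-7 from the second, so [0, 5, 2, 7] is a two-input mask while
+/// [0, 1, 2, 3] is single-input.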
+static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
+ for (int M : Mask)
+ if (M >= (int)Mask.size())
+ return false;
+ return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
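+///
+/// For example, the mask [1, 0, 3, 2] encodes to 0xB1 (0b10110001): each pair
+/// of bits selects the source lane for one result lane, lowest lane in the
+/// low bits.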
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+ assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+ assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+ assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+ assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+ unsigned Imm = 0;
+ Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
+ Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
+ Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
+ Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
+ return DAG.getConstant(Imm, MVT::i8);
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Straight shuffle of a single input vector. Simulate this by using the
+ // single input as both of the "inputs" to this instruction.
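+ // For example, the mask [1, 0] yields SHUFPDMask 0b01: bit 0 selects the
+ // element for result lane 0 and bit 1 for lane 1, both from V1 here.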
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+ assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+ assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+ unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+ return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ // We have to map the mask as it is actually a v4i32 shuffle instruction.
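+ // For example, the v2i64 mask [1, 0] widens to the v4i32 mask [2, 3, 0, 1].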
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+ int WidenedMask[4] = {
+ std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+ std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
+ }
+
+ // We implement this with SHUFPD which is pretty lame because it will likely
+ // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+ // However, all the alternatives are still more cycles and newer chips don't
+ // have this problem. It would be really nice if x86 had better shuffles here.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SDValue LowV = V1, HighV = V2;
+ int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0)
+ // Straight shuffle of a single input vector. We pass the input vector to
+ // both operands to simulate this with a SHUFPS.
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ if (NumV2Elements == 1) {
+ int V2Index =
+ std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+ Mask.begin();
+ // Compute the index adjacent to V2Index and in the same half by toggling
+ // the low bit.
+ int V2AdjIndex = V2Index ^ 1;
+
+ if (Mask[V2AdjIndex] == -1) {
+ // Handles all the cases where we have a single V2 element and an undef.
+ // This will only ever happen in the high lanes because we commute the
+ // vector otherwise.
+ if (V2Index < 2)
+ std::swap(LowV, HighV);
+ NewMask[V2Index] -= 4;
+ } else {
+ // Handle the case where the V2 element ends up adjacent to a V1 element.
+ // To make this work, blend them together as the first step.
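+ // SHUFPS takes its two low result lanes from the first operand and its two
+ // high lanes from the second, so this blend forms
+ // { V2[Mask[V2Index]-4], V2[0], V1[Mask[V1Index]], V1[0] }.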
+ int V1Index = V2AdjIndex;
+ int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+ getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+ // Now proceed to reconstruct the final blend as we have the necessary
+ // high or low half formed.
+ if (V2Index < 2) {
+ LowV = V2;
+ HighV = V1;
+ } else {
+ HighV = V2;
+ }
+ NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+ NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+ }
+ } else if (NumV2Elements == 2) {
+ if (Mask[0] < 4 && Mask[1] < 4) {
+ // Handle the easy case where we have V1 in the low lanes and V2 in the
+ // high lanes. We never see this reversed because we sort the shuffle.
+ NewMask[2] -= 4;
+ NewMask[3] -= 4;
+ } else {
+ // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+ // trying to place elements directly, just blend them and set up the final
+ // shuffle to place them.
+
+ // The first two blend mask elements are for V1, the second two are for
+ // V2.
+ int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+ Mask[2] < 4 ? Mask[2] : Mask[3],
+ (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+ (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+ getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+ // Now we do a normal shuffle of V1 by giving V1 as both operands to
+ // a blend.
+ LowV = HighV = V1;
+ NewMask[0] = Mask[0] < 4 ? 0 : 2;
+ NewMask[1] = Mask[0] < 4 ? 2 : 0;
+ NewMask[2] = Mask[2] < 4 ? 1 : 3;
+ NewMask[3] = Mask[2] < 4 ? 3 : 1;
+ }
+ }
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+ getV4X86ShuffleImm8ForMask(NewMask, DAG));
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ if (isSingleInputShuffleMask(Mask))
+ // Straight shuffle of a single input vector. For everything from SSE2
+ // onward this has a single fast instruction with no scary immediates.
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // We implement this with SHUFPS because it can blend from two vectors.
+ // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+ // up the inputs, bypassing domain shift penalties that we would incur if we
+ // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+ // relevant.
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
+ DAG.getVectorShuffle(
+ MVT::v4f32, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+static SDValue lowerV8I16SingleInputVectorShuffle(
+ SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+ MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 0; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+ int NumLToL =
+ std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+ int NumHToL = LoInputs.size() - NumLToL;
+ int NumLToH =
+ std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+ int NumHToH = HiInputs.size() - NumLToH;
+ MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+ MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+ MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+ MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+ // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+ // such inputs we can swap two of the dwords across the half mark and end up
+ // with <=2 inputs to each half in each half. Once there, we can fall through
+ // to the generic code below. For example:
+ //
+ // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+ // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+ //
+ // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
+ // and 2-2.
+ auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
+ int ThreeInputHalfSum, int OneInputHalfOffset) {
+ // Compute the index of dword with only one word among the three inputs in
+ // a half by taking the sum of the half with three inputs and subtracting
+ // the sum of the actual three inputs. The difference is the remaining
+ // slot.
+ int DWordA = (ThreeInputHalfSum -
+ std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
+ 2;
+ int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
+
+ int PSHUFDMask[] = {0, 1, 2, 3};
+ PSHUFDMask[DWordA] = DWordB;
+ PSHUFDMask[DWordB] = DWordA;
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+ // Adjust the mask to match the new locations of A and B.
+ for (int &M : Mask)
+ if (M != -1 && M/2 == DWordA)
+ M = 2 * DWordB + M % 2;
+ else if (M != -1 && M/2 == DWordB)
+ M = 2 * DWordA + M % 2;
+
+ // Recurse back into this routine to re-compute state now that this isn't
+ // a 3 and 1 problem.
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+ Mask);
+ };
+ if (NumLToL == 3 && NumHToL == 1)
+ return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
+ else if (NumLToL == 1 && NumHToL == 3)
+ return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
+ else if (NumLToH == 1 && NumHToH == 3)
+ return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
+ else if (NumLToH == 3 && NumHToH == 1)
+ return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
+
+ // At this point there are at most two inputs to the low and high halves from
+ // each half. That means the inputs can always be grouped into dwords and
+ // those dwords can then be moved to the correct half with a dword shuffle.
+ // We use at most one low and one high word shuffle to collect these paired
+ // inputs into dwords, and finally a dword shuffle to place them.
+ int PSHUFLMask[4] = {-1, -1, -1, -1};
+ int PSHUFHMask[4] = {-1, -1, -1, -1};
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+ // First fix the masks for all the inputs that are staying in their
+ // original halves. This will then dictate the targets of the cross-half
+ // shuffles.
+ auto fixInPlaceInputs = [&PSHUFDMask](
+ ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask, int HalfOffset) {
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
+ if (!HToLInputs.empty())
+ fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
+ if (!LToHInputs.empty())
+ fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+
+ // Now gather the cross-half inputs and place them into a free dword of
+ // their target half.
+ // FIXME: This operation could almost certainly be simplified dramatically to
+ // look more like the 3-1 fixing operation.
+ auto moveInputsToRightHalf = [&PSHUFDMask](
+ MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+ int SourceOffset, int DestOffset) {
+ auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+ return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
+ };
+ auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+ int Word) {
+ int LowWord = Word & ~1;
+ int HighWord = Word | 1;
+ return isWordClobbered(SourceHalfMask, LowWord) ||
+ isWordClobbered(SourceHalfMask, HighWord);
+ };
+
+ if (IncomingInputs.empty())
+ return;
+
+ if (ExistingInputs.empty()) {
+ // Map any dwords with inputs from them into the right half.
+ for (int Input : IncomingInputs) {
+ // If the source half mask maps over the inputs, turn those into
+ // swaps and use the swapped lane.
+ if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+ if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
+ SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+ Input - SourceOffset;
+ // We have to swap the uses in our half mask in one sweep.
+ for (int &M : HalfMask)
+ if (M == SourceHalfMask[Input - SourceOffset])
+ M = Input;
+ else if (M == Input)
+ M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ } else {
+ assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+ Input - SourceOffset &&
+ "Previous placement doesn't match!");
+ }
+ // Note that this correctly re-maps both when we do a swap and when
+ // we observe the other side of the swap above. We rely on that to
+ // avoid swapping the members of the input list directly.
+ Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+ }
+
+ // Map the input's dword into the correct half.
+ if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
+ PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+ else
+ assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+ Input / 2 &&
+ "Previous placement doesn't match!");
+ }
+
+ // And just directly shift any other-half mask elements to be same-half
+ // as we will have mirrored the dword containing the element into the
+ // same position within that half.
+ for (int &M : HalfMask)
+ if (M >= SourceOffset && M < SourceOffset + 4) {
+ M = M - SourceOffset + DestOffset;
+ assert(M >= 0 && "This should never wrap below zero!");
+ }
+ return;
+ }
+
+ // Ensure we have the input in a viable dword of its current half. This
+ // is particularly tricky because the original position may be clobbered
+ // by inputs being moved and *staying* in that half.
+ if (IncomingInputs.size() == 1) {
+ if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int InputFixed = std::find(std::begin(SourceHalfMask),
+ std::end(SourceHalfMask), -1) -
+ std::begin(SourceHalfMask) + SourceOffset;
+ SourceHalfMask[InputFixed - SourceOffset] =
+ IncomingInputs[0] - SourceOffset;
+ std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+ InputFixed);
+ IncomingInputs[0] = InputFixed;
+ }
+ } else if (IncomingInputs.size() == 2) {
+ if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+ isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+ int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
+ assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
+ "Not all dwords can be clobbered!");
+ SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
+ SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+ for (int &M : HalfMask)
+ if (M == IncomingInputs[0])
+ M = SourceDWordBase + SourceOffset;
+ else if (M == IncomingInputs[1])
+ M = SourceDWordBase + 1 + SourceOffset;
+ IncomingInputs[0] = SourceDWordBase + SourceOffset;
+ IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+ }
+ } else {
+ llvm_unreachable("Unhandled input size!");
+ }
+
+ // Now hoist the DWord down to the right half.
+ int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
+ assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
+ PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+ for (int Input : IncomingInputs)
+ std::replace(HalfMask.begin(), HalfMask.end(), Input,
+ FreeDWord * 2 + Input % 2);
+ };
+ moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+ /*SourceOffset*/ 4, /*DestOffset*/ 0);
+ moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+ /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+ // Now enact all the shuffles we've computed to move the inputs into their
+ // target half.
+ if (!isNoopShuffleMask(PSHUFLMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
+ if (!isNoopShuffleMask(PSHUFHMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
+ if (!isNoopShuffleMask(PSHUFDMask))
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+
+ // At this point, each half should contain all its inputs, and we can then
+ // just shuffle them into their final position.
+ assert(std::count_if(LoMask.begin(), LoMask.end(),
+ [](int M) { return M >= 4; }) == 0 &&
+ "Failed to lift all the high half inputs to the low mask!");
+ assert(std::count_if(HiMask.begin(), HiMask.end(),
+ [](int M) { return M >= 0 && M < 4; }) == 0 &&
+ "Failed to lift all the low half inputs to the high mask!");
+
+ // Do a half shuffle for the low mask.
+ if (!isNoopShuffleMask(LoMask))
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DAG));
+
+ // Do a half shuffle with the high mask after shifting its values down.
+ for (int &M : HiMask)
+ if (M >= 0)
+ M -= 4;
+ if (!isNoopShuffleMask(HiMask))
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DAG));
+
+ return V;
+}
+
+/// \brief Detect whether the mask pattern should be lowered through
+/// interleaving.
+///
+/// This essentially tests whether viewing the mask as an interleaving of two
+/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
+/// lowering it through interleaving is a significantly better strategy.
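+///
+/// For example, the v8 mask [0, 8, 1, 9, 2, 10, 3, 11] requires no
+/// cross-input movement once viewed as the even sub-sequence [0, 1, 2, 3] and
+/// the odd sub-sequence [8, 9, 10, 11], whereas splitting it into low and
+/// high halves mixes both inputs in each half.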
+static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
+ int NumEvenInputs[2] = {0, 0};
+ int NumOddInputs[2] = {0, 0};
+ int NumLoInputs[2] = {0, 0};
+ int NumHiInputs[2] = {0, 0};
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int InputIdx = Mask[i] >= Size;
+
+ if (i < Size / 2)
+ ++NumLoInputs[InputIdx];
+ else
+ ++NumHiInputs[InputIdx];
+
+ if ((i % 2) == 0)
+ ++NumEvenInputs[InputIdx];
+ else
+ ++NumOddInputs[InputIdx];
+ }
+
+ // The minimum number of cross-input results for both the interleaved and
+ // split cases. If interleaving results in fewer cross-input results, return
+ // true.
+ int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
+ NumEvenInputs[0] + NumOddInputs[1]);
+ int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
+ NumLoInputs[0] + NumHiInputs[1]);
+ return InterleavedCrosses < SplitCrosses;
+}
+
+/// \brief Blend two v8i16 vectors using a naive unpack strategy.
+///
+/// This strategy only works when the inputs from each vector fit into a single
+/// half of that vector, and generally there are not so many inputs as to leave
+/// the in-place shuffles required highly constrained (and thus expensive). It
+/// shifts all the inputs into a single side of both input vectors and then
+/// uses an unpack to interleave these inputs in a single vector. At that
+/// point, we will fall back on the generic single input shuffle lowering.
+static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
+ SDValue V2,
+ MutableArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
+ for (int i = 0; i < 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] < 4)
+ LoV1Inputs.push_back(i);
+ else if (Mask[i] >= 4 && Mask[i] < 8)
+ HiV1Inputs.push_back(i);
+ else if (Mask[i] >= 8 && Mask[i] < 12)
+ LoV2Inputs.push_back(i);
+ else if (Mask[i] >= 12)
+ HiV2Inputs.push_back(i);
+
+ int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
+ int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
+ (void)NumV1Inputs;
+ (void)NumV2Inputs;
+ assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
+ assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
+
+ bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
+ HiV1Inputs.size() + HiV2Inputs.size();
+
+ auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
+ ArrayRef<int> HiInputs, bool MoveToLo,
+ int MaskOffset) {
+ ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
+ ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
+ if (BadInputs.empty())
+ return V;
+
+ int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int MoveOffset = MoveToLo ? 0 : 4;
+
+ if (GoodInputs.empty()) {
+ for (int BadInput : BadInputs) {
+ MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
+ Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
+ }
+ } else {
+ if (GoodInputs.size() == 2) {
+ // If the low inputs are spread across two dwords, pack them into
+ // a single dword.
+ MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
+ Mask[GoodInputs[0]] - MaskOffset;
+ MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
+ Mask[GoodInputs[1]] - MaskOffset;
+ Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+ Mask[GoodInputs[1]] = Mask[GoodInputs[1]] % 2 + MoveOffset + MaskOffset;
+ } else {
+ // Otherwise pin the low inputs.
+ for (int GoodInput : GoodInputs)
+ MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
+ }
+
+ int MoveMaskIdx =
+ std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+ std::begin(MoveMask);
+ assert(MoveMaskIdx >= MoveOffset && "Established above");
+
+ if (BadInputs.size() == 2) {
+ assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
+ assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
+ MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
+ Mask[BadInputs[0]] - MaskOffset;
+ MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
+ Mask[BadInputs[1]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
+ Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+ } else {
+ assert(BadInputs.size() == 1 && "All sizes handled");
+ MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+ Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+ }
+ }
+
+ return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
+ MoveMask);
+ };
+ V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
+ /*MaskOffset*/ 0);
+ V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
+ /*MaskOffset*/ 8);
+
+ // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
+ // cross-half traffic in the final shuffle.
+
+ // Munge the mask to be a single-input mask after the unpack merges the
+ // results.
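+ // After the unpack, words moved from V1 occupy the even result lanes and
+ // words from V2 the odd lanes; e.g. mask value 9 (word 1 of V2) lands in
+ // lane 2 * (9 % 4) + 9 / 8 == 3.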
+ for (int &M : Mask)
+ if (M != -1)
+ M = 2 * (M % 4) + (M / 8);
+
+ return DAG.getVectorShuffle(
+ MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, MVT::v8i16, V1, V2),
+ DAG.getUNDEF(MVT::v8i16), Mask);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
+ MutableArrayRef<int> Mask(MaskStorage);
+
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ auto isV2 = [](int M) { return M >= 8; };
+
+ int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
+ int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
+
+ if (NumV2Inputs == 0)
+ return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+
+ assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
+ "to be V1-input shuffles.");
+
+ if (NumV1Inputs + NumV2Inputs <= 4)
+ return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: Handle 1x, 2x, and 4x interleaving.
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
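+ // Split the mask into its even and odd elements, shuffle each set into the
+ // low half of its own vector, and re-interleave them with an UNPCKL to
+ // restore the original element order.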
+ int EMask[8], OMask[8];
+ for (int i = 0; i < 4; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 4] = -1;
+ OMask[i + 4] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
+
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
+ }
+
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+ for (int i = 0; i < 4; ++i) {
+ LoBlendMask[i] = Mask[i];
+ HiBlendMask[i] = Mask[i + 4];
+ }
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
+ HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> OrigMask = SVOp->getMask();
+ assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ int MaskStorage[16] = {
+ OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
+ OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
+ OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
+ OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
+ MutableArrayRef<int> Mask(MaskStorage);
+ MutableArrayRef<int> LoMask = Mask.slice(0, 8);
+ MutableArrayRef<int> HiMask = Mask.slice(8, 8);
+
+ // For single-input shuffles, there are some nicer lowering tricks we can use.
+ if (isSingleInputShuffleMask(Mask)) {
+ // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+ // Notably, this handles splat and partial-splat shuffles more efficiently.
+ // However, it only makes sense if the pre-duplication shuffle simplifies
+ // things significantly. Currently, this means we need to be able to
+ // express the pre-duplication shuffle as an i16 shuffle.
+ //
+ // FIXME: We should check for other patterns which can be widened into an
+ // i16 shuffle as well.
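+ // A mask is widenable when every even/odd pair of elements is identical,
+ // e.g. <0, 0, 3, 3, ..., 7, 7>; each duplicated byte pair then maps onto a
+ // single i16 lane.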
+ auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != Mask[i + 1])
+ return false;
+ }
+ return true;
+ };
+ auto tryToWidenViaDuplication = [&]() -> SDValue {
+ if (!canWidenViaDuplication(Mask))
+ return SDValue();
+ SmallVector<int, 4> LoInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
+ std::sort(LoInputs.begin(), LoInputs.end());
+ LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+ LoInputs.end());
+ SmallVector<int, 4> HiInputs;
+ std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+ [](int M) { return M >= 8; });
+ std::sort(HiInputs.begin(), HiInputs.end());
+ HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+ HiInputs.end());
+
+ bool TargetLo = LoInputs.size() >= HiInputs.size();
+ ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+ ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+ int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ SmallDenseMap<int, int, 8> LaneMap;
+ for (int I : InPlaceInputs) {
+ PreDupI16Shuffle[I/2] = I/2;
+ LaneMap[I] = I;
+ }
+ int j = TargetLo ? 0 : 4, je = j + 4;
+ for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+ // Check if j is already a shuffle of this input. This happens when
+ // there are two adjacent bytes after we move the low one.
+ if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+ // If we haven't yet mapped the input, search for a slot into which
+ // we can map it.
+ while (j < je && PreDupI16Shuffle[j] != -1)
+ ++j;
+
+ if (j == je)
+ // We can't place the inputs into a single half with a simple i16
+ // shuffle, so bail.
+ return SDValue();
+
+ // Map this input with the i16 shuffle.
+ PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+ }
+
+ // Update the lane map based on the mapping we ended up with.
+ LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+ }
+ V1 = DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+ // Unpack the bytes to form the i16s that will be shuffled into place.
+ V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, V1, V1);
+
+ int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; i += 2) {
+ if (Mask[i] != -1)
+ PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+ };
+ if (SDValue V = tryToWidenViaDuplication())
+ return V;
+ }
+
+ // Check whether an interleaving lowering is likely to be more efficient.
+ // This isn't perfect but it is a strong heuristic that tends to work well on
+ // the kinds of shuffles that show up in practice.
+ //
+ // FIXME: We need to handle other interleaving widths (i16, i32, ...).
+ if (shouldLowerAsInterleaving(Mask)) {
+ // FIXME: Figure out whether we should pack these into the low or high
+ // halves.
+
+ int EMask[16], OMask[16];
+ for (int i = 0; i < 8; ++i) {
+ EMask[i] = Mask[2*i];
+ OMask[i] = Mask[2*i + 1];
+ EMask[i + 8] = -1;
+ OMask[i + 8] = -1;
+ }
+
+ SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
+ SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
+
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+ }
+
+ int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+ auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> V1HalfBlendMask,
+ MutableArrayRef<int> V2HalfBlendMask) {
+ for (int i = 0; i < 8; ++i)
+ if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
+ V1HalfBlendMask[i] = HalfMask[i];
+ HalfMask[i] = i;
+ } else if (HalfMask[i] >= 16) {
+ V2HalfBlendMask[i] = HalfMask[i] - 16;
+ HalfMask[i] = i + 8;
+ }
+ };
+ buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
+ buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+
+ auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+ MutableArrayRef<int> HiBlendMask) {
+ SDValue V1, V2;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke V2.
+ V2 = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into V1.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into V1 and the high half into
+ // V2 so that we can blend them as i16s.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
+
+ SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+ SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
+ return std::make_pair(BlendedLo, BlendedHi);
+ };
+ SDValue V1Lo, V1Hi, V2Lo, V2Hi;
+ std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
+ std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+
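+ // PACKUS truncates each i16 lane to an i8 with unsigned saturation. Every
+ // lane here holds a zero-extended byte in [0, 255], so the pack is a pure
+ // truncation that fuses the two blended halves back into a v16i8.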
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v2i64:
+ return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v2f64:
+ return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i32:
+ return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4f32:
+ return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i16:
+ return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16i8:
+ return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Unimplemented!");
+ }
+}
+
+/// \brief Tiny helper function to test whether adjacent masks are sequential.
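+///
+/// For example, [0, 1, 4, 5] is sequential in adjacent pairs while
+/// [0, 2, 4, 5] is not.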
+static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ if (Mask[i] + 1 != Mask[i+1])
+ return false;
+
+ return true;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// such that only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ MVT VT = Op.getSimpleValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDLoc dl(Op);
+
+ assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ // When we create a shuffle node we put the UNDEF node to second operand,
+ // but in some cases the first operand may be transformed to UNDEF.
+ // In this case we should just commute the node.
+ if (V1IsUndef)
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // Check for non-undef masks pointing at an undef vector and make the masks
+ // undef as well. This makes it easier to match the shuffle based solely on
+ // the mask.
+ if (V2IsUndef)
+ for (int M : Mask)
+ if (M >= NumElements) {
+ SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ for (int &M : NewMask)
+ if (M >= NumElements)
+ M = -1;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
+ }
+
+ // For integer vector shuffles, try to collapse them into a shuffle of fewer
+ // lanes but wider integers. We cap this to not form integers larger than i64
+ // but it might be interesting to form i128 integers to handle flipping the
+ // low and high halves of AVX 256-bit vectors.
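+ // For example, the v4i32 mask [0, 1, 6, 7] collapses to the v2i64 mask
+ // [0, 3].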
+ if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+ areAdjacentMasksSequential(Mask)) {
+ SmallVector<int, 8> NewMask;
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2)
+ NewMask.push_back(Mask[i] / 2);
+ MVT NewVT =
+ MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
+ VT.getVectorNumElements() / 2);
+ V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+ }
+
+ int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
+ for (int M : SVOp->getMask())
+ if (M < 0)
+ ++NumUndefElements;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : SVOp->getMask().slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return CommuteVectorShuffle(SVOp, DAG);
+ }
+
+ // For each vector width, delegate to a specialized lowering routine.
+ if (VT.getSizeInBits() == 128)
+ return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+ llvm_unreachable("Unimplemented!");
+}
+
+
+//===----------------------------------------------------------------------===//
+// Legacy vector shuffle lowering
+//
+// This is the legacy code path for handling vector shuffles until the code
+// above supersedes it in both functionality and performance.
+//===----------------------------------------------------------------------===//
+
+static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
+ bool hasInt256, unsigned *MaskOut = nullptr) {
MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
// There is no blend with immediate in AVX-512.
if (VT.is512BitVector())
- return SDValue();
+ return false;
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
+ if (!hasSSE41 || EltVT == MVT::i8)
+ return false;
+ if (!hasInt256 && VT == MVT::v16i16)
+ return false;
- // Check the mask for BLEND and build the value.
unsigned MaskValue = 0;
+ unsigned NumElems = VT.getVectorNumElements();
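+ // Bit i of MaskValue records whether result element i (in each lane) is
+ // taken from the second input rather than the first.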
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- unsigned NumLanes = (NumElems-1)/8 + 1;
+ unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
// Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
- int SndLaneEltIdx = (NumLanes == 2) ?
- SVOp->getMaskElt(i + NumElemsInLane) : -1;
- int EltIdx = SVOp->getMaskElt(i);
+ int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
+ int EltIdx = MaskVals[i];
if ((EltIdx < 0 || EltIdx == (int)i) &&
(SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
@@ -6469,11 +8044,34 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
if (((unsigned)EltIdx == (i + NumElems)) &&
(SndLaneEltIdx < 0 ||
(unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
- MaskValue |= (1<<i);
+ MaskValue |= (1 << i);
else
- return SDValue();
+ return false;
}
+ if (MaskOut)
+ *MaskOut = MaskValue;
+ return true;
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+// This function assumes isBlendMask returns true for this
+// ShuffleVectorSDNode.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ unsigned MaskValue,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = SVOp->getSimpleValueType(0);
+ MVT EltVT = VT.getVectorElementType();
+ assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
+ Subtarget->hasInt256()) &&
+ "Trying to lower a VECTOR_SHUFFLE to a Blend but with the wrong mask");
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ SDLoc dl(SVOp);
+ unsigned NumElems = VT.getVectorNumElements();
+
// Convert i32 vectors to floating point if it is not AVX2.
// AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
MVT BlendVT = VT;
@@ -7450,8 +9048,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
"unsupported vector type for insertps/pinsrd");
- int FromV1 = std::count_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i < 4; });
+ auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
+ auto FromV2Predicate = [](const int &i) { return i >= 4; };
+ int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
SDValue From;
SDValue To;
@@ -7459,23 +9058,26 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
if (FromV1 == 1) {
From = V1;
To = V2;
- DestIndex = std::find_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i < 4; }) -
+ DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
Mask.begin();
} else {
+ assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
+ "More than one element from V1 and from V2, or no elements from one "
+ "of the vectors. This case should not have returned true from "
+ "isINSERTPSMask");
From = V2;
To = V1;
- DestIndex = std::find_if(Mask.begin(), Mask.end(),
- [](const int &i) { return i >= 4; }) -
- Mask.begin();
+ DestIndex =
+ std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
}
+ unsigned SrcIndex = Mask[DestIndex] % 4;
if (MayFoldLoad(From)) {
// Trivial case, when From comes from a load and is only used by the
// shuffle. Make it use insertps from the vector that we need from that
// load.
SDValue NewLoad =
- NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+ NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
if (!NewLoad.getNode())
return SDValue();
@@ -7496,7 +9098,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
}
// Vector-element-to-vector
- unsigned SrcIndex = Mask[DestIndex] % 4;
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
}
@@ -7663,6 +9264,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool OptForSize = MF.getFunction()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ // Check if we should use the experimental vector shuffle lowering. If so,
+ // delegate completely to that code path.
+ if (ExperimentalVectorShuffleLowering)
+ return lowerVectorShuffle(Op, Subtarget, DAG);
+
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
if (V1IsUndef && V2IsUndef)
@@ -7796,8 +9402,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool Commuted = false;
// FIXME: This should also accept a bitcast of a splat? Be careful, not
// 1,1,1,1 -> v8i16 though.
- V1IsSplat = isSplatVector(V1.getNode());
- V2IsSplat = isSplatVector(V2.getNode());
+ BitVector UndefElements;
+ if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
+ if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+ V1IsSplat = true;
+ if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
+ if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
+ V2IsSplat = true;
// Canonicalize the splat or undef, if present, to be on the RHS.
if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
@@ -7873,6 +9484,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShufflePSHUFLWImmediate(SVOp),
DAG);
+ unsigned MaskValue;
+ if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
+ &MaskValue))
+ return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
+
if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
@@ -7910,10 +9526,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
- if (BlendOp.getNode())
- return BlendOp;
-
if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
return getINSERTPS(SVOp, dl, DAG);
@@ -8530,7 +10142,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -8563,7 +10175,7 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -8596,7 +10208,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
unsigned char OpFlag = 0;
unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel)) {
@@ -8617,7 +10229,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
// With PIC, the address is actually $g + Offset.
- if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
DAG.getNode(X86ISD::GlobalBaseReg,
@@ -8639,7 +10251,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
Subtarget->ClassifyBlockAddressReference();
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
@@ -8668,8 +10280,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
unsigned char OpFlags =
- Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
- CodeModel::Model M = getTargetMachine().getCodeModel();
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
+ CodeModel::Model M = DAG.getTarget().getCodeModel();
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
@@ -8868,7 +10480,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+ TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
@@ -8880,9 +10492,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
- return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
- Subtarget->is64Bit(),
- getTargetMachine().getRelocationModel() == Reloc::PIC_);
+ return LowerToTLSExecModel(
+ GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -8895,8 +10507,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
- !Subtarget->is64Bit();
+ bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
+ !Subtarget->is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
@@ -10050,10 +11662,27 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
- case X86::COND_O: case X86::COND_NO:
- NeedOF = true;
+ case X86::COND_O: case X86::COND_NO: {
+ // Check if we really need to set the Overflow flag. If NoSignedWrap is
+ // present, it is not actually needed.
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SHL: {
+ const BinaryWithFlagsSDNode *BinNode =
+ cast<BinaryWithFlagsSDNode>(Op.getNode());
+ if (BinNode->hasNoSignedWrap())
+ break;
+ }
+ default:
+ NeedOF = true;
+ break;
+ }
break;
}
+ }
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
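// Illustrative reasoning for the NoSignedWrap case above (not part of the
// patch): an "add nsw %a, %b" promises the signed addition does not wrap,
// so the real ADD leaves OF = 0 -- exactly what a TEST of the result would
// produce. Signed predicates such as COND_L/COND_GE, which read SF and OF,
// can therefore reuse the EFLAGS written by the ADD itself; when the flag is
// absent, control falls through to the default case and NeedOF is set.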
@@ -10115,14 +11744,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->getAPIntValue() == 1) {
+ if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->getAPIntValue().isAllOnesValue()) {
+ if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
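// Sketch of the guard added above (illustrative): "x + 1" would normally
// select INC, folding the add and the flag update into one instruction. On
// subtargets reporting slowIncDec(), INC/DEC are avoided because they only
// partially update EFLAGS (CF is left untouched), which can cost a
// partial-flag stall; the generic path below then emits a plain ADD instead.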
@@ -10138,7 +11767,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+ if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
@@ -11469,8 +13098,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- CC = DAG.getConstant(X86::COND_NE, MVT::i8);
- Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
+ X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+ CC = DAG.getConstant(X86Cond, MVT::i8);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11513,7 +13143,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+ const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
@@ -11572,7 +13202,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -11681,7 +13311,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!getTargetMachine().Options.UseSoftFloat &&
+ assert(!DAG.getTarget().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
.getFunction()->getAttributes()
.hasAttribute(AttributeSet::FunctionIndex,
@@ -12158,11 +13788,37 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx2_packusdw:
+ return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_pshuf_d:
+ return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pshufl_w:
+ return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::x86_sse2_pshufh_w:
+ return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
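// Sketch of why these mappings help (illustrative, not part of the patch):
// lowering the intrinsics to target shuffle nodes with immediate masks lets
// later combines see through them. E.g. a hypothetical source-level
// _mm_shuffle_epi32(x, imm) now reaches the DAG as X86ISD::PSHUFD rather
// than an opaque intrinsic call, so adjacent shuffles can be folded (see
// combineRedundantDWordShuffle further down).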
case Intrinsic::x86_ssse3_psign_b_128:
case Intrinsic::x86_ssse3_psign_w_128:
case Intrinsic::x86_ssse3_psign_d_128:
@@ -12610,6 +14266,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue LO, HI;
+
+ // The ECX register is used to select the index of the performance counter
+ // to read.
+ SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+ N->getOperand(2));
+ SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+ // Reads the content of a 64-bit performance counter and returns it in the
+ // registers EDX:EAX.
+ if (Subtarget->is64Bit()) {
+ LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ Chain = HI.getValue(1);
+
+ if (Subtarget->is64Bit()) {
+ // The EAX register is loaded with the low-order 32 bits. The EDX register
+ // is loaded with the supported high-order bits of the counter.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
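// Worked example of the 64-bit merge above (illustrative): RDPMC leaves the
// counter in EDX:EAX, and on x86-64 the upper halves of RAX/RDX are zeroed,
// so the full value is reconstructed as
//
//   result = (HI << 32) | LO;   // HI from RDX, LO from RAX
//
// while 32-bit targets express the same pair as one i64 via ISD::BUILD_PAIR
// and let legalization keep the halves in EDX:EAX.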
// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
// also used to custom lower READCYCLECOUNTER nodes.
@@ -12674,7 +14375,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
}
enum IntrinsicType {
- GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
};
struct IntrinsicData {
@@ -12768,6 +14469,8 @@ static void InitIntinsicsMap() {
IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
+ IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0)));
Initialized = true;
}
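// Usage sketch (illustrative): LowerINTRINSIC_W_CHAIN below looks each
// intrinsic up in this map and switches on IntrinsicData::Type, so routing
// @llvm.x86.rdpmc only needs this entry plus the new RDPMC case in that
// switch.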
@@ -12826,7 +14529,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal;
- if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
+ if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
(HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
@@ -12843,6 +14546,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
+ // Read Performance Monitoring Counters.
+ case RDPMC: {
+ SmallVector<SDValue, 2> Results;
+ getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -12873,7 +14582,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -12895,7 +14604,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -12924,7 +14633,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
@@ -12936,7 +14645,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -12983,7 +14692,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -13431,7 +15140,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
CLI.setDebugLoc(dl).setChain(InChain)
.setCallee(getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, &Args, 0)
+ Callee, std::move(Args), 0)
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -13448,7 +15157,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
(VT == MVT::v8i32 && Subtarget->hasInt256()));
// Get the high parts.
- const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
@@ -13464,10 +15173,18 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
// Shuffle it back into the right order.
- const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
- SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
- SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ SDValue Highs, Lows;
+ if (VT == MVT::v8i32) {
+ const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ } else {
+ const int HighMask[] = {1, 5, 3, 7};
+ Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ const int LowMask[] = {0, 4, 2, 6};
+ Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ }
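// Worked example for the v4i32 case (illustrative): PMULDQ/PMULUDQ multiply
// the even dword lanes, so with the corrected {1,-1,3,-1} mask moving each
// odd lane into the preceding even position (odd positions undef, since the
// multiply ignores them), the two products viewed as v4i32 are
//
//   Mul1 = <lo0, hi0, lo2, hi2>   // lanes 0 and 2 of the inputs
//   Mul2 = <lo1, hi1, lo3, hi3>   // lanes 1 and 3, via Hi0/Hi1
//
// HighMask {1, 5, 3, 7} over concat(Mul1, Mul2) then gathers
// <hi0, hi1, hi2, hi3> and LowMask {0, 4, 2, 6} gathers <lo0, lo1, lo2, lo3>.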
// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
@@ -13494,10 +15211,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue Amt = Op.getOperand(1);
// Optimize shl/srl/sra with constant shift amount.
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
- uint64_t ShiftAmt = C->getZExtValue();
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+ uint64_t ShiftAmt = ShiftConst->getZExtValue();
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
@@ -13804,15 +15520,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SelectionDAG &DAG) {
-
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
SDValue V;
- if (!Subtarget->hasSSE2())
- return SDValue();
+ assert(VT.isVector() && "Custom lowering only for vector shifts!");
+ assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
V = LowerScalarImmediateShift(Op, DAG, Subtarget);
if (V.getNode())
@@ -14254,7 +15969,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
- Op.getOperand(2), SDValue());
+ Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
@@ -14264,9 +15979,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
+
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
- return cpOut;
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+ MVT::i32, cpOut.getValue(2));
+ SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+ DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+ return SDValue();
}
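// Sketch of the new result mapping above (illustrative): the node now
// produces three values instead of one:
//
//   value 0: cpOut   -- the previous memory contents from [ptr]
//   value 1: Success -- SETE of EFLAGS; ZF = 1 iff the exchange happened
//   value 2: chain   -- ordering for subsequent memory operations
//
// The uses have already been rewired with ReplaceAllUsesOfValueWith, so no
// replacement node needs to be returned.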
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
@@ -14422,7 +16146,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+ .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -14446,7 +16170,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
- case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
@@ -14528,8 +16253,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
static void ReplaceATOMIC_LOAD(SDNode *Node,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
@@ -14538,38 +16263,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
// (The only way to get a 16-byte load is cmpxchg16b)
// FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
SDValue Zero = DAG.getConstant(0, VT);
- SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
- Node->getOperand(0),
- Node->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Swap =
+ DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
+ Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
Results.push_back(Swap.getValue(0));
- Results.push_back(Swap.getValue(1));
-}
-
-static void
-ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG, unsigned NewOp) {
- SDLoc dl(Node);
- assert (Node->getValueType(0) == MVT::i64 &&
- "Only know how to expand i64 atomics");
-
- SDValue Chain = Node->getOperand(0);
- SDValue In1 = Node->getOperand(1);
- SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(0));
- SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(1));
- SDValue Ops[] = { Chain, In1, In2L, In2H };
- SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
- SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
- cast<MemSDNode>(Node)->getMemOperand());
- SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
- Results.push_back(Result.getValue(2));
+ Results.push_back(Swap.getValue(2));
}
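// Why cmpxchg(addr, 0, 0) implements a plain atomic load (illustrative):
// compare-exchange returns the current memory contents atomically whether or
// not the comparison succeeds, and with expected == new == 0 the memory is
// never observably changed -- either the compare fails, or 0 is stored over
// an existing 0. Value 0 of the node is the loaded data and value 2 (the
// chain) replaces the original load's chain; the i1 success bit is unused.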
/// ReplaceNodeResults - Replace a node with an illegal result type
@@ -14656,13 +16359,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
Results);
+ case Intrinsic::x86_rdpmc:
+ return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
}
- case ISD::ATOMIC_CMP_SWAP: {
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
@@ -14704,61 +16409,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+ SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+ MVT::i32, cpOutH.getValue(2));
+ SDValue Success =
+ DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+ Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
- Results.push_back(cpOutH.getValue(1));
+ Results.push_back(Success);
+ Results.push_back(EFLAGS.getValue(1));
return;
}
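// As in LowerCMP_SWAP above (illustrative note): the wide cmpxchg's success
// bit is recovered from ZF via SETE of the EFLAGS copied out after
// CMPXCHG8B/CMPXCHG16B, then zero-extended or truncated to the node's
// declared success type.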
+ case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
- case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_SWAP: {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::ATOMIC_LOAD_ADD:
- Opc = X86ISD::ATOMADD64_DAG;
- break;
- case ISD::ATOMIC_LOAD_AND:
- Opc = X86ISD::ATOMAND64_DAG;
- break;
- case ISD::ATOMIC_LOAD_NAND:
- Opc = X86ISD::ATOMNAND64_DAG;
- break;
- case ISD::ATOMIC_LOAD_OR:
- Opc = X86ISD::ATOMOR64_DAG;
- break;
- case ISD::ATOMIC_LOAD_SUB:
- Opc = X86ISD::ATOMSUB64_DAG;
- break;
- case ISD::ATOMIC_LOAD_XOR:
- Opc = X86ISD::ATOMXOR64_DAG;
- break;
- case ISD::ATOMIC_LOAD_MAX:
- Opc = X86ISD::ATOMMAX64_DAG;
- break;
- case ISD::ATOMIC_LOAD_MIN:
- Opc = X86ISD::ATOMMIN64_DAG;
- break;
- case ISD::ATOMIC_LOAD_UMAX:
- Opc = X86ISD::ATOMUMAX64_DAG;
- break;
- case ISD::ATOMIC_LOAD_UMIN:
- Opc = X86ISD::ATOMUMIN64_DAG;
- break;
- case ISD::ATOMIC_SWAP:
- Opc = X86ISD::ATOMSWAP64_DAG;
- break;
- }
- ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
- return;
- }
+ case ISD::ATOMIC_LOAD_UMAX:
+ // Delegate to generic TypeLegalization. Situations we can really handle
+ // should have already been dealt with by X86AtomicExpand.cpp.
+ break;
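// Context for this change (illustrative): per the comment above,
// X86AtomicExpand is expected to have rewritten oversized atomic RMW
// operations into explicit cmpxchg loops before selection, which is why the
// per-opcode ATOM*64_DAG pseudo expansion that previously lived here (and is
// deleted further down) is no longer needed.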
case ISD::ATOMIC_LOAD: {
ReplaceATOMIC_LOAD(N, Results, DAG);
return;
@@ -14779,6 +16456,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT::v2f64, N->getOperand(0));
SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+ if (ExperimentalVectorWideningLegalization) {
+ // If we are legalizing vectors by widening, we already have the desired
+ // legal vector type, just return it.
+ Results.push_back(ToVecInt);
+ return;
+ }
+
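// Illustrative contrast between the two paths: under widening legalization
// the bitcast result already has the wide legal type, so ToVecInt is the
// final answer; the default path below instead scalarizes, extracting each
// element and letting the narrow vector be reassembled element by element.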
SmallVector<SDValue, 8> Elts;
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
@@ -14810,6 +16494,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
+ case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
@@ -14863,12 +16549,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
- case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
- case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
- case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
- case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
- case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
- case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
@@ -14909,6 +16590,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::PACKSS: return "X86ISD::PACKSS";
+ case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
@@ -15173,7 +16856,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
+ isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+ isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
}
bool
@@ -15256,685 +16940,6 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
return sinkMBB;
}
-// Get CMPXCHG opcode for the specified data type.
-static unsigned getCmpXChgOpcode(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::LCMPXCHG8;
- case MVT::i16: return X86::LCMPXCHG16;
- case MVT::i32: return X86::LCMPXCHG32;
- case MVT::i64: return X86::LCMPXCHG64;
- default:
- break;
- }
- llvm_unreachable("Invalid operand size!");
-}
-
-// Get LOAD opcode for the specified data type.
-static unsigned getLoadOpcode(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::MOV8rm;
- case MVT::i16: return X86::MOV16rm;
- case MVT::i32: return X86::MOV32rm;
- case MVT::i64: return X86::MOV64rm;
- default:
- break;
- }
- llvm_unreachable("Invalid operand size!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction.
-static unsigned getNonAtomicOpcode(unsigned Opc) {
- switch (Opc) {
- case X86::ATOMAND8: return X86::AND8rr;
- case X86::ATOMAND16: return X86::AND16rr;
- case X86::ATOMAND32: return X86::AND32rr;
- case X86::ATOMAND64: return X86::AND64rr;
- case X86::ATOMOR8: return X86::OR8rr;
- case X86::ATOMOR16: return X86::OR16rr;
- case X86::ATOMOR32: return X86::OR32rr;
- case X86::ATOMOR64: return X86::OR64rr;
- case X86::ATOMXOR8: return X86::XOR8rr;
- case X86::ATOMXOR16: return X86::XOR16rr;
- case X86::ATOMXOR32: return X86::XOR32rr;
- case X86::ATOMXOR64: return X86::XOR64rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction with
-// extra opcode.
-static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
- unsigned &ExtraOpc) {
- switch (Opc) {
- case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
- case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
- case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
- case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
- case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
- case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
- case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
- case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
- case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
- case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
- case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
- case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
- case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
- case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
- case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
- case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
- case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
- case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
- case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
- case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target.
-static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
- switch (Opc) {
- case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
- case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
- case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
- case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
- case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
- case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
- case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr;
- case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr;
- case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr;
- case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target with extra opcode.
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
- unsigned &HiOpc,
- unsigned &ExtraOpc) {
- switch (Opc) {
- case X86::ATOMNAND6432:
- ExtraOpc = X86::NOT32r;
- HiOpc = X86::AND32rr;
- return X86::AND32rr;
- }
- llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get pseudo CMOV opcode from the specified data type.
-static unsigned getPseudoCMOVOpc(EVT VT) {
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::i8: return X86::CMOV_GR8;
- case MVT::i16: return X86::CMOV_GR16;
- case MVT::i32: return X86::CMOV_GR32;
- default:
- break;
- }
- llvm_unreachable("Unknown CMOV opcode!");
-}
-
-// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
-// They will be translated into a spin-loop or compare-exchange loop from
-//
-// ...
-// dst = atomic-fetch-op MI.addr, MI.val
-// ...
-//
-// to
-//
-// ...
-// t1 = LOAD MI.addr
-// loop:
-// t4 = phi(t1, t3 / loop)
-// t2 = OP MI.val, t4
-// EAX = t4
-// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
-// t3 = EAX
-// JNE loop
-// sink:
-// dst = t3
-// ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
-
- MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
-
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
- "Unexpected number of operands");
-
- assert(MI->hasOneMemOperand() &&
- "Expected atomic-load-op to have one memoperand");
-
- // Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
- unsigned DstReg, SrcReg;
- unsigned MemOpndSlot;
-
- unsigned CurOp = 0;
-
- DstReg = MI->getOperand(CurOp++).getReg();
- MemOpndSlot = CurOp;
- CurOp += X86::AddrNumOperands;
- SrcReg = MI->getOperand(CurOp++).getReg();
-
- const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- MVT::SimpleValueType VT = *RC->vt_begin();
- unsigned t1 = MRI.createVirtualRegister(RC);
- unsigned t2 = MRI.createVirtualRegister(RC);
- unsigned t3 = MRI.createVirtualRegister(RC);
- unsigned t4 = MRI.createVirtualRegister(RC);
- unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
-
- unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
- unsigned LOADOpc = getLoadOpcode(VT);
-
- // For the atomic load-arith operator, we generate
- //
- // thisMBB:
- // t1 = LOAD [MI.addr]
- // mainMBB:
- // t4 = phi(t1 / thisMBB, t3 / mainMBB)
- // t1 = OP MI.val, EAX
- // EAX = t4
- // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
- // t3 = EAX
- // JNE mainMBB
- // sinkMBB:
- // dst = t3
-
- MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(I, mainMBB);
- MF->insert(I, sinkMBB);
-
- MachineInstrBuilder MIB;
-
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- sinkMBB->splice(sinkMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // thisMBB:
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
- unsigned flags = (*MMOI)->getFlags();
- flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
- MachineMemOperand *MMO =
- MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
- (*MMOI)->getSize(),
- (*MMOI)->getBaseAlignment(),
- (*MMOI)->getTBAAInfo(),
- (*MMOI)->getRanges());
- MIB.addMemOperand(MMO);
- }
-
- thisMBB->addSuccessor(mainMBB);
-
- // mainMBB:
- MachineBasicBlock *origMainMBB = mainMBB;
-
- // Add a PHI.
- MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default:
- llvm_unreachable("Unhandled atomic-load-op opcode!");
- case X86::ATOMAND8:
- case X86::ATOMAND16:
- case X86::ATOMAND32:
- case X86::ATOMAND64:
- case X86::ATOMOR8:
- case X86::ATOMOR16:
- case X86::ATOMOR32:
- case X86::ATOMOR64:
- case X86::ATOMXOR8:
- case X86::ATOMXOR16:
- case X86::ATOMXOR32:
- case X86::ATOMXOR64: {
- unsigned ARITHOpc = getNonAtomicOpcode(Opc);
- BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
- .addReg(t4);
- break;
- }
- case X86::ATOMNAND8:
- case X86::ATOMNAND16:
- case X86::ATOMNAND32:
- case X86::ATOMNAND64: {
- unsigned Tmp = MRI.createVirtualRegister(RC);
- unsigned NOTOpc;
- unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
- BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
- .addReg(t4);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
- break;
- }
- case X86::ATOMMAX8:
- case X86::ATOMMAX16:
- case X86::ATOMMAX32:
- case X86::ATOMMAX64:
- case X86::ATOMMIN8:
- case X86::ATOMMIN16:
- case X86::ATOMMIN32:
- case X86::ATOMMIN64:
- case X86::ATOMUMAX8:
- case X86::ATOMUMAX16:
- case X86::ATOMUMAX32:
- case X86::ATOMUMAX64:
- case X86::ATOMUMIN8:
- case X86::ATOMUMIN16:
- case X86::ATOMUMIN32:
- case X86::ATOMUMIN64: {
- unsigned CMPOpc;
- unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
-
- BuildMI(mainMBB, DL, TII->get(CMPOpc))
- .addReg(SrcReg)
- .addReg(t4);
-
- if (Subtarget->hasCMov()) {
- if (VT != MVT::i8) {
- // Native support
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
- .addReg(SrcReg)
- .addReg(t4);
- } else {
- // Promote i8 to i32 to use CMOV32
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
- const TargetRegisterClass *RC32 =
- TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
- unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
- unsigned AccReg32 = MRI.createVirtualRegister(RC32);
- unsigned Tmp = MRI.createVirtualRegister(RC32);
-
- unsigned Undef = MRI.createVirtualRegister(RC32);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
-
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
- .addReg(Undef)
- .addReg(SrcReg)
- .addImm(X86::sub_8bit);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
- .addReg(Undef)
- .addReg(t4)
- .addImm(X86::sub_8bit);
-
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
- .addReg(SrcReg32)
- .addReg(AccReg32);
-
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
- .addReg(Tmp, 0, X86::sub_8bit);
- }
- } else {
- // Use pseudo select and lower them.
- assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
- "Invalid atomic-load-op transformation!");
- unsigned SelOpc = getPseudoCMOVOpc(VT);
- X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
- assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
- MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
- .addReg(SrcReg).addReg(t4)
- .addImm(CC);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // Replace the original PHI node as mainMBB is changed after CMOV
- // lowering.
- BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
- Phi->eraseFromParent();
- }
- break;
- }
- }
-
- // Copy PhyReg back from virtual register.
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
- .addReg(t4);
-
- MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- MIB.addReg(t2);
- MIB.setMemRefs(MMOBegin, MMOEnd);
-
- // Copy PhyReg back to virtual register.
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
- .addReg(PhyReg);
-
- BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
- mainMBB->addSuccessor(origMainMBB);
- mainMBB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstReg)
- .addReg(t3);
-
- MI->eraseFromParent();
- return sinkMBB;
-}
-
-// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
-// instructions. They will be translated into a spin-loop or compare-exchange
-// loop from
-//
-// ...
-// dst = atomic-fetch-op MI.addr, MI.val
-// ...
-//
-// to
-//
-// ...
-// t1L = LOAD [MI.addr + 0]
-// t1H = LOAD [MI.addr + 4]
-// loop:
-// t4L = phi(t1L, t3L / loop)
-// t4H = phi(t1H, t3H / loop)
-// t2L = OP MI.val.lo, t4L
-// t2H = OP MI.val.hi, t4H
-// EAX = t4L
-// EDX = t4H
-// EBX = t2L
-// ECX = t2H
-// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-// t3L = EAX
-// t3H = EDX
-// JNE loop
-// sink:
-// dstL = t3L
-// dstH = t3H
-// ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
-
- MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
-
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
- "Unexpected number of operands");
-
- assert(MI->hasOneMemOperand() &&
- "Expected atomic-load-op32 to have one memoperand");
-
- // Memory Reference
- MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
- MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
- unsigned DstLoReg, DstHiReg;
- unsigned SrcLoReg, SrcHiReg;
- unsigned MemOpndSlot;
-
- unsigned CurOp = 0;
-
- DstLoReg = MI->getOperand(CurOp++).getReg();
- DstHiReg = MI->getOperand(CurOp++).getReg();
- MemOpndSlot = CurOp;
- CurOp += X86::AddrNumOperands;
- SrcLoReg = MI->getOperand(CurOp++).getReg();
- SrcHiReg = MI->getOperand(CurOp++).getReg();
-
- const TargetRegisterClass *RC = &X86::GR32RegClass;
- const TargetRegisterClass *RC8 = &X86::GR8RegClass;
-
- unsigned t1L = MRI.createVirtualRegister(RC);
- unsigned t1H = MRI.createVirtualRegister(RC);
- unsigned t2L = MRI.createVirtualRegister(RC);
- unsigned t2H = MRI.createVirtualRegister(RC);
- unsigned t3L = MRI.createVirtualRegister(RC);
- unsigned t3H = MRI.createVirtualRegister(RC);
- unsigned t4L = MRI.createVirtualRegister(RC);
- unsigned t4H = MRI.createVirtualRegister(RC);
-
- unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
- unsigned LOADOpc = X86::MOV32rm;
-
- // For the atomic load-arith operator, we generate
- //
- // thisMBB:
- // t1L = LOAD [MI.addr + 0]
- // t1H = LOAD [MI.addr + 4]
- // mainMBB:
- // t4L = phi(t1L / thisMBB, t3L / mainMBB)
- // t4H = phi(t1H / thisMBB, t3H / mainMBB)
- // t2L = OP MI.val.lo, t4L
- // t2H = OP MI.val.hi, t4H
- // EBX = t2L
- // ECX = t2H
- // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
- // t3L = EAX
- // t3H = EDX
- // JNE loop
- // sinkMBB:
- // dstL = t3L
- // dstH = t3H
-
- MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
- MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
- MF->insert(I, mainMBB);
- MF->insert(I, sinkMBB);
-
- MachineInstrBuilder MIB;
-
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- sinkMBB->splice(sinkMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
- // thisMBB:
- // Lo
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
- unsigned flags = (*MMOI)->getFlags();
- flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
- MachineMemOperand *MMO =
- MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
- (*MMOI)->getSize(),
- (*MMOI)->getBaseAlignment(),
- (*MMOI)->getTBAAInfo(),
- (*MMOI)->getRanges());
- MIB.addMemOperand(MMO);
- };
- MachineInstr *LowMI = MIB;
-
- // Hi
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
- } else {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- }
- MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
-
- thisMBB->addSuccessor(mainMBB);
-
- // mainMBB:
- MachineBasicBlock *origMainMBB = mainMBB;
-
- // Add PHIs.
- MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
- .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
- MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
- .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-
- unsigned Opc = MI->getOpcode();
- switch (Opc) {
- default:
- llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
- case X86::ATOMAND6432:
- case X86::ATOMOR6432:
- case X86::ATOMXOR6432:
- case X86::ATOMADD6432:
- case X86::ATOMSUB6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
- .addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
- .addReg(SrcHiReg);
- break;
- }
- case X86::ATOMNAND6432: {
- unsigned HiOpc, NOTOpc;
- unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
- unsigned TmpL = MRI.createVirtualRegister(RC);
- unsigned TmpH = MRI.createVirtualRegister(RC);
- BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
- .addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
- .addReg(t4H);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
- break;
- }
- case X86::ATOMMAX6432:
- case X86::ATOMMIN6432:
- case X86::ATOMUMAX6432:
- case X86::ATOMUMIN6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- unsigned cL = MRI.createVirtualRegister(RC8);
- unsigned cH = MRI.createVirtualRegister(RC8);
- unsigned cL32 = MRI.createVirtualRegister(RC);
- unsigned cH32 = MRI.createVirtualRegister(RC);
- unsigned cc = MRI.createVirtualRegister(RC);
- // cl := cmp src_lo, lo
- BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcLoReg).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
- BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
- // ch := cmp src_hi, hi
- BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcHiReg).addReg(t4H);
- BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
- BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
- // cc := if (src_hi == hi) ? cl : ch;
- if (Subtarget->hasCMov()) {
- BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
- .addReg(cH32).addReg(cL32);
- } else {
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
- .addReg(cH32).addReg(cL32)
- .addImm(X86::COND_E);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- }
- BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
- if (Subtarget->hasCMov()) {
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
- .addReg(SrcLoReg).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
- .addReg(SrcHiReg).addReg(t4H);
- } else {
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
- .addReg(SrcLoReg).addReg(t4L)
- .addImm(X86::COND_NE);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
- // 2nd CMOV lowering.
- mainMBB->addLiveIn(X86::EFLAGS);
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
- .addReg(SrcHiReg).addReg(t4H)
- .addImm(X86::COND_NE);
- mainMBB = EmitLoweredSelect(MIB, mainMBB);
- // Replace the original PHI node as mainMBB is changed after CMOV
- // lowering.
- BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
- .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
- BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
- .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
- PhiL->eraseFromParent();
- PhiH->eraseFromParent();
- }
- break;
- }
- case X86::ATOMSWAP6432: {
- unsigned HiOpc;
- unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
- break;
- }
- }
-
- // Copy EDX:EAX back from HiReg:LoReg
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
- // Copy ECX:EBX from t1H:t1L
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
-
- MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
- if (NewMO.isReg())
- NewMO.setIsKill(false);
- MIB.addOperand(NewMO);
- }
- MIB.setMemRefs(MMOBegin, MMOEnd);
-
- // Copy EDX:EAX back to t3H:t3L
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
-
- BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
- mainMBB->addSuccessor(origMainMBB);
- mainMBB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstLoReg)
- .addReg(t3L);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(TargetOpcode::COPY), DstHiReg)
- .addReg(t3H);
-
- MI->eraseFromParent();
- return sinkMBB;
-}
-
// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX, all of this code can be replaced with that
// in the .td file.
@@ -16068,7 +17073,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -16324,7 +17329,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
@@ -16407,7 +17412,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -16433,7 +17438,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@@ -16474,9 +17479,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
bool Is64Bit) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
@@ -16546,7 +17551,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
- getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -16594,8 +17599,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMacho());
@@ -16651,10 +17656,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
+ MachineFunction *F = BB->getParent();
const X86InstrInfo *TII
- = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+ = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
DebugLoc DL = MI->getDebugLoc();
- MachineFunction *F = BB->getParent();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
assert(MI->getOperand(3).isGlobal() && "This should be a global");
@@ -16663,7 +17668,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -16675,7 +17680,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
- } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+ } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
@@ -16707,9 +17712,8 @@ MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -16771,8 +17775,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
- Reloc::Model RM = getTargetMachine().getRelocationModel();
- bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+ Reloc::Model RM = MF->getTarget().getRelocationModel();
+ bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
(RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
// Prepare IP either in reg or imm.
@@ -16816,7 +17820,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -16845,9 +17849,8 @@ MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
@@ -16863,7 +17866,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+ static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -17038,12 +18041,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineFunction *F = BB->getParent();
+ const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- MachineFunction *F = BB->getParent();
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), CWFrameIdx);
@@ -17123,7 +18126,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -17136,71 +18139,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
+ return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
-
- // Atomic Lowering.
- case X86::ATOMAND8:
- case X86::ATOMAND16:
- case X86::ATOMAND32:
- case X86::ATOMAND64:
- // Fall through
- case X86::ATOMOR8:
- case X86::ATOMOR16:
- case X86::ATOMOR32:
- case X86::ATOMOR64:
- // Fall through
- case X86::ATOMXOR16:
- case X86::ATOMXOR8:
- case X86::ATOMXOR32:
- case X86::ATOMXOR64:
- // Fall through
- case X86::ATOMNAND8:
- case X86::ATOMNAND16:
- case X86::ATOMNAND32:
- case X86::ATOMNAND64:
- // Fall through
- case X86::ATOMMAX8:
- case X86::ATOMMAX16:
- case X86::ATOMMAX32:
- case X86::ATOMMAX64:
- // Fall through
- case X86::ATOMMIN8:
- case X86::ATOMMIN16:
- case X86::ATOMMIN32:
- case X86::ATOMMIN64:
- // Fall through
- case X86::ATOMUMAX8:
- case X86::ATOMUMAX16:
- case X86::ATOMUMAX32:
- case X86::ATOMUMAX64:
- // Fall through
- case X86::ATOMUMIN8:
- case X86::ATOMUMIN16:
- case X86::ATOMUMIN32:
- case X86::ATOMUMIN64:
- return EmitAtomicLoadArith(MI, BB);
-
- // This group does 64-bit operations on a 32-bit host.
- case X86::ATOMAND6432:
- case X86::ATOMOR6432:
- case X86::ATOMXOR6432:
- case X86::ATOMNAND6432:
- case X86::ATOMADD6432:
- case X86::ATOMSUB6432:
- case X86::ATOMMAX6432:
- case X86::ATOMMIN6432:
- case X86::ATOMUMAX6432:
- case X86::ATOMUMIN6432:
- case X86::ATOMSWAP6432:
- return EmitAtomicLoadArith6432(MI, BB);
+ return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -17473,13 +18420,385 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// \brief Get the PSHUF-style mask from a PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to ease forming
+/// v4 PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ SmallVector<int, 4> Mask;
+ bool IsUnary;
+  bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(),
+                                       Mask, IsUnary);
+ (void)HaveMask;
+ assert(HaveMask);
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ return Mask;
+ case X86ISD::PSHUFLW:
+ Mask.resize(4);
+ return Mask;
+ case X86ISD::PSHUFHW:
+ Mask.erase(Mask.begin(), Mask.begin() + 4);
+ for (int &M : Mask)
+ M -= 4;
+ return Mask;
+ default:
+ llvm_unreachable("No valid shuffle instruction found!");
+ }
+}
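+
+// A quick worked example of the normalization above: a PSHUFLW with the full
+// v8i16 mask <2,3,0,1,4,5,6,7> is truncated to the 4-element mask <2,3,0,1>,
+// while a PSHUFHW with <0,1,2,3,7,6,5,4> drops its identity low half and
+// rebases to <3,2,1,0>. All three opcodes can then be reasoned about in the
+// same 4-element, 0-3 index space.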
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N.getOpcode() == X86ISD::PSHUFD &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+      // Skip bitcasts, as we always know the type for the target-specific
+      // instructions.
+ continue;
+
+ case X86ISD::PSHUFD:
+ // Found another dword shuffle.
+ break;
+
+ case X86ISD::PSHUFLW:
+ // Check that the low words (being shuffled) are the identity in the
+ // dword shuffle, and the high words are self-contained.
+ if (Mask[0] != 0 || Mask[1] != 1 ||
+ !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+ return false;
+
+ continue;
+
+ case X86ISD::PSHUFHW:
+ // Check that the high words (being shuffled) are the identity in the
+ // dword shuffle, and the low words are self-contained.
+ if (Mask[2] != 2 || Mask[3] != 3 ||
+ !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+ return false;
+
+ continue;
+
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+ // shuffle into a preceding word shuffle.
+ if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ return false;
+
+ // Search for a half-shuffle which we can combine with.
+ unsigned CombineOp =
+ V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ if (V.getOperand(0) != V.getOperand(1) ||
+ !V->isOnlyUserOf(V.getOperand(0).getNode()))
+ return false;
+ V = V.getOperand(0);
+ do {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing to combine.
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOp)
+ break;
+
+ // Fallthrough!
+ case ISD::BITCAST:
+ V = V.getOperand(0);
+ continue;
+ }
+ break;
+ } while (V.hasOneUse());
+ break;
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // It is possible that one of the combinable shuffles was completely absorbed
+  // by the other. In that case, just replace it and revisit all users.
+ if (Old.getNode() == V.getNode()) {
+ DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+ return true;
+ }
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
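+
+// To illustrate the mask merge just above: if V is a PSHUFD with mask
+// <1,0,3,2> and N is a PSHUFD with mask <2,3,0,1>, the composed mask is
+// VMask[Mask[i]] for each i, i.e. <3,2,1,0>, and the two dword shuffles
+// collapse into a single PSHUFD with that immediate.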
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw
+/// or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves, trying to find a shuffle of the
+/// same pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(
+ (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+ unsigned CombineOpcode = N.getOpcode();
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+      // Skip bitcasts, as we always know the type for the target-specific
+      // instructions.
+ continue;
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOpcode)
+ break;
+
+ // Other-half shuffles are no-ops.
+ continue;
+
+ case X86ISD::PSHUFD: {
+      // We can only handle pshufd if the half we are combining either stays
+      // in its half or switches to the other half. Bail if neither is true.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
+ if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
+ (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
+ return false;
+
+ // Map the mask through the pshufd and keep walking up the chain.
+ for (int i = 0; i < 4; ++i)
+ Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
+
+ // Switch halves if the pshufd does.
+ CombineOpcode =
+ VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ continue;
+ }
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask (adjusted to account for all
+ // the pshufd instructions encountered).
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
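+
+// A worked example of the PSHUFD remapping above: with CombineOpcode ==
+// PSHUFLW (DOffset == 0), a PSHUFD mask VMask = <1,0,2,3> and an incoming
+// word mask <2,3,0,1>, each word index w maps to
+// 2 * (VMask[w / 2] % 2) + w % 2, giving <0,1,2,3>: the low-dword swap in
+// the PSHUFD cancels the word-level swap, leaving an identity low half.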
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N.getSimpleValueType();
+ SmallVector<int, 4> Mask;
+
+ switch (N.getOpcode()) {
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ Mask = getPSHUFShuffleMask(N);
+ assert(Mask.size() == 4);
+ break;
+ default:
+ return SDValue();
+ }
+
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
+ SDValue V = N.getOperand(0);
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ assert(VT == MVT::v8i16);
+ (void)VT;
+
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
+ // See if this reduces to a PSHUFD which is no more expensive and can
+ // combine with more operations.
+ if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
+ areAdjacentMasksSequential(Mask)) {
+ int DMask[] = {-1, -1, -1, -1};
+ int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+ DMask[DOffset + 0] = DOffset + Mask[0] / 2;
+ DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ DCI.AddToWorklist(V.getNode());
+ V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ getV4X86ShuffleImm8ForMask(DMask, DAG));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ }
+
+ // Look for shuffle patterns which can be implemented as a single unpack.
+ // FIXME: This doesn't handle the location of the PSHUFD generically, and
+ // only works when we have a PSHUFD followed by two half-shuffles.
+ if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+ (V.getOpcode() == X86ISD::PSHUFLW ||
+ V.getOpcode() == X86ISD::PSHUFHW) &&
+ V.getOpcode() != N.getOpcode() &&
+ V.hasOneUse()) {
+ SDValue D = V.getOperand(0);
+ while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+ D = D.getOperand(0);
+ if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+ int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+ int WordMask[8];
+ for (int i = 0; i < 4; ++i) {
+ WordMask[i + NOffset] = Mask[i] + NOffset;
+ WordMask[i + VOffset] = VMask[i] + VOffset;
+ }
+ // Map the word mask through the DWord mask.
+ int MappedMask[8];
+ for (int i = 0; i < 8; ++i)
+ MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+ const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+ const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+ if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+ std::begin(UnpackLoMask)) ||
+ std::equal(std::begin(MappedMask), std::end(MappedMask),
+ std::begin(UnpackHiMask))) {
+ // We can replace all three shuffles with an unpack.
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ DCI.AddToWorklist(V.getNode());
+ return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+ : X86ISD::UNPCKH,
+ DL, MVT::v8i16, V, V);
+ }
+ }
+ }
+
+ break;
+
+ case X86ISD::PSHUFD:
+ if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle.
+
+ break;
+ }
+
+ return SDValue();
+}
+
/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ // Canonicalize shuffles that perform 'addsub' on packed float vectors
+ // according to the rule:
+ // (shuffle (FADD A, B), (FSUB A, B), Mask) ->
+ // (shuffle (FSUB A, -B), (FADD A, -B), Mask)
+ //
+ // Where 'Mask' is:
+ // <0,5,2,7> -- for v4f32 and v4f64 shuffles;
+ // <0,3> -- for v2f64 shuffles;
+ // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles.
+ //
+  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
+  // during the ISel stage.
+ if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+ N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
+ // Operands to the FADD and FSUB must be the same.
+ ((N0->getOperand(0) == N1->getOperand(0) &&
+ N0->getOperand(1) == N1->getOperand(1)) ||
+ // FADD is commutable. See if by commuting the operands of the FADD
+ // we would still be able to match the operands of the FSUB dag node.
+ (N0->getOperand(1) == N1->getOperand(0) &&
+ N0->getOperand(0) == N1->getOperand(1))) &&
+ N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
+ N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
+
+ ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+ unsigned NumElts = VT.getVectorNumElements();
+ ArrayRef<int> Mask = SV->getMask();
+ bool CanFold = true;
+
+ for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
+ CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
+
+ if (CanFold) {
+ SDValue Op0 = N1->getOperand(0);
+ SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
+ SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
+ return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
+ }
+ }
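+  // Elementwise, for v4f32 with Mask <0,5,2,7>, both forms compute
+  // { A0+B0, A1-B1, A2+B2, A3-B3 }: the rewrite re-expresses the value as
+  // FSUB/FADD of (A, -B), so even lanes come from an FSUB and odd lanes from
+  // an FADD, which is the shape the ADDSUB patterns expect (ADDSUBPS
+  // subtracts in even lanes and adds in odd lanes).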
+
// Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -17490,6 +18809,57 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+ // During Type Legalization, when promoting illegal vector types,
+ // the backend might introduce new shuffle dag nodes and bitcasts.
+ //
+ // This code performs the following transformation:
+ // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+ // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+ //
+ // We do this only if both the bitcast and the BINOP dag nodes have
+ // one use. Also, perform this transformation only if the new binary
+ // operation is legal. This is to avoid introducing dag nodes that
+ // potentially need to be further expanded (or custom lowered) into a
+ // less optimal sequence of dag nodes.
+ if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+ N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+ N0.getOpcode() == ISD::BITCAST) {
+ SDValue BC0 = N0.getOperand(0);
+ EVT SVT = BC0.getValueType();
+ unsigned Opcode = BC0.getOpcode();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (BC0.hasOneUse() && SVT.isVector() &&
+ SVT.getVectorNumElements() * 2 == NumElts &&
+ TLI.isOperationLegal(Opcode, VT)) {
+ bool CanFold = false;
+ switch (Opcode) {
+ default : break;
+ case ISD::ADD :
+ case ISD::FADD :
+ case ISD::SUB :
+ case ISD::FSUB :
+ case ISD::MUL :
+ case ISD::FMUL :
+ CanFold = true;
+ }
+
+ unsigned SVTNumElts = SVT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+ for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+ CanFold = SVOp->getMaskElt(i) < 0;
+
+ if (CanFold) {
+ SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+ return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+ }
+ }
+ }
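+  // Concretely: with VT = v8i16 and SVT = v4i32 (so SVTNumElts * 2 ==
+  // NumElts), the fold fires for a shuffle mask of <0,2,4,6,u,u,u,u> -- the
+  // first four elements must be 0,2,4,6 and the rest undef -- and the BINOP
+  // is rebuilt directly on v8i16 operands, so the shuffle now sees a legal
+  // v8i16 operation rather than a bitcast of a v4i32 one.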
+
// Only handle 128 wide vector from here on.
if (!VT.is128BitVector())
return SDValue();
@@ -17501,7 +18871,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
- return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
+ if (LD.getNode())
+ return LD;
+
+ if (isTargetShuffle(N->getOpcode())) {
+ SDValue Shuffle =
+ PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ return SDValue();
}
/// PerformTruncateCombine - Converts truncate operation to
@@ -18155,28 +19536,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
- // If the RHS is a constant we have to reverse the const canonicalization.
- // x > C-1 ? x+-C : 0 --> subus x, C
- if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
- isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
- APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
- if (CondRHS.getConstantOperandVal(0) == -A-1)
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(-A, VT));
- }
-
- // Another special case: If C was a sign bit, the sub has been
- // canonicalized into a xor.
- // FIXME: Would it be better to use computeKnownBits to determine whether
- // it's safe to decanonicalize the xor?
- // x s< 0 ? x^C : 0 --> subus x, C
- if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- isSplatVector(OpRHS.getNode())) {
- APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
- if (A.isSignBit())
- return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
- }
+ if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+ if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+ if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+ if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ CondRHSConst->getAPIntValue() ==
+ (-OpRHSConst->getAPIntValue() - 1))
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use computeKnownBits to determine
+ // whether it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ OpRHSConst->getAPIntValue().isSignBit())
+ // Note that we have to rebuild the RHS constant here to ensure we
+ // don't rely on particular values of undef lanes.
+ return DAG.getNode(
+ X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+ }
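+      // Two concrete v8i16 instances of these folds: with C = 10, the
+      // canonicalized (select (setugt x, 9), (add x, 65526), 0) becomes
+      // (subus x, 10), since 65526 == -10 mod 2^16 and the compare guards
+      // the underflow. With C = 0x8000 (the sign bit), (select (setlt x, 0),
+      // (xor x, 0x8000), 0) becomes (subus x, 0x8000): a negative x has its
+      // top bit set, so the xor is exactly the borrow-free subtraction,
+      // while a non-negative x saturates to 0.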
}
}
@@ -18743,6 +20130,8 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
if (C->isAllOnesValue())
return Op1;
}
+
+ return SDValue();
}
// Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
@@ -18882,16 +20271,15 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
- if (isSplatVector(N1.getNode())) {
- assert(N0.getValueType().isVector() && "Invalid vector shift type");
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
- // We shift all of the values by one. In many cases we do not have
- // hardware support for this operation. This is better expressed as an ADD
- // of two values.
- if (N1C && (1 == N1C->getZExtValue())) {
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+ assert(N0.getValueType().isVector() && "Invalid vector shift type");
+ // We shift all of the values by one. In many cases we do not have
+ // hardware support for this operation. This is better expressed as an ADD
+ // of two values.
+ if (N1SplatC->getZExtValue() == 1)
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
- }
return SDValue();
}
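 
 // For instance, (shl (v4i32 X), (splat 1)) becomes (add X, X): vector shifts
 // by a uniform 1 often lack a cheap hardware form, while the equivalent
 // vector add is always available and, as noted above, faster than shl on
 // Sandy Bridge.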
@@ -18910,10 +20298,9 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
SDValue Amt = N->getOperand(1);
SDLoc DL(N);
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
- APInt ShiftAmt = C->getAPIntValue();
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+ if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+ APInt ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
@@ -18923,7 +20310,6 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
if (ShiftAmt.trunc(8).uge(MaxAmount))
return getZeroVector(VT, Subtarget, DAG, DL);
}
- }
return SDValue();
}
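 
 // Example: (srl (v4i32 X), (splat 32)) folds to the all-zeros vector. The
 // SSE2/AVX2 packed logical shifts zero each lane once the amount reaches the
 // element width (unlike x86 scalar shifts, which mask the shift amount), so
 // the result is known to be 0 without emitting any shift.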
@@ -19117,9 +20503,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
- bool RHSConst = (isSplatVector(N1.getNode()) &&
- isa<ConstantSDNode>(N1->getOperand(0)));
- if (!RHSTrunc && !RHSConst)
+ ConstantSDNode *RHSConstSplat = nullptr;
+ if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+ RHSConstSplat = RHSBV->getConstantSplatNode();
+ if (!RHSTrunc && !RHSConstSplat)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -19129,9 +20516,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0->getOperand(0);
- if (RHSConst) {
+ if (RHSConstSplat) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
- N1->getOperand(0));
+ SDValue(RHSConstSplat, 0));
SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
} else if (RHSTrunc) {
@@ -19277,12 +20664,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
unsigned SraAmt = ~0;
if (Mask.getOpcode() == ISD::SRA) {
- SDValue Amt = Mask.getOperand(1);
- if (isSplatVector(Amt.getNode())) {
- SDValue SclrAmt = Amt->getOperand(0);
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
- SraAmt = C->getZExtValue();
- }
+ if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+ if (auto *AmtConst = AmtBV->getConstantSplatNode())
+ SraAmt = AmtConst->getZExtValue();
} else if (Mask.getOpcode() == X86ISD::VSRAI) {
SDValue SraC = Mask.getOperand(1);
SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
@@ -20642,6 +22026,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
case X86ISD::INSERTPS:
return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
return SDValue();
@@ -21146,8 +22531,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
- if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
- getTargetMachine())))
+ if (isGlobalStubReference(
+ Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
return;
Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
@@ -21425,3 +22810,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
return AM.Scale != 0;
return -1;
}
+
+bool X86TargetLowering::isTargetFTOL() const {
+ return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 9f51b53..c8cdce7 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -15,13 +15,13 @@
#ifndef X86ISELLOWERING_H
#define X86ISELLOWERING_H
-#include "X86Subtarget.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
+ class X86Subtarget;
class X86TargetMachine;
namespace X86ISD {
@@ -86,6 +86,9 @@ namespace llvm {
/// X86 Read Time-Stamp Counter and Processor ID.
RDTSCP_DAG,
+ /// X86 Read Performance Monitoring Counters.
+ RDPMC_DAG,
+
/// X86 compare and logical compare instructions.
CMP, COMI, UCOMI,
@@ -315,6 +318,8 @@ namespace llvm {
KORTEST,
// Several flavors of instructions with vector shuffle behaviors.
+ PACKSS,
+ PACKUS,
PALIGNR,
PSHUFD,
PSHUFHW,
@@ -400,23 +405,8 @@ namespace llvm {
// XTEST - Test if in transactional execution.
XTEST,
- // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
- // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
- // Atomic 64-bit binary operations.
- ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
- ATOMSUB64_DAG,
- ATOMOR64_DAG,
- ATOMXOR64_DAG,
- ATOMAND64_DAG,
- ATOMNAND64_DAG,
- ATOMMAX64_DAG,
- ATOMMIN64_DAG,
- ATOMUMAX64_DAG,
- ATOMUMIN64_DAG,
- ATOMSWAP64_DAG,
-
// LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
- LCMPXCHG_DAG,
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
LCMPXCHG16_DAG,
@@ -766,9 +756,7 @@ namespace llvm {
/// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
/// for fptoui.
- bool isTargetFTOL() const {
- return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
- }
+ bool isTargetFTOL() const;
/// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
/// used for fptoui to the given type.
@@ -808,6 +796,9 @@ namespace llvm {
/// \brief Reset the operation actions based on target options.
void resetOperationActions() override;
+ /// \brief Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
protected:
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(MVT VT) const override;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 37bcc52..41e900e 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -476,6 +476,28 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VT1>;
+multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ RegisterClass KRC> {
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+ def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask,
+ x86memop:$src),
+ !strconcat(OpcodeStr,
+ " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+ }
+}
+
+defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+ i128mem, loadv2i64, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+ i256mem, loadv4i64, VK16WM>, VEX_W,
+ EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
(VPBROADCASTDZrr VR128X:$src)>;
def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
@@ -517,10 +539,12 @@ def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
[]>, EVEX;
}
+let Predicates = [HasCDI] in {
defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
VK16, v16i32, v16i1>, EVEX_V512;
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+}
//===----------------------------------------------------------------------===//
// AVX-512 - VPERM
@@ -585,7 +609,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
// -- VPERM2I - 3 source operands form --
multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
PatFrag mem_frag, X86MemOperand x86memop,
- SDNode OpNode, ValueType OpVT> {
+ SDNode OpNode, ValueType OpVT, RegisterClass KRC> {
let Constraints = "$src1 = $dst" in {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
@@ -595,48 +619,107 @@ let Constraints = "$src1 = $dst" in {
(OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
EVEX_4V;
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}}|"
+ "$dst {${mask}}, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ RC:$src3),
+ RC:$src1)))]>,
+ EVEX_4V, EVEX_K;
+
+ let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+                        " \t{$src3, $src2, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ RC:$src3),
+ (OpVT (bitconvert
+ (v16i32 immAllZerosV))))))]>,
+ EVEX_4V, EVEX_KZ;
+
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
" \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, RC:$src2,
+ (OpVT (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3))))]>, EVEX_4V;
+
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}}|"
+ "$dst {${mask}}, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)),
+ RC:$src1)))]>,
+ EVEX_4V, EVEX_K;
+
+ let AddedComplexity = 10 in // Prefer over the rrkz variant
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ " \t{$src3, $src2, $dst {${mask}} {z}|"
+ "$dst {${mask}} {z}, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (vselect KRC:$mask,
+ (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)),
+ (OpVT (bitconvert
+ (v16i32 immAllZerosV))))))]>,
+ EVEX_4V, EVEX_KZ;
}
}
-defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
- X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
- X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
- X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
- X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem,
- X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem,
- X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
- X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
- X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
- (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
- (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
- (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
- (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
- (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
- (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
-
-def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
- (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
- (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32,
+ i512mem, X86VPermiv3, v16i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64,
+ i512mem, X86VPermiv3, v8i64, VK8WM>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32,
+ i512mem, X86VPermiv3, v16f32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64,
+ i512mem, X86VPermiv3, v8f64, VK8WM>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, ValueType OpVT, RegisterClass KRC,
+ ValueType MaskVT, RegisterClass MRC> :
+ avx512_perm_3src<opc, "vpermt2"##Suffix, RC, mem_frag, x86memop, OpNode,
+ OpVT, KRC> {
+ def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+ VR512:$idx, VR512:$src1, VR512:$src2, -1)),
+ (!cast<Instruction>(NAME#rr) VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+ def : Pat<(OpVT (!cast<Intrinsic>("int_x86_avx512_mask_vpermt_"##Suffix##"_512")
+ VR512:$idx, VR512:$src1, VR512:$src2, MRC:$mask)),
+ (!cast<Instruction>(NAME#rrk) VR512:$src1,
+ (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>;
+}
+
+defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem,
+ X86VPermv3, v16i32, VK16WM, v16i1, GR16>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem,
+ X86VPermv3, v8i64, VK8WM, v8i1, GR8>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem,
+ X86VPermv3, v16f32, VK16WM, v16i1, GR16>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem,
+ X86VPermv3, v8f64, VK8WM, v8i1, GR8>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
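+
+// For example, the VPERMT2D instantiation above expands its !cast patterns
+// to roughly:
+//   def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 VR512:$idx,
+//                      VR512:$src1, VR512:$src2, -1)),
+//             (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
+// while the GR16-masked form selects VPERMT2Drrk through a COPY_TO_REGCLASS
+// of the mask into VK16WM.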
+
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
@@ -790,52 +873,61 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
(v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
-multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC,
RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
- SDNode OpNode, ValueType vt, Operand CC, string asm,
- string asm_alt> {
+ SDNode OpNode, ValueType vt, Operand CC, string Suffix> {
def rri : AVX512AIi8<opc, MRMSrcReg,
- (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rmi : AVX512AIi8<opc, MRMSrcMem,
- (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc),
+ !strconcat("vpcmp${cc}", Suffix,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
(outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc),
+ !strconcat("vpcmp", Suffix,
+ "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+ [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
}
}
-defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
- X86cmpm, v16i32, AVXCC,
- "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
- X86cmpmu, v16i32, AVXCC,
- "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
- X86cmpm, v8i64, AVXCC,
- "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
- X86cmpmu, v8i64, AVXCC,
- "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-// avx512_cmp_packed - sse 1 & 2 compare packed instructions
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32,
+ X86cmpm, v16i32, AVXCC, "d">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32,
+ X86cmpmu, v16i32, AVXCC, "ud">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64,
+ X86cmpm, v8i64, AVXCC, "q">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64,
+ X86cmpmu, v8i64, AVXCC, "uq">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - compare packed instructions
multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
X86MemOperand x86memop, ValueType vt,
string suffix, Domain d> {
@@ -859,11 +951,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
!strconcat("vcmp", suffix,
" \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
!strconcat("vcmp", suffix,
" \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>;
}
@@ -1788,6 +1880,46 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
(SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+
+def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst,
+ (int_x86_avx512_movntdqa addr:$src))]>,
+ EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// Prefer non-temporal over temporal versions
+let AddedComplexity = 400, SchedRW = [WriteStore] in {
+
+def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
+ (ins f512mem:$dst, VR512:$src),
+ "vmovntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v16f32 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
+ (ins f512mem:$dst, VR512:$src),
+ "vmovntpd\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8f64 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v8i64 VR512:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>,
+ EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+}
+
+//===----------------------------------------------------------------------===//
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3161,6 +3293,10 @@ def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
(VCVTDQ2PSZrrb VR512:$src, imm:$rc)>;
@@ -4343,6 +4479,37 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
(VPCONFLICTQrrk VR512:$src1,
(v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+let Predicates = [HasCDI] in {
+defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM,
+ i512mem, i32mem, "{1to16}">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM,
+ i512mem, i64mem, "{1to8}">,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+}
+
+def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1,
+ GR16:$mask),
+ (VPLZCNTDrrk VR512:$src1,
+ (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>;
+
+def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1,
+ GR8:$mask),
+ (VPLZCNTQrrk VR512:$src1,
+ (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))),
+ (VPLZCNTDrm addr:$src)>;
+def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))),
+ (VPLZCNTDrr VR512:$src)>;
+def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))),
+ (VPLZCNTQrm addr:$src)>;
+def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))),
+ (VPLZCNTQrr VR512:$src)>;
+
def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 368e14b..f2574cc 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1278,8 +1278,10 @@ let isCompare = 1 in {
def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
// When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
- // register class is constrained to GR8_NOREX.
- let isPseudo = 1 in
+ // register class is constrained to GR8_NOREX. This pseudo is explicitly
+ // marked side-effect free, since it doesn't have an isel pattern like
+ // other test instructions.
+ let isPseudo = 1, hasSideEffects = 0 in
def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
"", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
} // Defs = [EFLAGS]
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 34d8fb9..ca4f608 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -110,7 +110,7 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
// When using segmented stacks these are lowered into instructions which first
// check if the current stacklet has enough free memory. If it does, memory is
-// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
// the heap.
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
@@ -197,6 +197,26 @@ let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
}
//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+ def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+ "#SEH_PushReg $reg", []>;
+ def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveReg $reg, $dst", []>;
+ def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+ "#SEH_SaveXMM $reg, $dst", []>;
+ def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+ "#SEH_StackAlloc $size", []>;
+ def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+ "#SEH_SetFrame $reg, $offset", []>;
+ def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+ "#SEH_PushFrame $mode", []>;
+ def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+ "#SEH_EndPrologue", []>;
+}
+
+//===----------------------------------------------------------------------===//
// Pseudo instructions used by segmented stacks.
//
@@ -371,7 +391,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
[(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
Requires<[In64BitMode]>;
-
+
let Uses = [RAX,RCX,RDI] in
def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
[(X86rep_stos i64)], IIC_REP_STOS>, REP,
@@ -502,83 +522,6 @@ def CMOV_RFP80 : I<0, Pseudo,
//===----------------------------------------------------------------------===//
-// Atomic Instruction Pseudo Instructions
-//===----------------------------------------------------------------------===//
-
-// Pseudo atomic instructions
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
- let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
- let Defs = [EFLAGS, AL] in
- def NAME#8 : I<0, Pseudo, (outs GR8:$dst),
- (ins i8mem:$ptr, GR8:$val),
- !strconcat(mnemonic, "8 PSEUDO!"), []>;
- let Defs = [EFLAGS, AX] in
- def NAME#16 : I<0, Pseudo,(outs GR16:$dst),
- (ins i16mem:$ptr, GR16:$val),
- !strconcat(mnemonic, "16 PSEUDO!"), []>;
- let Defs = [EFLAGS, EAX] in
- def NAME#32 : I<0, Pseudo, (outs GR32:$dst),
- (ins i32mem:$ptr, GR32:$val),
- !strconcat(mnemonic, "32 PSEUDO!"), []>;
- let Defs = [EFLAGS, RAX] in
- def NAME#64 : I<0, Pseudo, (outs GR64:$dst),
- (ins i64mem:$ptr, GR64:$val),
- !strconcat(mnemonic, "64 PSEUDO!"), []>;
- }
-}
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
- def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
- (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
- (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
- (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
- def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
- (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
-}
-
-// Atomic exchange, and, or, xor
-defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
-defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
-defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
-defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
-defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
-defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
-defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
-defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
-
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
-defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
-
-multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
- let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX],
- mayLoad = 1, mayStore = 1, hasSideEffects = 0 in
- def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- !strconcat(mnemonic, "6432 PSEUDO!"), []>;
-}
-
-defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
-defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
-defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
-defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
-defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
-defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
-defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
-defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
-defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
-defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
-defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
-
-//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1696,20 +1639,34 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
// Increment reg.
-def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+// Do not form INC if it is slow.
+def : Pat<(add GR8:$src, 1),
+ (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, 1),
+ (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, 1),
+ (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+ (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, 1),
+ (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, 1),
+ (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
// Decrement reg.
-def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>;
-def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[Not64BitMode]>;
-def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
-def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+// Do not form DEC if it is slow.
+def : Pat<(add GR8:$src, -1),
+ (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>;
+def : Pat<(add GR16:$src, -1),
+ (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR16:$src, -1),
+ (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+ (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>;
+def : Pat<(add GR32:$src, -1),
+ (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>;
+def : Pat<(add GR64:$src, -1),
+ (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>;
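+
+// When NotSlowIncDec does not hold, none of the INC/DEC patterns above fire
+// and these adds fall back to the plain ADD/SUB-immediate forms. The usual
+// motivation: INC and DEC leave CF unchanged, which can cause partial-flag
+// merge stalls on some microarchitectures, whereas ADD/SUB with an immediate
+// update all of the flags.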
// or reg/reg.
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1582f43..6f0fa94 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -224,6 +224,10 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 6993577..0d3afc4 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -97,14 +98,11 @@ struct X86OpTblEntry {
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
-X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
- : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::ADJCALLSTACKDOWN64
- : X86::ADJCALLSTACKDOWN32),
- (tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::ADJCALLSTACKUP64
- : X86::ADJCALLSTACKUP32)),
- TM(tm), RI(tm) {
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+ : X86GenInstrInfo(
+ (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
+ (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
+ Subtarget(STI), RI(STI) {
static const X86OpTblEntry OpTbl2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
@@ -1472,7 +1470,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
- if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (!Subtarget.is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
return false;
@@ -1950,7 +1948,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
unsigned Opc, leaInReg;
- if (TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ if (Subtarget.is64Bit()) {
Opc = X86::LEA64_32r;
leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
} else {
@@ -2006,7 +2004,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (Subtarget.is64Bit())
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
else
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
@@ -2076,13 +2074,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// we have better subtarget support, enable the 16-bit LEA generation here.
// 16-bit LEA is also slow on Core2.
bool DisableLEA16 = true;
- bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ bool is64Bit = Subtarget.is64Bit();
unsigned MIOpc = MI->getOpcode();
switch (MIOpc) {
case X86::SHUFPSrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
+ if (!Subtarget.hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
@@ -2094,7 +2092,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
}
case X86::SHUFPDrri: {
assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
- if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
+ if (!Subtarget.hasSSE2()) return nullptr;
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
@@ -2672,8 +2670,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
/// getSETFromCond - Return a set opcode for the given condition and
/// whether it has memory operand.
-static unsigned getSETFromCond(X86::CondCode CC,
- bool HasMemoryOperand) {
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
static const uint16_t Opc[16][2] = {
{ X86::SETAr, X86::SETAm },
{ X86::SETAEr, X86::SETAEm },
@@ -2693,14 +2690,14 @@ static unsigned getSETFromCond(X86::CondCode CC,
{ X86::SETSr, X86::SETSm }
};
- assert(CC < 16 && "Can only handle standard cond codes");
+ assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
return Opc[CC][HasMemoryOperand ? 1 : 0];
}
/// getCMovFromCond - Return a cmov opcode for the given condition,
/// register size in bytes, and operand type.
-static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
- bool HasMemoryOperand) {
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand) {
static const uint16_t Opc[32][3] = {
{ X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
{ X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
@@ -2976,7 +2973,7 @@ canInsertSelect(const MachineBasicBlock &MBB,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
- if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+ if (!Subtarget.hasCMov())
return false;
if (Cond.size() != 1)
return false;
@@ -3027,8 +3024,7 @@ static bool isHReg(unsigned Reg) {
// Try and copy between VR128/VR64 and GR64 registers.
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
- const X86Subtarget& Subtarget) {
-
+ const X86Subtarget &Subtarget) {
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
@@ -3107,8 +3103,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
// First deal with the normal symmetric copies.
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
@@ -3120,7 +3116,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if ((isHReg(DestReg) || isHReg(SrcReg)) &&
- TM.getSubtarget<X86Subtarget>().is64Bit()) {
+ Subtarget.is64Bit()) {
Opc = X86::MOV8rr_NOREX;
// Both operands must be encodable without an REX prefix.
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
@@ -3137,7 +3133,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
if (!Opc)
- Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -3183,9 +3179,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
- const TargetMachine &TM,
+ const X86Subtarget &STI,
bool load) {
- if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ if (STI.hasAVX512()) {
if (X86::VK8RegClass.hasSubClassEq(RC) ||
X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
@@ -3197,13 +3193,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = STI.hasAVX();
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
case 1:
assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (STI.is64Bit())
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
@@ -3270,16 +3266,16 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
static unsigned getStoreRegOpcode(unsigned SrcReg,
const TargetRegisterClass *RC,
bool isStackAligned,
- TargetMachine &TM) {
- return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false);
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
}
static unsigned getLoadRegOpcode(unsigned DestReg,
const TargetRegisterClass *RC,
bool isStackAligned,
- const TargetMachine &TM) {
- return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true);
+ const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
}
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -3291,9 +3287,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ bool isAligned =
+ (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
@@ -3309,7 +3306,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
@@ -3327,9 +3324,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ bool isAligned =
+ (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
}
@@ -3343,7 +3341,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
@@ -3741,7 +3739,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
continue;
// EFLAGS is used by this instruction.
- X86::CondCode OldCC;
+ X86::CondCode OldCC = X86::COND_INVALID;
bool OpcIsSET = false;
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
@@ -3964,7 +3962,7 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
}
bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
switch (MI->getOpcode()) {
case X86::MOV32r0:
@@ -4075,7 +4073,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
unsigned Size, unsigned Align) const {
const DenseMap<unsigned,
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
- bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
+ bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
// Atom favors register form of call. So, we do not fold loads into calls
@@ -4316,7 +4314,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ bool HasAVX = Subtarget.hasAVX();
unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
@@ -4352,7 +4350,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
- Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment());
+ Alignment = std::min(
+ Alignment, MF.getTarget().getFrameLowering()->getStackAlignment());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
@@ -4453,14 +4452,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Create a constant-pool entry and operands to load from it.
// Medium and large mode can't fold loads this way.
- if (TM.getCodeModel() != CodeModel::Small &&
- TM.getCodeModel() != CodeModel::Kernel)
+ if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+ MF.getTarget().getCodeModel() != CodeModel::Kernel)
return nullptr;
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
- if (TM.getRelocationModel() == Reloc::PIC_) {
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget.is64Bit())
PICBase = X86::RIP;
else
// FIXME: PICBase = getGlobalBaseReg(&MF);
@@ -4600,7 +4599,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
@@ -4748,13 +4747,13 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Do not introduce a slow unaligned load.
return false;
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
- Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
@@ -4791,15 +4790,15 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+ !Subtarget.isUnalignedMemAccessFast())
// Do not introduce a slow unaligned store.
return false;
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
- SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
- isAligned, TM),
- dl, MVT::Other, AddrOps);
+ SDNode *Store =
+ DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+ dl, MVT::Other, AddrOps);
NewNodes.push_back(Store);
// Preserve memory reference information.
@@ -4960,7 +4959,7 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
default:
// XMM registers. In 64-bit mode we can be a bit more aggressive since we
// have 16 of them to play with.
- if (TM.getSubtargetImpl()->is64Bit()) {
+ if (Subtarget.is64Bit()) {
if (NumLoads >= 3)
return false;
} else if (NumLoads) {
@@ -4986,7 +4985,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
- if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+ if (!Subtarget.hasAVX())
return false;
enum {
@@ -5038,6 +5037,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
case X86::TEST16rm:
case X86::TEST32rm:
case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
case X86::AND16ri8:
@@ -5168,7 +5168,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
- assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ assert(!Subtarget.is64Bit() &&
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
@@ -5271,7 +5271,7 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
- bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+ bool hasAVX2 = Subtarget.hasAVX2();
uint16_t validDomains = 0;
if (domain && lookup(MI->getOpcode(), domain))
validDomains = 0xe;
@@ -5286,7 +5286,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
assert(dom && "Not an SSE instruction");
const uint16_t *table = lookup(MI->getOpcode(), dom);
if (!table) { // try the other table
- assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) &&
+ assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
table = lookupAVX2(MI->getOpcode(), dom);
}
@@ -5299,6 +5299,16 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
+void X86InstrInfo::getUnconditionalBranch(
+ MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
+ Branch.setOpcode(X86::JMP_4);
+ Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+}
+
+void X86InstrInfo::getTrap(MCInst &MI) const {
+ MI.setOpcode(X86::TRAP);
+}
+
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
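
Note on the X86InstrInfo.cpp hunks above: the change is mechanical throughout. X86InstrInfo now caches an X86Subtarget reference, so every feature query that previously round-tripped through TM.getSubtarget<X86Subtarget>() becomes a direct member call; the optimizeCompareInstr hunk additionally initializes OldCC to X86::COND_INVALID so it can never be read uninitialized. A minimal sketch of the pattern (the helper name here is hypothetical, not from this patch):

    // The subtarget is captured once at construction instead of being
    // re-derived from the TargetMachine at every query site.
    class InstrInfoSketch {
      const X86Subtarget &Subtarget; // replaces X86TargetMachine &TM
    public:
      explicit InstrInfoSketch(const X86Subtarget &STI) : Subtarget(STI) {}
      // Was: TM.getSubtarget<X86Subtarget>().hasCMov()
      bool canUseCMov() const { return Subtarget.hasCMov(); }
    };
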
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 5f34915..c177e3a 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -24,7 +24,7 @@
namespace llvm {
class X86RegisterInfo;
- class X86TargetMachine;
+ class X86Subtarget;
namespace X86 {
// X86 specific condition code. These correspond to X86_*_COND in
@@ -46,6 +46,7 @@ namespace X86 {
COND_O = 13,
COND_P = 14,
COND_S = 15,
+ LAST_VALID_COND = COND_S,
// Artificial condition codes. These are used by AnalyzeBranch
// to indicate a block terminated with two conditional branches to
@@ -61,12 +62,21 @@ namespace X86 {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+ /// \brief Return a set opcode for the given condition and whether it has
+ /// a memory operand.
+ unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+ /// \brief Return a cmov opcode for the given condition, register size in
+ /// bytes, and operand type.
+ unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+ bool HasMemoryOperand = false);
+
// Turn CMov opcode into condition code.
CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
- CondCode GetOppositeBranchCondition(X86::CondCode CC);
+ CondCode GetOppositeBranchCondition(CondCode CC);
} // end namespace X86;
@@ -129,7 +139,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) {
}
class X86InstrInfo final : public X86GenInstrInfo {
- X86TargetMachine &TM;
+ X86Subtarget &Subtarget;
const X86RegisterInfo RI;
/// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
@@ -156,7 +166,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
virtual void anchor();
public:
- explicit X86InstrInfo(X86TargetMachine &tm);
+ explicit X86InstrInfo(X86Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@@ -396,6 +406,12 @@ public:
const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const;
+ void
+ getUnconditionalBranch(MCInst &Branch,
+ const MCSymbolRefExpr *BranchTarget) const override;
+
+ void getTrap(MCInst &MI) const override;
+
bool isHighLatencyDef(int opc) const override;
bool hasHighOperandLatency(const InstrItineraryData *ItinData,
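
Exporting getSETFromCond and getCMovFromCond from the X86 namespace (with HasMemoryOperand defaulted to false) lets other parts of the backend translate a condition code straight into an opcode. A hedged usage sketch; the resulting opcodes follow from the tables in the .cpp hunks above:

    // Hypothetical call sites, not taken from this patch.
    unsigned SetOpc = X86::getSETFromCond(X86::COND_E);    // X86::SETEr
    unsigned CMovOpc =
        X86::getCMovFromCond(X86::COND_B, /*RegBytes=*/4); // X86::CMOVB32rr
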
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0d97669..e7b532c 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -155,27 +155,6 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
-def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
- [SDNPHasChain, SDNPMayStore,
- SDNPMayLoad, SDNPMemOperand]>;
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -208,6 +187,8 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
[SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
@@ -795,6 +776,7 @@ def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1eb0485..f9a5ae1 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4337,20 +4337,6 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
SSE_INTALU_ITINS_P, 0>;
//===---------------------------------------------------------------------===//
-// SSE2 - Packed Integer Pack Instructions
-//===---------------------------------------------------------------------===//
-
-defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
- int_x86_avx2_packsswb,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
- int_x86_avx2_packssdw,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
- int_x86_avx2_packuswb,
- SSE_INTALU_ITINS_SHUFF_P, 0>;
-
-//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
@@ -4432,6 +4418,136 @@ let Predicates = [UseSSE2] in {
}
//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ bit Is2Addr = 1> {
+ def rr : PDI<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : PDI<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : PDI<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : PDI<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (memopv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag,
+ bit Is2Addr = 1> {
+ def rr : SS48I<opc, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def rm : SS48I<opc, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set VR128:$dst,
+ (OutVT (OpNode VR128:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+ ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> {
+ def Yrr : SS48I<opc, MRMSrcReg,
+ (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+ Sched<[WriteShuffle]>;
+ def Yrm : SS48I<opc, MRMSrcMem,
+ (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (OutVT (OpNode VR256:$src1,
+ (bc_frag (memopv4i64 addr:$src2)))))]>,
+ Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32, 0>, VEX_4V;
+
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
+ bc_v8i32>, VEX_4V, VEX_L;
+
+ defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
+ bc_v16i16>, VEX_4V, VEX_L;
+ defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
+ bc_v8i32>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+ bc_v8i16>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+ bc_v4i32>;
+
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+ bc_v8i16>;
+
+ let Predicates = [HasSSE41] in
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+ bc_v4i32>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
@@ -5239,6 +5355,60 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
f128mem, SSE_ALU_F64P>, PD;
}
+// Patterns used to select 'addsub' instructions.
+let Predicates = [HasAVX] in {
+ // Constant 170 corresponds to the binary mask '10101010'.
+ // When used as a blend mask, it allows selecting eight elements from two
+  //   input vectors as follows:
+ // - Even-numbered values in the destination are copied from
+ // the corresponding elements in the first input vector;
+ // - Odd-numbered values in the destination are copied from
+ // the corresponding elements in the second input vector.
+
+ def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)),
+ (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))),
+ (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+
+ // Constant 10 corresponds to the binary mask '1010'.
+  // In the two patterns below, constant 10 is used as a blend mask to select
+ // - the 1st and 3rd element from the first input vector (the 'fsub' node);
+ // - the 2nd and 4th element from the second input vector (the 'fadd' node).
+
+ def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+ (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+ (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+ (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+ // Constant 10 corresponds to the binary mask '1010'.
+ // In the pattern below, it is used as a blend mask to select:
+ // - the 1st and 3rd element from the first input vector (the fsub node);
+ // - the 2nd and 4th element from the second input vector (the fadd node).
+
+ def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
+ (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+ (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//
@@ -7053,8 +7223,6 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in {
let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
- 0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
@@ -7086,9 +7254,6 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw, WriteShuffle>,
- VEX_4V, VEX_L;
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V, VEX_L;
@@ -7120,8 +7285,6 @@ let Predicates = [HasAVX2] in {
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
- defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
- 1, DEFAULT_ITINS_SHUFFLESCHED>;
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
@@ -7969,6 +8132,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
+class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag ld_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[Sched]>, VEX {
+ let mayLoad = 1;
+}
+
// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
Intrinsic Int, SchedWrite Sched> :
@@ -7977,16 +8150,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
- def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
- int_x86_avx_vbroadcast_ss, WriteLoad>;
- def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256,
- WriteFShuffleLd>, VEX_L;
+ def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, loadf32, WriteLoad>;
+ def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, loadf32,
+ WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256,
- WriteFShuffleLd>, VEX_L;
+def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256,
WriteFShuffleLd>, VEX_L;
@@ -8366,6 +8538,21 @@ let Predicates = [HasF16C] in {
(VCVTPH2PSrm addr:$src)>;
}
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C] in {
+ def : Pat<(f32_to_f16 FR32:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+ (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
+
+ def : Pat<(f16_to_f32 GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+
+ def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+ (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
+}
+
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//
@@ -8543,13 +8730,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
}
let Predicates = [HasAVX] in {
-def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSrm addr:$src)>;
-
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {
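
The addsub patterns above hinge on the blend-immediate encoding: bit i of the mask selects element i from the second operand (the fadd) when set and from the first (the fsub) when clear, which is exactly the alternating subtract/add that ADDSUBPS/ADDSUBPD compute. A scalar model of that selection, for illustration only:

    // Scalar model of X86Blendi(sub, add, mask) as used by these patterns.
    // With mask = 170 (0b10101010) over 8 floats, even lanes take the fsub
    // result and odd lanes the fadd result -- i.e. vaddsubps semantics.
    void blendModel(const float *sub, const float *add, unsigned mask,
                    float *out, unsigned n) {
      for (unsigned i = 0; i != n; ++i)
        out[i] = ((mask >> i) & 1) ? add[i] : sub[i];
    }
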
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index b5595cb..5402780 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -439,7 +439,10 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
let SchedRW = [WriteSystem] in {
def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
-def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+ def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+ TB;
def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
"smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index e969ef2..a082c4f 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -432,7 +432,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
// SSE Callback should be called for SSE-enabled LLVM.
return X86CompilationCallback_SSE;
#else
- if (Subtarget->hasSSE1())
+ if (useSSE)
return X86CompilationCallback_SSE;
#endif
#endif
@@ -440,8 +440,8 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
return X86CompilationCallback;
}
-X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86JITInfo::X86JITInfo(bool UseSSE) {
+ useSSE = UseSSE;
useGOT = 0;
TLSOffset = nullptr;
}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index 4d279de..564343f 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -19,16 +19,14 @@
#include "llvm/Target/TargetJITInfo.h"
namespace llvm {
- class X86TargetMachine;
class X86Subtarget;
class X86JITInfo : public TargetJITInfo {
- X86TargetMachine &TM;
- const X86Subtarget *Subtarget;
uintptr_t PICBase;
- char* TLSOffset;
+ char *TLSOffset;
+ bool useSSE;
public:
- explicit X86JITInfo(X86TargetMachine &tm);
+ explicit X86JITInfo(bool UseSSE);
/// replaceMachineCodeForFunction - Make it so that calling the function
/// whose machine code is at OLD turns into a call to NEW, perhaps by
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 0190080..2bd70a9 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/ADT/SmallString.h"
@@ -779,6 +780,9 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI =
+ static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -883,6 +887,37 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(X86::R10)
.addReg(X86::RAX));
return;
+
+ case X86::SEH_PushReg:
+ OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ return;
+
+ case X86::SEH_SaveReg:
+ OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_SaveXMM:
+ OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_StackAlloc:
+ OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_SetFrame:
+ OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ return;
+
+ case X86::SEH_PushFrame:
+ OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ return;
+
+ case X86::SEH_EndPrologue:
+ OutStreamer.EmitWinCFIEndProlog();
+ return;
}
MCInst TmpInst;
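
Each new SEH_* pseudo simply forwards its immediates to the matching Win64 CFI hook on the streamer, so unwind directives land exactly where the prologue placed the pseudos. The resulting directive for each pseudo, as a reference sketch:

    SEH_PushReg     -> .seh_pushreg
    SEH_SaveReg     -> .seh_savereg
    SEH_SaveXMM     -> .seh_savexmm
    SEH_StackAlloc  -> .seh_stackalloc
    SEH_SetFrame    -> .seh_setframe
    SEH_PushFrame   -> .seh_pushframe
    SEH_EndPrologue -> .seh_endprologue
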
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index a83e1e4..e8a7e84 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -53,20 +53,18 @@ static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
-X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
- : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::RIP : X86::EIP),
- X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
- X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
- (tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::RIP : X86::EIP)),
- TM(tm) {
+X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI)
+ : X86GenRegisterInfo(
+ (STI.is64Bit() ? X86::RIP : X86::EIP),
+ X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false),
+ X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true),
+ (STI.is64Bit() ? X86::RIP : X86::EIP)),
+ Subtarget(STI) {
X86_MC::InitLLVM2SEHRegisterMapping(this);
// Cache some information.
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- Is64Bit = Subtarget->is64Bit();
- IsWin64 = Subtarget->isTargetWin64();
+ Is64Bit = Subtarget.is64Bit();
+ IsWin64 = Subtarget.isTargetWin64();
if (Is64Bit) {
SlotSize = 8;
@@ -83,21 +81,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
BasePtr = Is64Bit ? X86::RBX : X86::ESI;
}
-/// getCompactUnwindRegNum - This function maps the register to the number for
-/// compact unwind encoding. Return -1 if the register isn't valid.
-int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
- switch (getLLVMRegNum(RegNum, isEH)) {
- case X86::EBX: case X86::RBX: return 1;
- case X86::ECX: case X86::R12: return 2;
- case X86::EDX: case X86::R13: return 3;
- case X86::EDI: case X86::R14: return 4;
- case X86::ESI: case X86::R15: return 5;
- case X86::EBP: case X86::RBP: return 6;
- }
-
- return -1;
-}
-
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
// ExeDepsFixer and PostRAScheduler require liveness.
@@ -173,9 +156,8 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
}
const TargetRegisterClass *
-X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
- const {
- const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
switch (Kind) {
default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
case 0: // Normal GPRs.
@@ -225,7 +207,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case X86::GR64RegClassID:
return 12 - FPDiff;
case X86::VR128RegClassID:
- return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+ return Subtarget.is64Bit() ? 10 : 4;
case X86::VR64RegClassID:
return 4;
}
@@ -233,8 +215,8 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
assert(MF && "MachineFunction required");
switch (MF->getFunction()->getCallingConv()) {
@@ -287,8 +269,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
switch (CC) {
case CallingConv::GHC:
@@ -406,7 +388,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*AI);
}
}
- if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ if (!Is64Bit || !Subtarget.hasAVX512()) {
for (unsigned n = 16; n != 32; ++n) {
for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
Reserved.set(*AI);
@@ -459,7 +441,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
- unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
bool requiresRealignment =
((MFI->getMaxAlignment() > StackAlign) ||
F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 2289d91..74efd1f 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -22,11 +22,11 @@
namespace llvm {
class Type;
class TargetInstrInfo;
- class X86TargetMachine;
+ class X86Subtarget;
class X86RegisterInfo final : public X86GenRegisterInfo {
public:
- X86TargetMachine &TM;
+ const X86Subtarget &Subtarget;
private:
/// Is64Bit - Is the target 64-bits.
@@ -55,15 +55,11 @@ private:
unsigned BasePtr;
public:
- X86RegisterInfo(X86TargetMachine &tm);
+ X86RegisterInfo(const X86Subtarget &STI);
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
- /// getCompactUnwindRegNum - This function maps the register to the number for
- /// compact unwind encoding. Return -1 if the register isn't valid.
- int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const override;
-
/// Code Generation virtual methods...
///
bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 744890d..a83dd9b 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -11,21 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "X86TargetMachine.h"
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
-X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
- TargetSelectionDAGInfo(TM),
- Subtarget(&TM.getSubtarget<X86Subtarget>()),
- TLI(*TM.getTargetLowering()) {
-}
+X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
-X86SelectionDAGInfo::~X86SelectionDAGInfo() {
-}
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {}
SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
@@ -35,6 +37,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256)
@@ -43,16 +46,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
- if ((Align & 3) != 0 ||
- !ConstantSize ||
- ConstantSize->getZExtValue() >
- Subtarget->getMaxInlineSizeThreshold()) {
+ if ((Align & 3) != 0 || !ConstantSize ||
+ ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
if (const char *bzeroEntry = V &&
- V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) {
- EVT IntPtr = TLI.getPointerTy();
+ V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+ EVT IntPtr = DAG.getTargetLoweringInfo().getPointerTy();
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -65,10 +66,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args),
+ 0)
.setDiscardResult();
- std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+ std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI);
return CallResult.second;
}
@@ -99,7 +101,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
- if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
@@ -128,13 +130,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
InFlag = Chain.getValue(1);
}
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
- X86::ECX,
- Count, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
- X86::EDI,
- Dst, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+ Dst, InFlag);
InFlag = Chain.getValue(1);
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -182,10 +182,11 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
if (!ConstantSize)
return SDValue();
uint64_t SizeVal = ConstantSize->getZExtValue();
- if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+ if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
/// If not DWORD aligned, it is more efficient to call the library. However
@@ -218,7 +219,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
AVT = MVT::i32;
else
// QWORD aligned
- AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+ AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
unsigned UBytes = AVT.getSizeInBits() / 8;
unsigned CountVal = SizeVal / UBytes;
@@ -226,15 +227,15 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
unsigned BytesLeft = SizeVal % UBytes;
SDValue InFlag;
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX :
X86::ECX,
Count, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI :
X86::EDI,
Dst, InFlag);
InFlag = Chain.getValue(1);
- Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI :
X86::ESI,
Src, InFlag);
InFlag = Chain.getValue(1);
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index 0d5dc38..c12555a 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -23,14 +23,8 @@ class X86TargetMachine;
class X86Subtarget;
class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
- /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
- /// make the right decision when generating code for different targets.
- const X86Subtarget *Subtarget;
-
- const X86TargetLowering &TLI;
-
public:
- explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
+ explicit X86SelectionDAGInfo(const DataLayout &DL);
~X86SelectionDAGInfo();
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
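
With the cached Subtarget and TLI members gone, X86SelectionDAGInfo re-derives both from the SelectionDAG on each call, which stays correct even if the subtarget ever varies per function. The access pattern, pulled out for clarity (a sketch of what the hunks above now do inline):

    const X86Subtarget &Subtarget =
        DAG.getTarget().getSubtarget<X86Subtarget>();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT IntPtr = TLI.getPointerTy();
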
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 989e0d6..79b7e68 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -291,13 +291,60 @@ void X86Subtarget::initializeEnvironment() {
CallRegIndirect = false;
LEAUsesAG = false;
SlowLEA = false;
+ SlowIncDec = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
}
+static std::string computeDataLayout(const X86Subtarget &ST) {
+ // X86 is little endian
+ std::string Ret = "e";
+
+ Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+ // X86 and x32 have 32 bit pointers.
+ if (ST.isTarget64BitILP32() || !ST.is64Bit())
+ Ret += "-p:32:32";
+
+ // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+ if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl())
+ Ret += "-i64:64";
+ else
+ Ret += "-f64:32:64";
+
+ // Some ABIs align long double to 128 bits, others to 32.
+ if (ST.isTargetNaCl())
+ ; // No f80
+ else if (ST.is64Bit() || ST.isTargetDarwin())
+ Ret += "-f80:128";
+ else
+ Ret += "-f80:32";
+
+ // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+ if (ST.is64Bit())
+ Ret += "-n8:16:32:64";
+ else
+ Ret += "-n8:16:32";
+
+ // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+ if (!ST.is64Bit() && ST.isOSWindows())
+ Ret += "-S32";
+ else
+ Ret += "-S128";
+
+ return Ret;
+}
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, unsigned StackAlignOverride)
+ const std::string &FS, X86TargetMachine &TM,
+ unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
PICStyle(PICStyles::None), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
@@ -305,10 +352,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
In32BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
- TargetTriple.getEnvironment() == Triple::CODE16) {
- initializeEnvironment();
- resetSubtargetFeatures(CPU, FS);
-}
+ TargetTriple.getEnvironment() == Triple::CODE16),
+ DL(computeDataLayout(*this)), TSInfo(DL),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
+ FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(),
+ is64Bit() ? -8 : -4),
+ JITInfo(hasSSE1()) {}
bool
X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
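
computeDataLayout moves next to the DataLayout member it now initializes, and the separate CygMing/MSVC checks collapse into isOSWindows(). Following the branches above for a 64-bit ELF target (e.g. triple x86_64-unknown-linux-gnu, an example not taken from this patch) yields the familiar string:

    // "e"            little endian
    // "-m:e"         ELF name mangling (getManglingComponent)
    // "-i64:64"      i64 aligned to 64 bits
    // "-f80:128"     x87 long double on 128-bit boundaries
    // "-n8:16:32:64" native integer widths
    // "-S128"        128-bit stack alignment
    // => "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
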
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 703559a..09db0eb 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -14,6 +14,11 @@
#ifndef X86SUBTARGET_H
#define X86SUBTARGET_H
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86SelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -40,6 +45,7 @@ enum Style {
}
class X86Subtarget final : public X86GenSubtargetInfo {
+
protected:
enum X86SSEEnum {
NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -181,6 +187,9 @@ protected:
/// SlowLEA - True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
+ bool SlowIncDec;
+
/// Processor has AVX-512 PreFetch Instructions
bool HasPFI;
@@ -217,14 +226,31 @@ private:
/// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode;
+ // Calculates type size & alignment
+ const DataLayout DL;
+ X86SelectionDAGInfo TSInfo;
+  // Ordering here is important. X86InstrInfo initializes X86RegisterInfo,
+  // which X86TargetLowering needs.
+ X86InstrInfo InstrInfo;
+ X86TargetLowering TLInfo;
+ X86FrameLowering FrameLowering;
+ X86JITInfo JITInfo;
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS,
+ const std::string &FS, X86TargetMachine &TM,
unsigned StackAlignOverride);
+ const X86TargetLowering *getTargetLowering() const { return &TLInfo; }
+ const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const X86FrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ X86JITInfo *getJITInfo() { return &JITInfo; }
+
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
@@ -241,6 +267,9 @@ public:
/// \brief Reset the features for the X86 target.
void resetSubtargetFeatures(const MachineFunction *MF) override;
private:
+ /// \brief Initialize the full set of dependencies so we can use an initializer
+ /// list for X86Subtarget.
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void initializeEnvironment();
void resetSubtargetFeatures(StringRef CPU, StringRef FS);
public:
@@ -319,6 +348,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
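
The ordering comment in the X86Subtarget.h hunk above is load-bearing: C++ initializes non-static data members in declaration order, not in the order they appear in a constructor's initializer list, so InstrInfo (whose constructor builds the X86RegisterInfo that TLInfo later needs) must be declared before TLInfo. A minimal illustration of the language rule:

    struct Demo {
      int a;
      int b;
      // b is named first here, but a is still initialized first because
      // members are constructed in declaration order.
      Demo() : b(1), a(2) {}
    };
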
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 93760ef..f12140f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -29,61 +29,14 @@ extern "C" void LLVMInitializeX86Target() {
void X86TargetMachine::anchor() { }
-static std::string computeDataLayout(const X86Subtarget &ST) {
- // X86 is little endian
- std::string Ret = "e";
-
- Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
- // X86 and x32 have 32 bit pointers.
- if (ST.isTarget64BitILP32() || !ST.is64Bit())
- Ret += "-p:32:32";
-
- // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
- if (ST.is64Bit() || ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC() ||
- ST.isTargetNaCl())
- Ret += "-i64:64";
- else
- Ret += "-f64:32:64";
-
- // Some ABIs align long double to 128 bits, others to 32.
- if (ST.isTargetNaCl())
- ; // No f80
- else if (ST.is64Bit() || ST.isTargetDarwin())
- Ret += "-f80:128";
- else
- Ret += "-f80:32";
-
- // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
- if (ST.is64Bit())
- Ret += "-n8:16:32:64";
- else
- Ret += "-n8:16:32";
-
- // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
- if (!ST.is64Bit() && (ST.isTargetCygMing() || ST.isTargetKnownWindowsMSVC()))
- Ret += "-S32";
- else
- Ret += "-S128";
-
- return Ret;
-}
-
/// X86TargetMachine ctor - Create an X86 target.
///
-X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, Options.StackAlignmentOverride),
- FrameLowering(*this, Subtarget),
- InstrItins(Subtarget.getInstrItineraryData()),
- DL(computeDataLayout(*getSubtargetImpl())),
- InstrInfo(*this),
- TLInfo(*this),
- TSInfo(*this),
- JITInfo(*this) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -158,6 +111,7 @@ public:
return *getX86TargetMachine().getSubtargetImpl();
}
+ void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
bool addPreRegAlloc() override;
@@ -170,6 +124,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(this, PM);
}
+void X86PassConfig::addIRPasses() {
+ addPass(createX86AtomicExpandPass(&getX86TargetMachine()));
+
+ TargetPassConfig::addIRPasses();
+}
+
bool X86PassConfig::addInstSelector() {
// Install an instruction selector.
addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 57e6eda..41d5157 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -13,12 +13,7 @@
#ifndef X86TARGETMACHINE_H
#define X86TARGETMACHINE_H
-
-#include "X86FrameLowering.h"
-#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
-#include "X86JITInfo.h"
-#include "X86SelectionDAGInfo.h"
#include "X86Subtarget.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
@@ -30,13 +25,6 @@ class StringRef;
class X86TargetMachine final : public LLVMTargetMachine {
virtual void anchor();
X86Subtarget Subtarget;
- X86FrameLowering FrameLowering;
- InstrItineraryData InstrItins;
- const DataLayout DL; // Calculates type size & alignment
- X86InstrInfo InstrInfo;
- X86TargetLowering TLInfo;
- X86SelectionDAGInfo TSInfo;
- X86JITInfo JITInfo;
public:
X86TargetMachine(const Target &T, StringRef TT,
@@ -44,28 +32,28 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- const DataLayout *getDataLayout() const override { return &DL; }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
const X86InstrInfo *getInstrInfo() const override {
- return &InstrInfo;
+ return getSubtargetImpl()->getInstrInfo();
}
const TargetFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
- X86JITInfo *getJITInfo() override {
- return &JITInfo;
+ return getSubtargetImpl()->getFrameLowering();
}
+ X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
const X86TargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
const InstrItineraryData *getInstrItineraryData() const override {
- return &InstrItins;
+ return &getSubtargetImpl()->getInstrItineraryData();
}
/// \brief Register X86 analysis passes with a pass manager.
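The accessors above show the central pattern of this change: the TargetMachine no longer owns the per-target objects and instead forwards to the subtarget, which constructs them. A minimal sketch of the pattern, using hypothetical Foo* names rather than any real target:

    // Sketch only; constructors and the remaining accessors are elided.
    class FooSubtarget : public TargetSubtargetInfo {
      const DataLayout DL;    // now owned by the subtarget
      FooInstrInfo InstrInfo; // likewise
    public:
      const DataLayout *getDataLayout() const { return &DL; }
      const FooInstrInfo *getInstrInfo() const { return &InstrInfo; }
    };

    class FooTargetMachine : public LLVMTargetMachine {
      FooSubtarget Subtarget; // the only state left in the TargetMachine
    public:
      const FooSubtarget *getSubtargetImpl() const override {
        return &Subtarget;
      }
      // Every other accessor simply forwards to the subtarget.
      const DataLayout *getDataLayout() const override {
        return getSubtargetImpl()->getDataLayout();
      }
      const FooInstrInfo *getInstrInfo() const override {
        return getSubtargetImpl()->getInstrInfo();
      }
    };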
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 91b9d40..c961e2f 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -102,6 +102,8 @@ public:
unsigned getReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm) const override;
+ unsigned getIntImmCost(int64_t) const;
+
unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -142,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
- if (ST->is64Bit())
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
return 16;
+ }
return 8;
}
unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
if (Vector) {
+ if (ST->hasAVX512()) return 512;
if (ST->hasAVX()) return 256;
if (ST->hasSSE1()) return 128;
return 0;
@@ -400,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only estimate the cost of reverse shuffles.
- if (Kind != SK_Reverse)
+ // We only estimate the cost of reverse and alternate shuffles.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
- unsigned Cost = 1;
- if (LT.second.getSizeInBits() > 128)
- Cost = 3; // Extract + insert + copy.
+ if (Kind == SK_Reverse) {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+ // Multiply by the number of parts.
+ return Cost * LT.first;
+ }
+
+ if (Kind == SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ // The backend knows how to generate a single VEX.256 version of the
+ // VPBLENDW instruction if the target supports AVX2.
+ if (ST->hasAVX2() && LT.second == MVT::v16i16)
+ return LT.first;
+
+ static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+
+ // This shuffle is custom lowered into a sequence of:
+ // 2x vextractf128, 2x vpblendw, 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+ // This shuffle is custom lowered into a long sequence of:
+ // 2x vextractf128, 4x vpshufb, 2x vpor, 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+ };
+
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * AVXAltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+ // These are lowered into movsd.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+ // packed float vectors with four elements are lowered into BLENDI dag
+ // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+ // This shuffle generates a single pshufw.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+ // There is no instruction that matches a v16i8 alternate shuffle.
+ // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+ };
+
+ if (ST->hasSSE41()) {
+ int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+ // the sequence 'shufps + pshufd'
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
- // Multiple by the number of parts.
- return Cost * LT.first;
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
+ };
+
+ if (ST->hasSSSE3()) {
+ int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+ // This is expanded into a long sequence of four extract + four insert.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 x pinsrw.
+
+ // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+ };
+
+ // Fall-back (SSE3 and SSE2).
+ int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSEAltShuffleTbl[Idx].Cost;
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
+
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
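As a worked example of the new tables: an alternate shuffle of v32i8 on an AVX (but not AVX2) target legalizes to a single 256-bit part, so the AVX table yields 1 * 9 = 9; on a plain SSE2 target the same type splits into two v16i8 parts and the fallback table gives 2 * 48 = 96.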
unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
@@ -808,6 +914,19 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+ if (Val == 0)
+ return TCC_Free;
+
+ if (isInt<32>(Val))
+ return TCC_Basic;
+
+ return 2 * TCC_Basic;
+}
+
unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
assert(Ty->isIntegerTy());
@@ -825,11 +944,21 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
if (Imm == 0)
return TCC_Free;
- if (Imm.getBitWidth() <= 64 &&
- (isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
- return TCC_Basic;
- else
- return 2 * TCC_Basic;
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
+ unsigned Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialize the constant.
+ return std::max(1U, Cost);
}
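A worked example of the chunking (with TCC_Free = 0 and TCC_Basic = 1): an i128 constant equal to 1 << 100 splits into a zero low chunk (free) and a high chunk of 1 << 36, which does not fit in 32 bits (cost 2), for a total of 2. For Imm == 0 both chunks are free, and the final std::max still reports the one instruction needed to materialize the zero.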
unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
@@ -889,9 +1018,13 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
break;
}
- if ((Idx == ImmIdx) &&
- Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
- return TCC_Free;
+ if (Idx == ImmIdx) {
+ unsigned NumConstants = (BitSize + 63) / 64;
+ unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TCC_Basic)
+ ? static_cast<unsigned>(TCC_Free)
+ : Cost;
+ }
return X86TTI::getIntImmCost(Imm, Ty);
}
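For instance, an i64 immediate of 42 at an eligible operand index gives NumConstants = 1 and a cost of TCC_Basic, so it is reported as TCC_Free — presumably foldable into the using instruction — while an immediate that needs a separate 64-bit move (e.g. a movabsq on x86-64) keeps its full cost of 2 * TCC_Basic.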
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 5499aba..e694736 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -228,7 +228,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
const XCoreInstrInfo &TII =
*static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
- DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc dl;
if (MFI->getMaxAlignment() > getStackAlignment())
report_fatal_error("emitPrologue unsupported alignment: "
@@ -416,7 +418,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
DebugLoc DL;
- if (MI != MBB.end())
+ if (MI != MBB.end() && !MI->isDebugValue())
DL = MI->getDebugLoc();
for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 9d78586..be7ef64 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -68,10 +68,9 @@ getTargetNodeName(unsigned Opcode) const
}
}
-XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
- : TargetLowering(XTM, new XCoreTargetObjectFile()),
- TM(XTM),
- Subtarget(*XTM.getSubtargetImpl()) {
+XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM)
+ : TargetLowering(TM, new XCoreTargetObjectFile()), TM(TM),
+ Subtarget(TM.getSubtarget<XCoreSubtarget>()) {
// Set up the register classes.
addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
@@ -92,15 +91,12 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
// XCore does not have the NodeTypes below.
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::ADDC, MVT::i32, Expand);
setOperationAction(ISD::ADDE, MVT::i32, Expand);
setOperationAction(ISD::SUBC, MVT::i32, Expand);
setOperationAction(ISD::SUBE, MVT::i32, Expand);
- // Stop the combiner recombining select and set_cc
- setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
-
// 64bit
setOperationAction(ISD::ADD, MVT::i64, Custom);
setOperationAction(ISD::SUB, MVT::i64, Custom);
@@ -217,7 +213,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::SMUL_LOHI: return LowerSMUL_LOHI(Op, DAG);
@@ -258,33 +253,21 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
// Misc Lower Operation implementation
//===----------------------------------------------------------------------===//
-SDValue XCoreTargetLowering::
-LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
-{
- SDLoc dl(Op);
- SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
- Op.getOperand(3), Op.getOperand(4));
- return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
- Op.getOperand(1));
-}
-
SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
const GlobalValue *GV,
SelectionDAG &DAG) const {
// FIXME there is no actual debug info here
SDLoc dl(GA);
- const GlobalValue *UnderlyingGV = GV;
- // If GV is an alias then use the aliasee to determine the wrapper type
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- UnderlyingGV = GA->getAliasee();
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(UnderlyingGV)) {
- if ((GVar->isConstant() && GV->hasLocalLinkage()) ||
- (GVar->hasSection() &&
- StringRef(GVar->getSection()).startswith(".cp.")))
- return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
- return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
- }
- return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+
+ if (GV->getType()->getElementType()->isFunctionTy())
+ return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+
+ const auto *GVar = dyn_cast<GlobalVariable>(GV);
+ if ((GV->hasSection() && StringRef(GV->getSection()).startswith(".cp.")) ||
+ (GVar && GVar->isConstant() && GV->hasLocalLinkage()))
+ return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+
+ return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
}
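Concretely, under the rewritten logic a function address always receives the PC-relative wrapper; a global placed in a ".cp." section, or a constant global variable with local linkage, receives the CP-relative wrapper; and every other global variable receives the DP-relative wrapper.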
static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) {
@@ -508,7 +491,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
CLI.setDebugLoc(DL).setChain(Chain)
.setCallee(CallingConv::C, IntPtrTy,
DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
- &Args, 0);
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ops[] = { CallResult.first, CallResult.second };
@@ -568,7 +551,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
CLI.setDebugLoc(dl).setChain(Chain)
.setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
- &Args, 0);
+ std::move(Args), 0);
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index d28715b..62b89c3 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -94,7 +94,7 @@ namespace llvm {
{
public:
- explicit XCoreTargetLowering(XCoreTargetMachine &TM);
+ explicit XCoreTargetLowering(const TargetMachine &TM);
using TargetLowering::isZExtFree;
bool isZExtFree(SDValue Val, EVT VT2) const override;
@@ -123,7 +123,7 @@ namespace llvm {
bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
private:
- const XCoreTargetMachine &TM;
+ const TargetMachine &TM;
const XCoreSubtarget &Subtarget;
// Lower Operand helpers
@@ -157,7 +157,6 @@ namespace llvm {
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index 984f0cd..36ea9a0 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -373,7 +373,8 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
+ if (I != MBB.end() && !I->isDebugValue())
+ DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO =
@@ -395,7 +396,8 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const
{
DebugLoc DL;
- if (I != MBB.end()) DL = I->getDebugLoc();
+ if (I != MBB.end() && !I->isDebugValue())
+ DL = I->getDebugLoc();
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = *MF->getFrameInfo();
MachineMemOperand *MMO =
@@ -440,7 +442,8 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
MachineBasicBlock::iterator MI,
unsigned Reg, uint64_t Value) const {
DebugLoc dl;
- if (MI != MBB.end()) dl = MI->getDebugLoc();
+ if (MI != MBB.end() && !MI->isDebugValue())
+ dl = MI->getDebugLoc();
if (isImmMskBitp(Value)) {
int N = Log2_32(Value) + 1;
return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N);
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 5a6bbe7..91b33fd 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -16,9 +16,8 @@ using namespace llvm;
#define DEBUG_TYPE "xcore-selectiondag-info"
-XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
- : TargetSelectionDAGInfo(TM) {
-}
+XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const DataLayout &DL)
+ : TargetSelectionDAGInfo(&DL) {}
XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
}
@@ -47,7 +46,7 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
.setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
- &Args, 0)
+ std::move(Args), 0)
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index ea6af98..0079de1 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -22,7 +22,7 @@ class XCoreTargetMachine;
class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
public:
- explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
+ explicit XCoreSelectionDAGInfo(const DataLayout &DL);
~XCoreSelectionDAGInfo();
SDValue
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index 89ea03a..7227411 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -25,8 +25,8 @@ using namespace llvm;
void XCoreSubtarget::anchor() { }
-XCoreSubtarget::XCoreSubtarget(const std::string &TT,
- const std::string &CPU, const std::string &FS)
- : XCoreGenSubtargetInfo(TT, CPU, FS)
-{
-}
+XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM)
+ : XCoreGenSubtargetInfo(TT, CPU, FS),
+ DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
+ InstrInfo(), FrameLowering(*this), TLInfo(TM), TSInfo(DL) {}
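One subtlety in the initializer list above: TSInfo(DL) is only safe because non-static data members are constructed in declaration order, and DL is declared before TSInfo in XCoreSubtarget (see the header change below). A reduced sketch of the rule:

    struct S {
      int A; // declared, and therefore constructed, first
      int B;
      S() : A(1), B(A + 1) {} // OK: A is initialized before B reads it
    };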
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index 5ac4dbc..1e9810b 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -14,6 +14,11 @@
#ifndef XCORESUBTARGET_H
#define XCORESUBTARGET_H
+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -26,17 +31,31 @@ class StringRef;
class XCoreSubtarget : public XCoreGenSubtargetInfo {
virtual void anchor();
+ const DataLayout DL; // Calculates type size & alignment
+ XCoreInstrInfo InstrInfo;
+ XCoreFrameLowering FrameLowering;
+ XCoreTargetLowering TLInfo;
+ XCoreSelectionDAGInfo TSInfo;
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
XCoreSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS);
+ const std::string &FS, const TargetMachine &TM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const XCoreFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const XCoreTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const XCoreSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+ const TargetRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const DataLayout *getDataLayout() const { return &DL; }
};
} // End llvm namespace
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 0fb21c5..8d8bb38 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -25,13 +25,8 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS),
- DL("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"),
- InstrInfo(),
- FrameLowering(Subtarget),
- TLInfo(*this),
- TSInfo(*this) {
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index a57ca55..14c43bf 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -14,46 +14,38 @@
#ifndef XCORETARGETMACHINE_H
#define XCORETARGETMACHINE_H
-#include "XCoreFrameLowering.h"
-#include "XCoreISelLowering.h"
-#include "XCoreInstrInfo.h"
-#include "XCoreSelectionDAGInfo.h"
#include "XCoreSubtarget.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
class XCoreTargetMachine : public LLVMTargetMachine {
XCoreSubtarget Subtarget;
- const DataLayout DL; // Calculates type size & alignment
- XCoreInstrInfo InstrInfo;
- XCoreFrameLowering FrameLowering;
- XCoreTargetLowering TLInfo;
- XCoreSelectionDAGInfo TSInfo;
public:
XCoreTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const XCoreInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
const XCoreFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return getSubtargetImpl()->getFrameLowering();
}
const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
const XCoreTargetLowering *getTargetLowering() const override {
- return &TLInfo;
+ return getSubtargetImpl()->getTargetLowering();
}
-
const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override {
- return &TSInfo;
+ return getSubtargetImpl()->getSelectionDAGInfo();
}
-
const TargetRegisterInfo *getRegisterInfo() const override {
- return &InstrInfo.getRegisterInfo();
+ return getSubtargetImpl()->getRegisterInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
}
- const DataLayout *getDataLayout() const override { return &DL; }
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 377fa15..f9de54a 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -39,6 +39,8 @@
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
@@ -67,21 +69,24 @@ namespace {
bool runOnSCC(CallGraphSCC &SCC) override;
static char ID; // Pass identification, replacement for typeid
explicit ArgPromotion(unsigned maxElements = 3)
- : CallGraphSCCPass(ID), maxElements(maxElements) {
+ : CallGraphSCCPass(ID), DL(nullptr), maxElements(maxElements) {
initializeArgPromotionPass(*PassRegistry::getPassRegistry());
}
/// A vector used to hold the indices of a single GEP instruction
typedef std::vector<uint64_t> IndicesVector;
+ const DataLayout *DL;
private:
CallGraphNode *PromoteArguments(CallGraphNode *CGN);
bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
CallGraphNode *DoPromotion(Function *F,
SmallPtrSet<Argument*, 8> &ArgsToPromote,
SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+ bool doInitialization(CallGraph &CG) override;
/// The maximum number of elements to expand, or 0 for unlimited.
unsigned maxElements;
+ DenseMap<const Function *, DISubprogram> FunctionDIs;
};
}
@@ -100,6 +105,9 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
bool Changed = false, LocalChange;
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
+
do { // Iterate until we stop promoting from this SCC.
LocalChange = false;
// Attempt to promote arguments from all functions in this SCC.
@@ -215,7 +223,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
/// AllCallersPassInValidPointerForArgument - Return true if we can prove that
/// all callers pass in a valid pointer for the specified function argument.
-static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
+static bool AllCallersPassInValidPointerForArgument(Argument *Arg,
+ const DataLayout *DL) {
Function *Callee = Arg->getParent();
unsigned ArgNo = Arg->getArgNo();
@@ -226,7 +235,7 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
CallSite CS(U);
assert(CS && "Should only have direct calls!");
- if (!CS.getArgument(ArgNo)->isDereferenceablePointer())
+ if (!CS.getArgument(ArgNo)->isDereferenceablePointer(DL))
return false;
}
return true;
@@ -334,7 +343,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
GEPIndicesSet ToPromote;
// If the pointer is always valid, any load with first index 0 is valid.
- if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg))
+ if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg, DL))
SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
// First, iterate the entry block and mark loads of (geps of) arguments as
@@ -604,6 +613,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
NF->copyAttributesFrom(F);
+ // Patch the pointer to LLVM function in debug info descriptor.
+ auto DI = FunctionDIs.find(F);
+ if (DI != FunctionDIs.end())
+ DI->second.replaceFunction(NF);
DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
<< "From: " << *F);
@@ -741,6 +754,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
if (cast<CallInst>(Call)->isTailCall())
cast<CallInst>(New)->setTailCall();
}
+ New->setDebugLoc(Call->getDebugLoc());
Args.clear();
AttributesVec.clear();
@@ -902,3 +916,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
return NF_CGN;
}
+
+bool ArgPromotion::doInitialization(CallGraph &CG) {
+ FunctionDIs = makeSubprogramMap(CG.getModule());
+ return CallGraphSCCPass::doInitialization(CG);
+}
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 284b896..ac3853d 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -127,8 +127,7 @@ namespace {
// As the code generation for module is finished (and DIBuilder is
// finalized) we assume that subprogram descriptors won't be changed, and
// they are stored in map for short duration anyway.
- typedef DenseMap<Function*, DISubprogram> FunctionDIMap;
- FunctionDIMap FunctionDIs;
+ DenseMap<const Function *, DISubprogram> FunctionDIs;
protected:
// DAH uses this to specify a different ID.
@@ -150,7 +149,6 @@ namespace {
unsigned RetValNum = 0);
Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
- void CollectFunctionDIs(Module &M);
void SurveyFunction(const Function &F);
void MarkValue(const RetOrArg &RA, Liveness L,
const UseVector &MaybeLiveUses);
@@ -190,35 +188,6 @@ INITIALIZE_PASS(DAH, "deadarghaX0r",
ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
-/// CollectFunctionDIs - Map each function in the module to its debug info
-/// descriptor.
-void DAE::CollectFunctionDIs(Module &M) {
- FunctionDIs.clear();
-
- for (Module::named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end(); I != E; ++I) {
- NamedMDNode &NMD = *I;
- for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands();
- MDIndex < MDNum; ++MDIndex) {
- MDNode *Node = NMD.getOperand(MDIndex);
- if (!DIDescriptor(Node).isCompileUnit())
- continue;
- DICompileUnit CU(Node);
- const DIArray &SPs = CU.getSubprograms();
- for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
- SPIndex < SPNum; ++SPIndex) {
- DISubprogram SP(SPs.getElement(SPIndex));
- assert((!SP || SP.isSubprogram()) &&
- "A MDNode in subprograms of a CU should be null or a DISubprogram.");
- if (!SP)
- continue;
- if (Function *F = SP.getFunction())
- FunctionDIs[F] = SP;
- }
- }
- }
-}
-
/// DeleteDeadVarargs - If this is an function that takes a ... list, and if
/// llvm.vastart is never called, the varargs list is dead for the function.
bool DAE::DeleteDeadVarargs(Function &Fn) {
@@ -327,7 +296,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
}
// Patch the pointer to LLVM function in debug info descriptor.
- FunctionDIMap::iterator DI = FunctionDIs.find(&Fn);
+ auto DI = FunctionDIs.find(&Fn);
if (DI != FunctionDIs.end())
DI->second.replaceFunction(NF);
@@ -1087,7 +1056,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
}
// Patch the pointer to LLVM function in debug info descriptor.
- FunctionDIMap::iterator DI = FunctionDIs.find(F);
+ auto DI = FunctionDIs.find(F);
if (DI != FunctionDIs.end())
DI->second.replaceFunction(NF);
@@ -1101,7 +1070,7 @@ bool DAE::runOnModule(Module &M) {
bool Changed = false;
// Collect debug info descriptors for functions.
- CollectFunctionDIs(M);
+ FunctionDIs = makeSubprogramMap(M);
// First pass: Do a simple check to see if any functions can have their "..."
// removed. We can do this if they never call va_start. This loop cannot be
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index fed8839..8174df9 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -449,14 +449,29 @@ determinePointerReadAttrs(Argument *A,
case Instruction::Call:
case Instruction::Invoke: {
+ bool Captures = true;
+
+ if (I->getType()->isVoidTy())
+ Captures = false;
+
+ auto AddUsersToWorklistIfCapturing = [&] {
+ if (Captures)
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU))
+ Worklist.push_back(&UU);
+ };
+
CallSite CS(I);
- if (CS.doesNotAccessMemory())
+ if (CS.doesNotAccessMemory()) {
+ AddUsersToWorklistIfCapturing();
continue;
+ }
Function *F = CS.getCalledFunction();
if (!F) {
if (CS.onlyReadsMemory()) {
IsRead = true;
+ AddUsersToWorklistIfCapturing();
continue;
}
return Attribute::None;
@@ -471,6 +486,7 @@ determinePointerReadAttrs(Argument *A,
"More params than args in non-varargs call.");
return Attribute::None;
}
+ Captures &= !CS.doesNotCapture(A - B);
if (SCCNodes.count(AI))
continue;
if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B))
@@ -479,6 +495,7 @@ determinePointerReadAttrs(Argument *A,
IsRead = true;
}
}
+ AddUsersToWorklistIfCapturing();
break;
}
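The Captures flag matters because a call that only reads memory may still return its pointer argument, and reads through that returned value are then further reads of the argument under analysis. A hypothetical C++ analogue (the helper below is assumed, not taken from LLVM):

    // Assumed to be readonly; may return a pointer into S.
    extern const char *firstNonSpace(const char *S);

    int useArg(const char *Buf) {
      const char *P = firstNonSpace(Buf); // the call only reads memory...
      return *P; // ...but P may alias Buf, so this is another read of Buf.
    }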
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 9decddc..7e7a4c0 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -62,7 +62,7 @@ static bool isEmptyFunction(Function *F) {
if (Entry.size() != 1 || !isa<ReturnInst>(Entry.front()))
return false;
ReturnInst &RI = cast<ReturnInst>(Entry.front());
- return RI.getReturnValue() == NULL;
+ return RI.getReturnValue() == nullptr;
}
char GlobalDCE::ID = 0;
@@ -77,13 +77,19 @@ bool GlobalDCE::runOnModule(Module &M) {
// Remove empty functions from the global ctors list.
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+ typedef std::multimap<const Comdat *, GlobalValue *> ComdatGVPairsTy;
+ ComdatGVPairsTy ComdatGVPairs;
+
// Loop over the module, adding globals which are obviously necessary.
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
Changed |= RemoveUnusedGlobalValue(*I);
// Functions with external linkage are needed if they have a body
- if (!I->isDiscardableIfUnused() &&
- !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
- GlobalIsNeeded(I);
+ if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+ if (!I->isDiscardableIfUnused())
+ GlobalIsNeeded(I);
+ else if (const Comdat *C = I->getComdat())
+ ComdatGVPairs.insert(std::make_pair(C, I));
+ }
}
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
@@ -91,17 +97,38 @@ bool GlobalDCE::runOnModule(Module &M) {
Changed |= RemoveUnusedGlobalValue(*I);
// Externally visible & appending globals are needed, if they have an
// initializer.
- if (!I->isDiscardableIfUnused() &&
- !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
- GlobalIsNeeded(I);
+ if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+ if (!I->isDiscardableIfUnused())
+ GlobalIsNeeded(I);
+ else if (const Comdat *C = I->getComdat())
+ ComdatGVPairs.insert(std::make_pair(C, I));
+ }
}
for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
Changed |= RemoveUnusedGlobalValue(*I);
// Externally visible aliases are needed.
- if (!I->isDiscardableIfUnused())
+ if (!I->isDiscardableIfUnused()) {
GlobalIsNeeded(I);
+ } else if (const Comdat *C = I->getComdat()) {
+ ComdatGVPairs.insert(std::make_pair(C, I));
+ }
+ }
+
+ for (ComdatGVPairsTy::iterator I = ComdatGVPairs.begin(),
+ E = ComdatGVPairs.end();
+ I != E;) {
+ ComdatGVPairsTy::iterator UB = ComdatGVPairs.upper_bound(I->first);
+ bool CanDiscard = std::all_of(I, UB, [](ComdatGVPairsTy::value_type Pair) {
+ return Pair.second->isDiscardableIfUnused();
+ });
+ if (!CanDiscard) {
+ std::for_each(I, UB, [this](ComdatGVPairsTy::value_type Pair) {
+ GlobalIsNeeded(Pair.second);
+ });
+ }
+ I = UB;
}
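The loop above walks the multimap one key group at a time via upper_bound: a comdat is discardable only if every member is, and otherwise every member is marked as needed. The grouping idiom in isolation, as a small sketch:

    #include <map>
    #include <string>

    void visitGroups(const std::multimap<int, std::string> &M) {
      for (auto I = M.begin(), E = M.end(); I != E;) {
        auto UB = M.upper_bound(I->first);
        // [I, UB) spans every entry that shares the key I->first.
        I = UB;
      }
    }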
// Now that all globals which are needed are in the AliveGlobals set, we loop
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index ae80c43..c1d0d3b 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
@@ -1699,9 +1700,6 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
/// possible. If we make a change, return true.
bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
Module::global_iterator &GVI) {
- if (!GV->isDiscardableIfUnused())
- return false;
-
// Do more involved optimizations if the global is internal.
GV->removeDeadConstantUsers();
@@ -1910,7 +1908,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
Function *F = FI++;
// Functions without names cannot be referenced outside this module.
- if (!F->hasName() && !F->isDeclaration())
+ if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
F->setLinkage(GlobalValue::InternalLinkage);
F->removeDeadConstantUsers();
if (F->isDefTriviallyDead()) {
@@ -1944,11 +1942,18 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
bool GlobalOpt::OptimizeGlobalVars(Module &M) {
bool Changed = false;
+
+ SmallSet<const Comdat *, 8> NotDiscardableComdats;
+ for (const GlobalVariable &GV : M.globals())
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.isDiscardableIfUnused())
+ NotDiscardableComdats.insert(C);
+
for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
GVI != E; ) {
GlobalVariable *GV = GVI++;
// Global variables without names cannot be referenced outside this module.
- if (!GV->hasName() && !GV->isDeclaration())
+ if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
GV->setLinkage(GlobalValue::InternalLinkage);
// Simplify the initializer.
if (GV->hasInitializer())
@@ -1958,7 +1963,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
GV->setInitializer(New);
}
- Changed |= ProcessGlobal(GV, GVI);
+ if (GV->isDiscardableIfUnused()) {
+ if (const Comdat *C = GV->getComdat())
+ if (NotDiscardableComdats.count(C))
+ continue;
+ Changed |= ProcessGlobal(GV, GVI);
+ }
}
return Changed;
}
@@ -1980,10 +1990,13 @@ isSimpleEnoughValueToCommit(Constant *C,
static bool isSimpleEnoughValueToCommitHelper(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
const DataLayout *DL) {
- // Simple integer, undef, constant aggregate zero, global addresses, etc are
- // all supported.
- if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
- isa<GlobalValue>(C))
+ // Simple global addresses are supported, do not allow dllimport or
+ // thread-local globals.
+ if (auto *GV = dyn_cast<GlobalValue>(C))
+ return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
+
+ // Simple integer, undef, constant aggregate zero, etc are all supported.
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
return true;
// Aggregate values are safe if all their elements are.
@@ -2054,8 +2067,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
return false;
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
+ // Do not allow weak/*_odr/linkonce linkage or external globals.
return GV->hasUniqueInitializer();
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
@@ -2846,14 +2858,19 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
I != E;) {
Module::alias_iterator J = I++;
// Aliases without names cannot be referenced outside this module.
- if (!J->hasName() && !J->isDeclaration())
+ if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
J->setLinkage(GlobalValue::InternalLinkage);
// If the aliasee may change at link time, nothing can be done - bail out.
if (J->mayBeOverridden())
continue;
Constant *Aliasee = J->getAliasee();
- GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+ GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
+ // We can't trivially replace the alias with the aliasee if the aliasee is
+ // non-trivial in some way.
+ // TODO: Try to handle non-zero GEPs of local aliasees.
+ if (!Target)
+ continue;
Target->removeDeadConstantUsers();
// Make all users of the alias use the aliasee instead.
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index c3a2b12..559ef0b 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -9,13 +9,24 @@
//
// This pass looks for equivalent functions that are mergable and folds them.
//
-// A hash is computed from the function, based on its type and number of
-// basic blocks.
+// An order relation is defined on the set of functions. It is established
+// by a function comparison procedure that returns
+// 0 when the functions are equal,
+// -1 when the left function is less than the right function, and
+// 1 in the opposite case. We need a total ordering, so the relation must
+// satisfy four properties on the set of functions:
+// a <= a (reflexivity)
+// if a <= b and b <= a then a = b (antisymmetry)
+// if a <= b and b <= c then a <= c (transitivity)
+// for all a and b: a <= b or b <= a (totality)
//
-// Once all hashes are computed, we perform an expensive equality comparison
-// on each function pair. This takes n^2/2 comparisons per bucket, so it's
-// important that the hash function be high quality. The equality comparison
-// iterates through each instruction in each basic block.
+// The comparison iterates through each instruction in each basic block.
+// Functions are kept in a binary tree; for each new function F we perform
+// a lookup in that tree.
+// In practice it works the following way:
+// -- We define a Function* container class with a custom "operator<"
+//    (FunctionPtr).
+// -- "FunctionPtr" instances are stored in a std::set, so every
+//    std::set::insert operation gives a result in log(N) time.
//
// When a match is found the functions are folded. If both functions are
// overridable, we move the functionality into a new internal function and
@@ -31,9 +42,6 @@
// the object they belong to. However, as long as it's only used for a lookup
// and call, this is irrelevant, and we'd like to fold such functions.
//
-// * switch from n^2 pair-wise comparisons to an n-way comparison for each
-// bucket.
-//
// * be smarter about bitcasts.
//
// In order to fold functions, we will sometimes add either bitcast instructions
@@ -41,6 +49,36 @@
// analysis since the two functions differ where one has a bitcast and the
// other doesn't. We should learn to look through bitcasts.
//
+// * Compare complex types with pointer types inside.
+// * Compare cross-reference cases.
+// * Compare complex expressions.
+//
+// All three issues above can be described as the ability to prove that
+// fA == fB == fE == fF == fG in the example below:
+//
+// void fA() {
+// fB();
+// }
+// void fB() {
+// fA();
+// }
+//
+// void fE() {
+// fF();
+// }
+// void fF() {
+// fG();
+// }
+// void fG() {
+// fE();
+// }
+//
+// The simplest cross-reference case (fA <--> fB) was implemented in previous
+// versions of MergeFunctions, though it occurred in only two function pairs
+// in the test-suite (which contains >50k functions).
+// The ability to detect complex cross-references (e.g. A->B->C->D->A)
+// could cover many more cases.
+//
//===----------------------------------------------------------------------===//
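A sketch of the scheme the comment above describes (mirroring, not quoting, the pass's insert() logic; FnTree and mergeTwoFunctions stand in for its members): a std::set keyed by the three-way comparison costs one log(N) lookup per candidate, and a failed insertion identifies the existing function to merge with.

    bool tryInsert(std::set<FunctionPtr> &FnTree, Function *F,
                   const DataLayout *DL) {
      auto Result = FnTree.insert(FunctionPtr(F, DL));
      if (Result.second)
        return false; // F starts a new equivalence class; nothing merged.
      mergeTwoFunctions(Result.first->getFunc(), F); // fold F into the leader
      return true;
    }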
#include "llvm/Transforms/IPO.h"
@@ -60,6 +98,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -73,89 +112,12 @@ STATISTIC(NumThunksWritten, "Number of thunks generated");
STATISTIC(NumAliasesWritten, "Number of aliases generated");
STATISTIC(NumDoubleWeak, "Number of new functions created");
-/// Returns the type id for a type to be hashed. We turn pointer types into
-/// integers here because the actual compare logic below considers pointers and
-/// integers of the same size as equal.
-static Type::TypeID getTypeIDForHash(Type *Ty) {
- if (Ty->isPointerTy())
- return Type::IntegerTyID;
- return Ty->getTypeID();
-}
-
-/// Creates a hash-code for the function which is the same for any two
-/// functions that will compare equal, without looking at the instructions
-/// inside the function.
-static unsigned profileFunction(const Function *F) {
- FunctionType *FTy = F->getFunctionType();
-
- FoldingSetNodeID ID;
- ID.AddInteger(F->size());
- ID.AddInteger(F->getCallingConv());
- ID.AddBoolean(F->hasGC());
- ID.AddBoolean(FTy->isVarArg());
- ID.AddInteger(getTypeIDForHash(FTy->getReturnType()));
- for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- ID.AddInteger(getTypeIDForHash(FTy->getParamType(i)));
- return ID.ComputeHash();
-}
-
-namespace {
-
-/// ComparableFunction - A struct that pairs together functions with a
-/// DataLayout so that we can keep them together as elements in the DenseSet.
-class ComparableFunction {
-public:
- static const ComparableFunction EmptyKey;
- static const ComparableFunction TombstoneKey;
- static DataLayout * const LookupOnly;
-
- ComparableFunction(Function *Func, const DataLayout *DL)
- : Func(Func), Hash(profileFunction(Func)), DL(DL) {}
-
- Function *getFunc() const { return Func; }
- unsigned getHash() const { return Hash; }
- const DataLayout *getDataLayout() const { return DL; }
-
- // Drops AssertingVH reference to the function. Outside of debug mode, this
- // does nothing.
- void release() {
- assert(Func &&
- "Attempted to release function twice, or release empty/tombstone!");
- Func = nullptr;
- }
-
-private:
- explicit ComparableFunction(unsigned Hash)
- : Func(nullptr), Hash(Hash), DL(nullptr) {}
-
- AssertingVH<Function> Func;
- unsigned Hash;
- const DataLayout *DL;
-};
-
-const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0);
-const ComparableFunction ComparableFunction::TombstoneKey =
- ComparableFunction(1);
-DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1);
-
-}
-
-namespace llvm {
- template <>
- struct DenseMapInfo<ComparableFunction> {
- static ComparableFunction getEmptyKey() {
- return ComparableFunction::EmptyKey;
- }
- static ComparableFunction getTombstoneKey() {
- return ComparableFunction::TombstoneKey;
- }
- static unsigned getHashValue(const ComparableFunction &CF) {
- return CF.getHash();
- }
- static bool isEqual(const ComparableFunction &LHS,
- const ComparableFunction &RHS);
- };
-}
+static cl::opt<unsigned> NumFunctionsForSanityCheck(
+ "mergefunc-sanity",
+ cl::desc("How many functions in module could be used for "
+ "MergeFunctions pass sanity check. "
+ "'0' disables this check. Works only with '-debug' key."),
+ cl::init(0), cl::Hidden);
namespace {
@@ -167,14 +129,14 @@ class FunctionComparator {
public:
FunctionComparator(const DataLayout *DL, const Function *F1,
const Function *F2)
- : F1(F1), F2(F2), DL(DL) {}
+ : FnL(F1), FnR(F2), DL(DL) {}
/// Test whether the two functions have equivalent behaviour.
- bool compare();
+ int compare();
private:
/// Test whether two basic blocks have equivalent behaviour.
- bool compare(const BasicBlock *BB1, const BasicBlock *BB2);
+ int compare(const BasicBlock *BBL, const BasicBlock *BBR);
/// Constants comparison.
/// It is an analog of lexicographical comparison between hypothetical numbers
@@ -300,10 +262,6 @@ private:
/// see comments for sn_mapL and sn_mapR.
int cmpValues(const Value *L, const Value *R);
- bool enumerate(const Value *V1, const Value *V2) {
- return cmpValues(V1, V2) == 0;
- }
-
/// Compare two Instructions for equivalence, similar to
/// Instruction::isSameOperationAs but with modifications to the type
/// comparison.
@@ -325,15 +283,11 @@ private:
/// 6.1.Load: volatile (as boolean flag)
/// 6.2.Load: alignment (as integer numbers)
/// 6.3.Load: synch-scope (as integer numbers)
+ /// 6.4.Load: range metadata (as integer numbers)
/// At this stage it is better to see the code, since it is no more than 10-15
/// lines per particular instruction, and could change over time.
int cmpOperation(const Instruction *L, const Instruction *R) const;
- bool isEquivalentOperation(const Instruction *I1,
- const Instruction *I2) const {
- return cmpOperation(I1, I2) == 0;
- }
-
/// Compare two GEPs for equivalent pointer arithmetic.
/// Parts to be compared for each comparison stage,
/// most significant stage first:
@@ -348,14 +302,6 @@ private:
return cmpGEP(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
}
- bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) {
- return cmpGEP(GEP1, GEP2) == 0;
- }
- bool isEquivalentGEP(const GetElementPtrInst *GEP1,
- const GetElementPtrInst *GEP2) {
- return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2));
- }
-
/// cmpType - compares two types,
/// defines total ordering among the types set.
///
@@ -398,10 +344,6 @@ private:
/// 6. For all other cases put llvm_unreachable.
int cmpType(Type *TyL, Type *TyR) const;
- bool isEquivalentType(Type *Ty1, Type *Ty2) const {
- return cmpType(Ty1, Ty2) == 0;
- }
-
int cmpNumbers(uint64_t L, uint64_t R) const;
int cmpAPInt(const APInt &L, const APInt &R) const;
@@ -410,7 +352,7 @@ private:
int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
// The two functions undergoing comparison.
- const Function *F1, *F2;
+ const Function *FnL, *FnR;
const DataLayout *DL;
@@ -450,6 +392,18 @@ private:
DenseMap<const Value*, int> sn_mapL, sn_mapR;
};
+class FunctionPtr {
+ AssertingVH<Function> F;
+ const DataLayout *DL;
+
+public:
+ FunctionPtr(Function *F, const DataLayout *DL) : F(F), DL(DL) {}
+ Function *getFunc() const { return F; }
+ void release() { F = 0; }
+ bool operator<(const FunctionPtr &RHS) const {
+ return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1;
+ }
+};
}
int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
@@ -788,7 +742,11 @@ int FunctionComparator::cmpOperation(const Instruction *L,
if (int Res =
cmpNumbers(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
return Res;
- return cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope());
+ if (int Res =
+ cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
+ return Res;
+ return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range),
+ (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
}
if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
if (int Res =
@@ -847,6 +805,9 @@ int FunctionComparator::cmpOperation(const Instruction *L,
if (int Res = cmpNumbers(CXI->isVolatile(),
cast<AtomicCmpXchgInst>(R)->isVolatile()))
return Res;
+ if (int Res = cmpNumbers(CXI->isWeak(),
+ cast<AtomicCmpXchgInst>(R)->isWeak()))
+ return Res;
if (int Res = cmpNumbers(CXI->getSuccessOrdering(),
cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
return Res;
@@ -914,13 +875,13 @@ int FunctionComparator::cmpGEP(const GEPOperator *GEPL,
/// See comments in declaration for more details.
int FunctionComparator::cmpValues(const Value *L, const Value *R) {
// Catch self-reference case.
- if (L == F1) {
- if (R == F2)
+ if (L == FnL) {
+ if (R == FnR)
return 0;
return -1;
}
- if (R == F2) {
- if (L == F1)
+ if (R == FnR) {
+ if (L == FnL)
return 0;
return 1;
}
@@ -954,90 +915,102 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) {
return cmpNumbers(LeftSN.first->second, RightSN.first->second);
}
// Test whether two basic blocks have equivalent behaviour.
-bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) {
- BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end();
- BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end();
+int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) {
+ BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+ BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
do {
- if (!enumerate(F1I, F2I))
- return false;
+ if (int Res = cmpValues(InstL, InstR))
+ return Res;
- if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1I)) {
- const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2I);
- if (!GEP2)
- return false;
+ const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL);
+ const GetElementPtrInst *GEPR = dyn_cast<GetElementPtrInst>(InstR);
- if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand()))
- return false;
+ if (GEPL && !GEPR)
+ return 1;
+ if (GEPR && !GEPL)
+ return -1;
- if (!isEquivalentGEP(GEP1, GEP2))
- return false;
+ if (GEPL && GEPR) {
+ if (int Res =
+ cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
+ return Res;
+ if (int Res = cmpGEP(GEPL, GEPR))
+ return Res;
} else {
- if (!isEquivalentOperation(F1I, F2I))
- return false;
-
- assert(F1I->getNumOperands() == F2I->getNumOperands());
- for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) {
- Value *OpF1 = F1I->getOperand(i);
- Value *OpF2 = F2I->getOperand(i);
-
- if (!enumerate(OpF1, OpF2))
- return false;
+ if (int Res = cmpOperation(InstL, InstR))
+ return Res;
+ assert(InstL->getNumOperands() == InstR->getNumOperands());
- if (OpF1->getValueID() != OpF2->getValueID() ||
- !isEquivalentType(OpF1->getType(), OpF2->getType()))
- return false;
+ for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+ Value *OpL = InstL->getOperand(i);
+ Value *OpR = InstR->getOperand(i);
+ if (int Res = cmpValues(OpL, OpR))
+ return Res;
+ if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID()))
+ return Res;
+ // TODO: Already checked in cmpOperation
+ if (int Res = cmpType(OpL->getType(), OpR->getType()))
+ return Res;
}
}
- ++F1I, ++F2I;
- } while (F1I != F1E && F2I != F2E);
+ ++InstL, ++InstR;
+ } while (InstL != InstLE && InstR != InstRE);
- return F1I == F1E && F2I == F2E;
+ if (InstL != InstLE && InstR == InstRE)
+ return 1;
+ if (InstL == InstLE && InstR != InstRE)
+ return -1;
+ return 0;
}
// Test whether the two functions have equivalent behaviour.
-bool FunctionComparator::compare() {
- // We need to recheck everything, but check the things that weren't included
- // in the hash first.
+int FunctionComparator::compare() {
sn_mapL.clear();
sn_mapR.clear();
- if (F1->getAttributes() != F2->getAttributes())
- return false;
+ if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
+ return Res;
- if (F1->hasGC() != F2->hasGC())
- return false;
+ if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
+ return Res;
- if (F1->hasGC() && F1->getGC() != F2->getGC())
- return false;
+ if (FnL->hasGC()) {
+ if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC()))
+ return Res;
+ }
- if (F1->hasSection() != F2->hasSection())
- return false;
+ if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
+ return Res;
- if (F1->hasSection() && F1->getSection() != F2->getSection())
- return false;
+ if (FnL->hasSection()) {
+ if (int Res = cmpStrings(FnL->getSection(), FnR->getSection()))
+ return Res;
+ }
- if (F1->isVarArg() != F2->isVarArg())
- return false;
+ if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
+ return Res;
// TODO: if it's internal and only used in direct calls, we could handle this
// case too.
- if (F1->getCallingConv() != F2->getCallingConv())
- return false;
+ if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
+ return Res;
- if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType()))
- return false;
+ if (int Res = cmpType(FnL->getFunctionType(), FnR->getFunctionType()))
+ return Res;
- assert(F1->arg_size() == F2->arg_size() &&
+ assert(FnL->arg_size() == FnR->arg_size() &&
"Identically typed functions have different numbers of args!");
// Visit the arguments so that they get enumerated in the order they're
// passed in.
- for (Function::const_arg_iterator f1i = F1->arg_begin(),
- f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) {
- if (!enumerate(f1i, f2i))
+ for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
+ ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
+ if (cmpValues(ArgLI, ArgRI) != 0)
llvm_unreachable("Arguments repeat!");
}
@@ -1045,33 +1018,36 @@ bool FunctionComparator::compare() {
// linked list is immaterial. Our walk starts at the entry block for both
// functions, then takes each block from each terminator in order. As an
// artifact, this also means that unreachable blocks are ignored.
- SmallVector<const BasicBlock *, 8> F1BBs, F2BBs;
+ SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1.
- F1BBs.push_back(&F1->getEntryBlock());
- F2BBs.push_back(&F2->getEntryBlock());
+ FnLBBs.push_back(&FnL->getEntryBlock());
+ FnRBBs.push_back(&FnR->getEntryBlock());
- VisitedBBs.insert(F1BBs[0]);
- while (!F1BBs.empty()) {
- const BasicBlock *F1BB = F1BBs.pop_back_val();
- const BasicBlock *F2BB = F2BBs.pop_back_val();
+ VisitedBBs.insert(FnLBBs[0]);
+ while (!FnLBBs.empty()) {
+ const BasicBlock *BBL = FnLBBs.pop_back_val();
+ const BasicBlock *BBR = FnRBBs.pop_back_val();
- if (!enumerate(F1BB, F2BB) || !compare(F1BB, F2BB))
- return false;
+ if (int Res = cmpValues(BBL, BBR))
+ return Res;
+
+ if (int Res = compare(BBL, BBR))
+ return Res;
- const TerminatorInst *F1TI = F1BB->getTerminator();
- const TerminatorInst *F2TI = F2BB->getTerminator();
+ const TerminatorInst *TermL = BBL->getTerminator();
+ const TerminatorInst *TermR = BBR->getTerminator();
- assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors());
- for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(F1TI->getSuccessor(i)))
+ assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
+ for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(TermL->getSuccessor(i)))
continue;
- F1BBs.push_back(F1TI->getSuccessor(i));
- F2BBs.push_back(F2TI->getSuccessor(i));
+ FnLBBs.push_back(TermL->getSuccessor(i));
+ FnRBBs.push_back(TermR->getSuccessor(i));
}
}
- return true;
+ return 0;
}
namespace {
@@ -1092,21 +1068,25 @@ public:
bool runOnModule(Module &M) override;
private:
- typedef DenseSet<ComparableFunction> FnSetType;
+ typedef std::set<FunctionPtr> FnTreeType;
/// A work queue of functions that may have been modified and should be
/// analyzed again.
std::vector<WeakVH> Deferred;
- /// Insert a ComparableFunction into the FnSet, or merge it away if it's
+ /// Checks the rules of the order relation introduced on the set of
+ /// functions. Returns true if the sanity check passed, and false otherwise.
+ bool doSanityCheck(std::vector<WeakVH> &Worklist);
+
+ /// Insert a function into the FnTree, or merge it away if it's
/// equal to one that's already present.
- bool insert(ComparableFunction &NewF);
+ bool insert(Function *NewFunction);
- /// Remove a Function from the FnSet and queue it up for a second sweep of
+ /// Remove a Function from the FnTree and queue it up for a second sweep of
/// analysis.
void remove(Function *F);
- /// Find the functions that use this Value and remove them from FnSet and
+ /// Find the functions that use this Value and remove them from FnTree and
/// queue the functions.
void removeUsers(Value *V);
@@ -1131,7 +1111,7 @@ private:
/// The set of all distinct functions. Use the insert() and remove() methods
/// to modify it.
- FnSetType FnSet;
+ FnTreeType FnTree;
/// DataLayout for more accurate GEP comparisons. May be NULL.
const DataLayout *DL;
@@ -1149,6 +1129,78 @@ ModulePass *llvm::createMergeFunctionsPass() {
return new MergeFunctions();
}
+bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
+ if (const unsigned Max = NumFunctionsForSanityCheck) {
+ unsigned TripleNumber = 0;
+ bool Valid = true;
+
+ dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
+
+ unsigned i = 0;
+ for (std::vector<WeakVH>::iterator I = Worklist.begin(), E = Worklist.end();
+ I != E && i < Max; ++I, ++i) {
+ unsigned j = i;
+ for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) {
+ Function *F1 = cast<Function>(*I);
+ Function *F2 = cast<Function>(*J);
+ int Res1 = FunctionComparator(DL, F1, F2).compare();
+ int Res2 = FunctionComparator(DL, F2, F1).compare();
+
+ // If F1 <= F2, then F2 >= F1; otherwise report a failure.
+ if (Res1 != -Res2) {
+ dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
+ << "\n";
+ F1->dump();
+ F2->dump();
+ Valid = false;
+ }
+
+ if (Res1 == 0)
+ continue;
+
+ unsigned k = j;
+ for (std::vector<WeakVH>::iterator K = J; K != E && k < Max;
+ ++k, ++K, ++TripleNumber) {
+ if (K == J)
+ continue;
+
+ Function *F3 = cast<Function>(*K);
+ int Res3 = FunctionComparator(DL, F1, F3).compare();
+ int Res4 = FunctionComparator(DL, F2, F3).compare();
+
+ bool Transitive = true;
+
+ if (Res1 != 0 && Res1 == Res4) {
+ // F1 > F2, F2 > F3 => F1 > F3
+ Transitive = Res3 == Res1;
+ } else if (Res3 != 0 && Res3 == -Res4) {
+ // F1 > F3, F3 > F2 => F1 > F2
+ Transitive = Res3 == Res1;
+ } else if (Res4 != 0 && -Res3 == Res4) {
+ // F2 > F3, F3 > F1 => F2 > F1
+ Transitive = Res4 == -Res1;
+ }
+
+ if (!Transitive) {
+ dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
+ << TripleNumber << "\n";
+ dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
+ << Res4 << "\n";
+ F1->dump();
+ F2->dump();
+ F3->dump();
+ Valid = false;
+ }
+ }
+ }
+ }
+
+ dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
+ return Valid;
+ }
+ return true;
+}
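
The check above probes exactly the two properties std::set needs from the new
comparator: antisymmetry (Res1 == -Res2) and transitivity across every triple.
As a standalone illustration (hypothetical Item type and cmpItems function,
plain C++ rather than FunctionComparator), here is how a three-way comparator
with those properties backs a std::set that "merges away" equivalent entries:

    #include <cassert>
    #include <set>
    #include <string>

    struct Item {
      int Rank;
      std::string Name;
    };

    // Three-way comparator: negative if L < R, zero if equivalent, positive
    // if L > R. It must be antisymmetric and transitive, which is what
    // doSanityCheck verifies for FunctionComparator::compare().
    static int cmpItems(const Item &L, const Item &R) {
      if (L.Rank < R.Rank) return -1;
      if (L.Rank > R.Rank) return 1;
      return 0; // Equivalent: std::set will reject the insertion.
    }

    struct ItemLess {
      bool operator()(const Item &L, const Item &R) const {
        return cmpItems(L, R) < 0;
      }
    };

    int main() {
      std::set<Item, ItemLess> Tree;
      assert(Tree.insert({1, "first"}).second);       // Unique: kept.
      assert(!Tree.insert({1, "equivalent"}).second); // Collides: merged away.
      assert(Tree.insert({2, "second"}).second);
      return 0;
    }

If the comparator broke either property, std::set lookups could miss elements
that are present, which is why the pass self-tests in debug builds.
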
+
bool MergeFunctions::runOnModule(Module &M) {
bool Changed = false;
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
@@ -1158,12 +1210,13 @@ bool MergeFunctions::runOnModule(Module &M) {
if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
Deferred.push_back(WeakVH(I));
}
- FnSet.resize(Deferred.size());
do {
std::vector<WeakVH> Worklist;
Deferred.swap(Worklist);
+ DEBUG(doSanityCheck(Worklist));
+
DEBUG(dbgs() << "size of module: " << M.size() << '\n');
DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
@@ -1175,8 +1228,7 @@ bool MergeFunctions::runOnModule(Module &M) {
Function *F = cast<Function>(*I);
if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
!F->mayBeOverridden()) {
- ComparableFunction CF = ComparableFunction(F, DL);
- Changed |= insert(CF);
+ Changed |= insert(F);
}
}
@@ -1190,38 +1242,17 @@ bool MergeFunctions::runOnModule(Module &M) {
Function *F = cast<Function>(*I);
if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
F->mayBeOverridden()) {
- ComparableFunction CF = ComparableFunction(F, DL);
- Changed |= insert(CF);
+ Changed |= insert(F);
}
}
- DEBUG(dbgs() << "size of FnSet: " << FnSet.size() << '\n');
+ DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
} while (!Deferred.empty());
- FnSet.clear();
+ FnTree.clear();
return Changed;
}
-bool DenseMapInfo<ComparableFunction>::isEqual(const ComparableFunction &LHS,
- const ComparableFunction &RHS) {
- if (LHS.getFunc() == RHS.getFunc() &&
- LHS.getHash() == RHS.getHash())
- return true;
- if (!LHS.getFunc() || !RHS.getFunc())
- return false;
-
- // One of these is a special "underlying pointer comparison only" object.
- if (LHS.getDataLayout() == ComparableFunction::LookupOnly ||
- RHS.getDataLayout() == ComparableFunction::LookupOnly)
- return false;
-
- assert(LHS.getDataLayout() == RHS.getDataLayout() &&
- "Comparing functions for different targets");
-
- return FunctionComparator(LHS.getDataLayout(), LHS.getFunc(),
- RHS.getFunc()).compare();
-}
-
// Replace direct callers of Old with New.
void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
@@ -1376,54 +1407,57 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
++NumFunctionsMerged;
}
-// Insert a ComparableFunction into the FnSet, or merge it away if equal to one
+// Insert a function into the FnTree, or merge it away if equal to one
// that was already inserted.
-bool MergeFunctions::insert(ComparableFunction &NewF) {
- std::pair<FnSetType::iterator, bool> Result = FnSet.insert(NewF);
+bool MergeFunctions::insert(Function *NewFunction) {
+ std::pair<FnTreeType::iterator, bool> Result =
+ FnTree.insert(FunctionPtr(NewFunction, DL));
+
if (Result.second) {
- DEBUG(dbgs() << "Inserting as unique: " << NewF.getFunc()->getName() << '\n');
+ DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
return false;
}
- const ComparableFunction &OldF = *Result.first;
+ const FunctionPtr &OldF = *Result.first;
// Don't merge tiny functions, since it can just end up making the function
// larger.
// FIXME: Should still merge them if they are unnamed_addr and produce an
// alias.
- if (NewF.getFunc()->size() == 1) {
- if (NewF.getFunc()->front().size() <= 2) {
- DEBUG(dbgs() << NewF.getFunc()->getName()
- << " is to small to bother merging\n");
+ if (NewFunction->size() == 1) {
+ if (NewFunction->front().size() <= 2) {
+ DEBUG(dbgs() << NewFunction->getName()
+ << " is to small to bother merging\n");
return false;
}
}
// Never thunk a strong function to a weak function.
- assert(!OldF.getFunc()->mayBeOverridden() ||
- NewF.getFunc()->mayBeOverridden());
+ assert(!OldF.getFunc()->mayBeOverridden() || NewFunction->mayBeOverridden());
- DEBUG(dbgs() << " " << OldF.getFunc()->getName() << " == "
- << NewF.getFunc()->getName() << '\n');
+ DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
- Function *DeleteF = NewF.getFunc();
- NewF.release();
+ Function *DeleteF = NewFunction;
mergeTwoFunctions(OldF.getFunc(), DeleteF);
return true;
}
-// Remove a function from FnSet. If it was already in FnSet, add it to Deferred
-// so that we'll look at it in the next round.
+// Remove a function from FnTree. If it was already in FnTree, add
+// it to Deferred so that we'll look at it in the next round.
void MergeFunctions::remove(Function *F) {
// We need to make sure we remove F, not a function "equal" to F per the
// function equality comparator.
- //
- // The special "lookup only" ComparableFunction bypasses the expensive
- // function comparison in favour of a pointer comparison on the underlying
- // Function*'s.
- ComparableFunction CF = ComparableFunction(F, ComparableFunction::LookupOnly);
- if (FnSet.erase(CF)) {
- DEBUG(dbgs() << "Removed " << F->getName() << " from set and deferred it.\n");
+ FnTreeType::iterator found = FnTree.find(FunctionPtr(F, DL));
+ size_t Erased = 0;
+ if (found != FnTree.end() && found->getFunc() == F) {
+ Erased = 1;
+ FnTree.erase(found);
+ }
+
+ if (Erased) {
+ DEBUG(dbgs() << "Removed " << F->getName()
+ << " from set and deferred it.\n");
Deferred.push_back(F);
}
}
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 38e1b8e..46a3187 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -53,6 +53,10 @@ static cl::opt<bool>
RunLoopRerolling("reroll-loops", cl::Hidden,
cl::desc("Run the loop rerolling pass"));
+static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false),
+ cl::Hidden,
+ cl::desc("Run the load combining pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -65,6 +69,7 @@ PassManagerBuilder::PassManagerBuilder() {
SLPVectorize = RunSLPVectorization;
LoopVectorize = RunLoopVectorization;
RerollLoops = RunLoopRerolling;
+ LoadCombine = RunLoadCombine;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -151,9 +156,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
if (!DisableUnitAtATime) {
addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+ MPM.add(createIPSCCPPass()); // IP SCCP
MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
- MPM.add(createIPSCCPPass()); // IP SCCP
MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
@@ -236,6 +241,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(createLoopUnrollPass());
}
+ if (LoadCombine)
+ MPM.add(createLoadCombinePass());
+
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createInstructionCombiningPass()); // Clean up after everything.
@@ -352,6 +360,9 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
// More scalar chains could be vectorized due to more alias information
PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (LoadCombine)
+ PM.add(createLoadCombinePass());
+
// Cleanup and simplify the code after the scalar optimizations.
PM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, PM);
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index e04b1be..ab4dc1c 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -37,8 +37,9 @@ enum SelectPatternFlavor {
SPF_SMIN,
SPF_UMIN,
SPF_SMAX,
- SPF_UMAX
- // SPF_ABS - TODO.
+ SPF_UMAX,
+ SPF_ABS,
+ SPF_NABS
};
/// getComplexity: Assign a complexity or rank value to LLVM Values...
@@ -246,6 +247,7 @@ private:
bool DoXform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
+ bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS);
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c37a9cf..99f0f1f 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -865,69 +865,170 @@ Value *FAddCombine::createAddendVal
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-// dyn_castFoldableMul - If this value is a multiply that can be folded into
-// other computations (because it has a constant operand), return the
-// non-constant operand of the multiply, and set CST to point to the multiplier.
-// Otherwise, return null.
-//
-static inline Value *dyn_castFoldableMul(Value *V, Constant *&CST) {
- if (!V->hasOneUse() || !V->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return nullptr;
-
- if (I->getOpcode() == Instruction::Mul)
- if ((CST = dyn_cast<Constant>(I->getOperand(1))))
- return I->getOperand(0);
- if (I->getOpcode() == Instruction::Shl)
- if ((CST = dyn_cast<Constant>(I->getOperand(1)))) {
- // The multiplier is really 1 << CST.
- CST = ConstantExpr::getShl(ConstantInt::get(V->getType(), 1), CST);
- return I->getOperand(0);
- }
- return nullptr;
+// If one of the operands only has one non-zero bit, and if the other
+// operand has a known-zero bit in a more significant place than it (not
+// including the sign bit), the ripple may go up to and fill the zero, but
+// won't change the sign. For example, (X & ~4) + 1.
+static bool checkRippleForAdd(const APInt &Op0KnownZero,
+ const APInt &Op1KnownZero) {
+ APInt Op1MaybeOne = ~Op1KnownZero;
+ // Make sure that one of the operands has at most one bit set to 1.
+ if (Op1MaybeOne.countPopulation() != 1)
+ return false;
+
+ // Find the most significant known 0 other than the sign bit.
+ int BitWidth = Op0KnownZero.getBitWidth();
+ APInt Op0KnownZeroTemp(Op0KnownZero);
+ Op0KnownZeroTemp.clearBit(BitWidth - 1);
+ int Op0ZeroPosition = BitWidth - Op0KnownZeroTemp.countLeadingZeros() - 1;
+
+ int Op1OnePosition = BitWidth - Op1MaybeOne.countLeadingZeros() - 1;
+ assert(Op1OnePosition >= 0);
+
+ // This also covers the case of no known zero, since in that case
+ // Op0ZeroPosition is -1.
+ return Op0ZeroPosition >= Op1OnePosition;
}
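
The reasoning in the comment above can be mirrored on plain 8-bit masks. A
minimal sketch (hypothetical helper names, uint8_t standing in for APInt):

    #include <cassert>
    #include <cstdint>

    // Position of the most significant set bit, or -1 if none.
    static int msb(uint8_t V) {
      for (int I = 7; I >= 0; --I)
        if (V & (1u << I))
          return I;
      return -1;
    }

    // Mirror of checkRippleForAdd: a set bit in a KnownZero mask means the
    // corresponding operand bit is known to be 0.
    static bool rippleWontFlipSign(uint8_t Op0KnownZero, uint8_t Op1KnownZero) {
      uint8_t Op1MaybeOne = (uint8_t)~Op1KnownZero;
      // Op1 must have exactly one bit that could be 1.
      if (msb(Op1MaybeOne) < 0 || (Op1MaybeOne & (Op1MaybeOne - 1)) != 0)
        return false;
      // Most significant known zero of Op0, ignoring the sign bit; -1 when
      // there is none, which correctly fails the test below.
      int Op0ZeroPos = msb(Op0KnownZero & 0x7F);
      return Op0ZeroPos >= msb(Op1MaybeOne);
    }

    int main() {
      // (X & ~4) + 1: bit 2 of the left operand is known zero (mask 0x04),
      // and the right operand is 0 or 1 (bits 1-7 known zero).
      assert(rippleWontFlipSign(0x04, 0xFE));
      // X + 1 with nothing known about X: the carry may reach the sign bit.
      assert(!rippleWontFlipSign(0x00, 0xFE));
      return 0;
    }
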
-
/// WillNotOverflowSignedAdd - Return true if we can prove that:
/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
/// This basically requires proving that the add in the original type would not
/// overflow to change the sign bit or have a carry out.
+/// TODO: Handle this for Vectors.
bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
// There are different heuristics we can use for this. Here are some simple
// ones.
- // Add has the property that adding any two 2's complement numbers can only
- // have one carry bit which can change a sign. As such, if LHS and RHS each
- // have at least two sign bits, we know that the addition of the two values
- // will sign extend fine.
+ // If LHS and RHS each have at least two sign bits, the addition will look
+ // like
+ //
+ // XX..... +
+ // YY.....
+ //
+ // If the carry into the most significant position is 0, X and Y can't both
+ // be 1 and therefore the carry out of the addition is also 0.
+ //
+ // If the carry into the most significant position is 1, X and Y can't both
+ // be 0 and therefore the carry out of the addition is also 1.
+ //
+ // Since the carry into the most significant position is always equal to
+ // the carry out of the addition, there is no signed overflow.
if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
return true;
+ if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) {
+ int BitWidth = IT->getBitWidth();
+ APInt LHSKnownZero(BitWidth, 0);
+ APInt LHSKnownOne(BitWidth, 0);
+ computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
- // If one of the operands only has one non-zero bit, and if the other operand
- // has a known-zero bit in a more significant place than it (not including the
- // sign bit) the ripple may go up to and fill the zero, but won't change the
- // sign. For example, (X & ~4) + 1.
+ APInt RHSKnownZero(BitWidth, 0);
+ APInt RHSKnownOne(BitWidth, 0);
+ computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+
+ // Addition of two 2's complement numbers having opposite signs will never
+ // overflow.
+ if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
+ (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
+ return true;
+
+ // Check if carry bit of addition will not cause overflow.
+ if (checkRippleForAdd(LHSKnownZero, RHSKnownZero))
+ return true;
+ if (checkRippleForAdd(RHSKnownZero, LHSKnownZero))
+ return true;
+ }
+ return false;
+}
- // TODO: Implement.
+/// WillNotOverflowUnsignedAdd - Return true if we can prove that:
+/// (zext (add LHS, RHS)) === (add (zext LHS), (zext RHS))
+bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS) {
+ // There are different heuristics we can use for this. Here is a simple one.
+ // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap.
+ bool LHSKnownNonNegative, LHSKnownNegative;
+ bool RHSKnownNonNegative, RHSKnownNegative;
+ ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0);
+ ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0);
+ if (LHSKnownNonNegative && RHSKnownNonNegative)
+ return true;
return false;
}
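
Both helpers justify flag-setting on the add, and their headline facts are
easy to confirm exhaustively on i8. A brute-force check (plain ints standing
in for the known-bits and sign-bit analyses):

    #include <cassert>

    int main() {
      for (int L = -128; L <= 127; ++L) {
        for (int R = -128; R <= 127; ++R) {
          // Known opposite signs: the signed sum always fits in i8, so nsw
          // holds, per WillNotOverflowSignedAdd.
          if ((L < 0) != (R < 0))
            assert(L + R >= -128 && L + R <= 127);
          // At least two sign bits each (values in [-64, 63]): also no signed
          // overflow, per the carry argument above.
          if (L >= -64 && L <= 63 && R >= -64 && R <= 63)
            assert(L + R >= -128 && L + R <= 127);
          // Both sign bits known zero: the unsigned sum stays below 256, so
          // nuw holds, per WillNotOverflowUnsignedAdd.
          if (L >= 0 && R >= 0)
            assert((unsigned)L + (unsigned)R <= 255u);
        }
      }
      return 0;
    }
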
-Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
+// Checks if any operand is negative and we can convert add to sub.
+// This function checks for following negative patterns
+// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
+// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
+// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
+static Value *checkForNegativeOperand(BinaryOperator &I,
+ InstCombiner::BuilderTy *Builder) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ // This function creates 2 instructions to replace ADD; we need at least one
+ // of LHS or RHS to have one use to ensure a benefit from the transform.
+ if (!LHS->hasOneUse() && !RHS->hasOneUse())
+ return nullptr;
- if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL))
- return ReplaceInstUsesWith(I, V);
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+ const APInt *C1 = nullptr, *C2 = nullptr;
+
+ // if ONE is on other side, swap
+ if (match(RHS, m_Add(m_Value(X), m_One())))
+ std::swap(LHS, RHS);
+
+ if (match(LHS, m_Add(m_Value(X), m_One()))) {
+ // if XOR on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(X, RHS);
+
+ if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
+ // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
+ if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
+ Value *NewAnd = Builder->CreateAnd(Z, *C1);
+ return Builder->CreateSub(RHS, NewAnd, "sub");
+ } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
+ // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
+ Value *NewOr = Builder->CreateOr(Z, ~(*C1));
+ return Builder->CreateSub(RHS, NewOr, "sub");
+ }
+ }
+ }
+
+ // Restore LHS and RHS
+ LHS = I.getOperand(0);
+ RHS = I.getOperand(1);
+
+ // if XOR is on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(LHS, RHS);
+
+ // C1 is ODD (so C2 == C1 - 1 is even)
+ // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
+ // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
+ if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ if (C1->countTrailingZeros() == 0)
+ if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
+ Value *NewOr = Builder->CreateOr(Z, ~(*C2));
+ return Builder->CreateSub(RHS, NewOr, "sub");
+ }
+ return nullptr;
+}
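
Each pattern above rests on a two's-complement identity, and all three can be
verified exhaustively on 8-bit values. A standalone check (uint8_t arithmetic
models the wrap-around):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned Zi = 0; Zi < 256; ++Zi) {
        for (unsigned Ci = 0; Ci < 256; ++Ci) {
          uint8_t Z = (uint8_t)Zi, C = (uint8_t)Ci;
          uint8_t NotC = (uint8_t)~C;
          // ((Z | ~C) ^ C) + 1 == -(Z & C)
          assert((uint8_t)(((Z | NotC) ^ C) + 1) == (uint8_t)-(Z & C));
          // ((Z & C) ^ C) + 1 == -(Z | ~C)
          assert((uint8_t)(((Z & C) ^ C) + 1) == (uint8_t)-(Z | NotC));
          // For odd C1 == C2 + 1 (so C2 even): (Z & C2) ^ C1 == -(Z | ~C2)
          if ((C & 1) == 0) {
            uint8_t C1 = (uint8_t)(C + 1);
            assert((uint8_t)((Z & C) ^ C1) == (uint8_t)-(Z | NotC));
          }
        }
      }
      return 0;
    }
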
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ if (Value *V = SimplifyVectorOp(I))
+ return ReplaceInstUsesWith(I, V);
- // (A*B)+(A*C) -> A*(B+C) etc
+ if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
+ I.hasNoUnsignedWrap(), DL))
+ return ReplaceInstUsesWith(I, V);
+
+ // (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return ReplaceInstUsesWith(I, V);
@@ -1025,23 +1126,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = dyn_castNegVal(RHS))
return BinaryOperator::CreateSub(LHS, V);
-
- {
- Constant *C2;
- if (Value *X = dyn_castFoldableMul(LHS, C2)) {
- if (X == RHS) // X*C + X --> X * (C+1)
- return BinaryOperator::CreateMul(RHS, AddOne(C2));
-
- // X*C1 + X*C2 --> X * (C1+C2)
- Constant *C1;
- if (X == dyn_castFoldableMul(RHS, C1))
- return BinaryOperator::CreateMul(X, ConstantExpr::getAdd(C1, C2));
- }
-
- // X + X*C --> X * (C+1)
- if (dyn_castFoldableMul(RHS, C2) == LHS)
- return BinaryOperator::CreateMul(LHS, AddOne(C2));
- }
+ if (Value *V = checkForNegativeOperand(I, Builder))
+ return ReplaceInstUsesWith(I, V);
// A+B --> A|B iff A and B have no bits set in common.
if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
@@ -1059,29 +1145,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
- // W*X + Y*Z --> W * (X+Z) iff W == Y
- {
- Value *W, *X, *Y, *Z;
- if (match(LHS, m_Mul(m_Value(W), m_Value(X))) &&
- match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) {
- if (W != Y) {
- if (W == Z) {
- std::swap(Y, Z);
- } else if (Y == X) {
- std::swap(W, X);
- } else if (X == Z) {
- std::swap(Y, Z);
- std::swap(W, X);
- }
- }
-
- if (W == Y) {
- Value *NewAdd = Builder->CreateAdd(X, Z, LHS->getName());
- return BinaryOperator::CreateMul(W, NewAdd);
- }
- }
- }
-
if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
Value *X;
if (match(LHS, m_Not(m_Value(X)))) // ~X + C --> (C-1) - X
@@ -1191,6 +1254,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return BinaryOperator::CreateOr(A, B);
}
+ // TODO(jingyue): Consider WillNotOverflowSignedAdd and
+ // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // computeKnownBits.
+ if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
return Changed ? &I : nullptr;
}
@@ -1478,9 +1553,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Op0,
Builder->CreateNot(Y, Y->getName() + ".not"));
- // 0 - (X sdiv C) -> (X sdiv -C)
- if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) &&
- match(Op0, m_Zero()))
+ // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow.
+ if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) &&
+ !C->isMinSignedValue())
return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C));
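
The new !C->isMinSignedValue() guard matters because negating the minimum
signed value wraps back to itself, so the rewritten divisor would be wrong. A
tiny demonstration in i8 terms (uint8_t keeps the arithmetic well defined):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t C = 0x80;                 // Bit pattern of i8 -128.
      uint8_t NegC = (uint8_t)(0u - C); // Two's-complement negation mod 256.
      assert(NegC == C);                // -(-128) == -128 in 8 bits, so
      return 0;                         // "sdiv -C" would still divide by -128.
    }
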
// 0 - (X << Y) -> (-X << Y) when X is freely negatable.
@@ -1488,19 +1563,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *XNeg = dyn_castNegVal(X))
return BinaryOperator::CreateShl(XNeg, Y);
- // X - X*C --> X * (1-C)
- if (match(Op1, m_Mul(m_Specific(Op0), m_Constant(CI)))) {
- Constant *CP1 = ConstantExpr::getSub(ConstantInt::get(I.getType(),1), CI);
- return BinaryOperator::CreateMul(Op0, CP1);
- }
-
- // X - X<<C --> X * (1-(1<<C))
- if (match(Op1, m_Shl(m_Specific(Op0), m_Constant(CI)))) {
- Constant *One = ConstantInt::get(I.getType(), 1);
- C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI));
- return BinaryOperator::CreateMul(Op0, C);
- }
-
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
@@ -1517,16 +1579,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
}
}
- Constant *C1;
- if (Value *X = dyn_castFoldableMul(Op0, C1)) {
- if (X == Op1) // X*C - X --> X * (C-1)
- return BinaryOperator::CreateMul(Op1, SubOne(C1));
-
- Constant *C2; // X*C1 - X*C2 -> X * (C1-C2)
- if (X == dyn_castFoldableMul(Op1, C2))
- return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2));
- }
-
// Optimize pointer differences into the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
if (DL) {
@@ -1541,7 +1593,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
return ReplaceInstUsesWith(I, Res);
- }
+ }
return nullptr;
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4f5d65a..b23a606 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1996,29 +1996,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
C1 = dyn_cast<ConstantInt>(C);
C2 = dyn_cast<ConstantInt>(D);
if (C1 && C2) { // (A & C1)|(B & C2)
- // If we have: ((V + N) & C1) | (V & C2)
- // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
- // replace with V+N.
- if (C1->getValue() == ~C2->getValue()) {
- if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
- match(A, m_Add(m_Value(V1), m_Value(V2)))) {
- // Add commutes, try both ways.
- if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
- return ReplaceInstUsesWith(I, A);
- if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
- return ReplaceInstUsesWith(I, A);
- }
- // Or commutes, try both ways.
- if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
- match(B, m_Add(m_Value(V1), m_Value(V2)))) {
- // Add commutes, try both ways.
- if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
- return ReplaceInstUsesWith(I, B);
- if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
- return ReplaceInstUsesWith(I, B);
- }
- }
-
if ((C1->getValue() & C2->getValue()) == 0) {
// ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
// iff (C1&C2) == 0 and (N&~C1) == 0
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d4b583b..658178d 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -421,6 +421,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
}
}
+
+ // We can strength reduce this signed add into a regular add if we
+ // can prove that it will never overflow.
+ if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) {
+ Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+ if (WillNotOverflowSignedAdd(LHS, RHS)) {
+ Value *Add = Builder->CreateNSWAdd(LHS, RHS);
+ Add->takeName(&CI);
+ Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()};
+ StructType *ST = cast<StructType>(II->getType());
+ Constant *Struct = ConstantStruct::get(ST, V);
+ return InsertValueInst::Create(Struct, Add, 0);
+ }
+ }
+
break;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
@@ -800,6 +815,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_vperm:
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+ // Note that ppc_altivec_vperm has a big-endian bias, so when creating
+ // a vector shuffle for little endian, we must undo the transformation
+ // performed on vec_perm in altivec.h. That is, we must complement
+ // the permutation mask with respect to 31 and reverse the order of
+ // V1 and V2.
if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
assert(Mask->getType()->getVectorNumElements() == 16 &&
"Bad type for intrinsic!");
@@ -832,10 +852,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
unsigned Idx =
cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
Idx &= 31; // Match the hardware behavior.
+ if (DL && DL->isLittleEndian())
+ Idx = 31 - Idx;
if (!ExtractedElts[Idx]) {
+ Value *Op0ToUse = (DL && DL->isLittleEndian()) ? Op1 : Op0;
+ Value *Op1ToUse = (DL && DL->isLittleEndian()) ? Op0 : Op1;
ExtractedElts[Idx] =
- Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1,
+ Builder->CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
Builder->getInt32(Idx&15));
}
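
The index identity behind the rewrite can be checked with a scalar model: for
the 32-byte concatenation V1||V2, reading byte Idx from one end is the same as
reading byte 31 - Idx from the reversed concatenation, whose halves appear in
swapped order. A standalone sketch (plain arrays, not the AltiVec intrinsic;
the full picture also involves how little endian lays out vector lanes):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t V1[16], V2[16], Fwd[32], Rev[32];
      for (int I = 0; I < 16; ++I) {
        V1[I] = (uint8_t)I;
        V2[I] = (uint8_t)(16 + I);
      }
      // Fwd = V1 || V2; Rev = byte-reversed Fwd = reversed V2 || reversed V1.
      for (int I = 0; I < 16; ++I) {
        Fwd[I] = V1[I];
        Fwd[16 + I] = V2[I];
        Rev[I] = V2[15 - I];
        Rev[16 + I] = V1[15 - I];
      }
      for (unsigned Idx = 0; Idx < 32; ++Idx)
        assert(Fwd[Idx] == Rev[31 - Idx]); // Complement against 31, swap halves.
      return 0;
    }
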
@@ -913,6 +937,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::AMDGPU_rcp: {
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
+ const APFloat &ArgVal = C->getValueAPF();
+ APFloat Val(ArgVal.getSemantics(), 1.0);
+ APFloat::opStatus Status = Val.divide(ArgVal,
+ APFloat::rmNearestTiesToEven);
+ // Only do this if it was exact and therefore not dependent on the
+ // rounding mode.
+ if (Status == APFloat::opOK)
+ return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
+ }
+
+ break;
+ }
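
The fold is gated on the division being exact so that no rounding-mode choice
gets baked into the constant. A standalone sketch of the same "fold only if
exact" rule with IEEE floats, using fma as a stand-in for APFloat's opOK
status (fma rounds the product-minus-one only once, so for normal values it is
zero exactly when Q * C == 1 holds in the reals):

    #include <cassert>
    #include <cmath>

    static bool foldsExactly(float C) {
      float Q = 1.0f / C;
      return std::fma(Q, C, -1.0f) == 0.0f;
    }

    int main() {
      assert(foldsExactly(2.0f));   // 1/2 is exact: fold to 0.5.
      assert(foldsExactly(-0.25f)); // 1/-0.25 == -4 is exact: fold.
      assert(!foldsExactly(3.0f));  // 1/3 rounds: keep the intrinsic.
      return 0;
    }
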
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 356803a..ff083d7 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1434,7 +1434,12 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
// If casting the result of a getelementptr instruction with no offset, turn
// this into a cast of the original pointer!
- if (GEP->hasAllZeroIndices()) {
+ if (GEP->hasAllZeroIndices() &&
+ // If CI is an addrspacecast and GEP changes the pointer type, merging
+ // GEP into CI would undo canonicalizing addrspacecast with different
+ // pointer types, causing infinite loops.
+ (!isa<AddrSpaceCastInst>(CI) ||
+ GEP->getType() == GEP->getPointerOperand()->getType())) {
// Changing the cast operand is usually not a good idea but it is safe
// here because the pointer operand is being replaced with another
// pointer operand so the opcode doesn't need to change.
@@ -1904,5 +1909,24 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+ // If the destination pointer element type is not the same as the source's,
+ // first do a bitcast to the destination element type in the source address
+ // space, and then the addrspacecast. This allows the cast to be exposed to
+ // other transforms.
+ Value *Src = CI.getOperand(0);
+ PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
+ PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
+
+ Type *DestElemTy = DestTy->getElementType();
+ if (SrcTy->getElementType() != DestElemTy) {
+ Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
+ if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) {
+ // Handle vectors of pointers.
+ MidTy = VectorType::get(MidTy, VT->getNumElements());
+ }
+
+ Value *NewBitCast = Builder->CreateBitCast(Src, MidTy);
+ return new AddrSpaceCastInst(NewBitCast, CI.getType());
+ }
+
return commonPointerCastTransforms(CI);
}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 02e8bf1..5e71c5c 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -612,9 +612,10 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (ICmpInst::isSigned(Cond))
return nullptr;
- // Look through bitcasts.
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
- RHS = BCI->getOperand(0);
+ // Look through bitcasts and addrspacecasts. We do not, however, want to
+ // remove 0 GEPs.
+ if (!isa<GetElementPtrInst>(RHS))
+ RHS = RHS->stripPointerCasts();
Value *PtrBase = GEPLHS->getOperand(0);
if (DL && PtrBase == RHS && GEPLHS->isInBounds()) {
@@ -655,9 +656,24 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
(GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
PtrBase->stripPointerCasts() ==
GEPRHS->getOperand(0)->stripPointerCasts()) {
+ Value *LOffset = EmitGEPOffset(GEPLHS);
+ Value *ROffset = EmitGEPOffset(GEPRHS);
+
+ // If we looked through an addrspacecast between different sized address
+ // spaces, the LHS and RHS pointers are different sized
+ // integers. Truncate to the smaller one.
+ Type *LHSIndexTy = LOffset->getType();
+ Type *RHSIndexTy = ROffset->getType();
+ if (LHSIndexTy != RHSIndexTy) {
+ if (LHSIndexTy->getPrimitiveSizeInBits() <
+ RHSIndexTy->getPrimitiveSizeInBits()) {
+ ROffset = Builder->CreateTrunc(ROffset, LHSIndexTy);
+ } else
+ LOffset = Builder->CreateTrunc(LOffset, RHSIndexTy);
+ }
+
Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond),
- EmitGEPOffset(GEPLHS),
- EmitGEPOffset(GEPRHS));
+ LOffset, ROffset);
return ReplaceInstUsesWith(I, Cmp);
}
@@ -667,26 +683,12 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
// If one of the GEPs has all zero indices, recurse.
- bool AllZeros = true;
- for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
- if (!isa<Constant>(GEPLHS->getOperand(i)) ||
- !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
- AllZeros = false;
- break;
- }
- if (AllZeros)
+ if (GEPLHS->hasAllZeroIndices())
return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
ICmpInst::getSwappedPredicate(Cond), I);
// If the other GEP has all zero indices, recurse.
- AllZeros = true;
- for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
- if (!isa<Constant>(GEPRHS->getOperand(i)) ||
- !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
- AllZeros = false;
- break;
- }
- if (AllZeros)
+ if (GEPRHS->hasAllZeroIndices())
return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
@@ -2026,9 +2028,13 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV,
/// replacement required.
static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Value *OtherVal, InstCombiner &IC) {
+ // Don't bother doing this transformation for pointers, and don't do it for
+ // vectors.
+ if (!isa<IntegerType>(MulVal->getType()))
+ return nullptr;
+
assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
- assert(isa<IntegerType>(MulVal->getType()));
Instruction *MulInstr = cast<Instruction>(MulVal);
assert(MulInstr->getOpcode() == Instruction::Mul);
@@ -2523,7 +2529,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// bit is set. If the comparison is against zero, then this is a check
// to see if *that* bit is set.
APInt Op0KnownZeroInverted = ~Op0KnownZero;
- if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
+ if (~Op1KnownZero == 0) {
// If the LHS is an AND with the same constant, look through it.
Value *LHS = nullptr;
ConstantInt *LHSC = nullptr;
@@ -2533,11 +2539,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// If the LHS is 1 << x, and we know the result is a power of 2 like 8,
// then turn "((1 << x)&8) == 0" into "x != 3".
+ // or turn "((1 << x)&7) == 0" into "x > 2".
Value *X = nullptr;
if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
- return new ICmpInst(ICmpInst::ICMP_NE, X,
- ConstantInt::get(X->getType(), CmpVal));
+ APInt ValToCheck = Op0KnownZeroInverted;
+ if (ValToCheck.isPowerOf2()) {
+ unsigned CmpVal = ValToCheck.countTrailingZeros();
+ return new ICmpInst(ICmpInst::ICMP_NE, X,
+ ConstantInt::get(X->getType(), CmpVal));
+ } else if ((++ValToCheck).isPowerOf2()) {
+ unsigned CmpVal = ValToCheck.countTrailingZeros() - 1;
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), CmpVal));
+ }
}
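
Both the power-of-two case and the new mask-plus-one case are easy to
brute-force over every in-range shift amount. A quick standalone check, which
also covers the dual "!= 0" hunk further down:

    #include <cassert>

    int main() {
      for (unsigned X = 0; X < 8; ++X) {
        assert((((1u << X) & 8u) == 0) == (X != 3)); // mask is a power of two
        assert((((1u << X) & 7u) == 0) == (X > 2));  // mask + 1 is a power of two
        assert((((1u << X) & 7u) != 0) == (X < 3));  // the != 0 counterpart
      }
      return 0;
    }
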
// If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
@@ -2560,7 +2574,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// bit is set. If the comparison is against zero, then this is a check
// to see if *that* bit is set.
APInt Op0KnownZeroInverted = ~Op0KnownZero;
- if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
+ if (~Op1KnownZero == 0) {
// If the LHS is an AND with the same constant, look through it.
Value *LHS = nullptr;
ConstantInt *LHSC = nullptr;
@@ -2570,11 +2584,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// If the LHS is 1 << x, and we know the result is a power of 2 like 8,
// then turn "((1 << x)&8) != 0" into "x == 3".
+ // or turn "((1 << x)&7) != 0" into "x < 3".
Value *X = nullptr;
if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
- return new ICmpInst(ICmpInst::ICMP_EQ, X,
- ConstantInt::get(X->getType(), CmpVal));
+ APInt ValToCheck = Op0KnownZeroInverted;
+ if (ValToCheck.isPowerOf2()) {
+ unsigned CmpVal = ValToCheck.countTrailingZeros();
+ return new ICmpInst(ICmpInst::ICMP_EQ, X,
+ ConstantInt::get(X->getType(), CmpVal));
+ } else if ((++ValToCheck).isPowerOf2()) {
+ unsigned CmpVal = ValToCheck.countTrailingZeros();
+ return new ICmpInst(ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(X->getType(), CmpVal));
+ }
}
// If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 66d0938..c10e92a 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -50,99 +50,102 @@ static bool pointsToConstantGlobal(Value *V) {
/// can optimize this.
static bool
isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
- SmallVectorImpl<Instruction *> &ToDelete,
- bool IsOffset = false) {
+ SmallVectorImpl<Instruction *> &ToDelete) {
// We track lifetime intrinsics as we encounter them. If we decide to go
// ahead and replace the value with the global, this lets the caller quickly
// eliminate the markers.
- for (Use &U : V->uses()) {
- Instruction *I = cast<Instruction>(U.getUser());
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads, they are always ok.
- if (!LI->isSimple()) return false;
- continue;
- }
-
- if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
- // If uses of the bitcast are ok, we are ok.
- if (!isOnlyCopiedFromConstantGlobal(I, TheCopy, ToDelete, IsOffset))
- return false;
- continue;
- }
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
- // If the GEP has all zero indices, it doesn't offset the pointer. If it
- // doesn't, it does.
- if (!isOnlyCopiedFromConstantGlobal(
- GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices()))
- return false;
- continue;
- }
+ SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
+ ValuesToInspect.push_back(std::make_pair(V, false));
+ while (!ValuesToInspect.empty()) {
+ auto ValuePair = ValuesToInspect.pop_back_val();
+ const bool IsOffset = ValuePair.second;
+ for (auto &U : ValuePair.first->uses()) {
+ Instruction *I = cast<Instruction>(U.getUser());
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads, they are always ok.
+ if (!LI->isSimple()) return false;
+ continue;
+ }
- if (CallSite CS = I) {
- // If this is the function being called then we treat it like a load and
- // ignore it.
- if (CS.isCallee(&U))
+ if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
+ // If uses of the bitcast are ok, we are ok.
+ ValuesToInspect.push_back(std::make_pair(I, IsOffset));
continue;
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer;
+ // otherwise, it does.
+ ValuesToInspect.push_back(
+ std::make_pair(I, IsOffset || !GEP->hasAllZeroIndices()));
+ continue;
+ }
- // Inalloca arguments are clobbered by the call.
- unsigned ArgNo = CS.getArgumentNo(&U);
- if (CS.isInAllocaArgument(ArgNo))
- return false;
+ if (CallSite CS = I) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (CS.isCallee(&U))
+ continue;
- // If this is a readonly/readnone call site, then we know it is just a
- // load (but one that potentially returns the value itself), so we can
- // ignore it if we know that the value isn't captured.
- if (CS.onlyReadsMemory() &&
- (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
- continue;
+ // Inalloca arguments are clobbered by the call.
+ unsigned ArgNo = CS.getArgumentNo(&U);
+ if (CS.isInAllocaArgument(ArgNo))
+ return false;
- // If this is being passed as a byval argument, the caller is making a
- // copy, so it is only a read of the alloca.
- if (CS.isByValArgument(ArgNo))
- continue;
- }
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ if (CS.onlyReadsMemory() &&
+ (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (CS.isByValArgument(ArgNo))
+ continue;
+ }
- // Lifetime intrinsics can be handled by the caller.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- assert(II->use_empty() && "Lifetime markers have no result to use!");
- ToDelete.push_back(II);
- continue;
+ // Lifetime intrinsics can be handled by the caller.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ assert(II->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(II);
+ continue;
+ }
}
- }
- // If this is isn't our memcpy/memmove, reject it as something we can't
- // handle.
- MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
- if (!MI)
- return false;
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+ if (!MI)
+ return false;
- // If the transfer is using the alloca as a source of the transfer, then
- // ignore it since it is a load (unless the transfer is volatile).
- if (U.getOperandNo() == 1) {
- if (MI->isVolatile()) return false;
- continue;
- }
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (U.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
- // If we already have seen a copy, reject the second one.
- if (TheCopy) return false;
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
- // If the pointer has been offset from the start of the alloca, we can't
- // safely handle this.
- if (IsOffset) return false;
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
- // If the memintrinsic isn't using the alloca as the dest, reject it.
- if (U.getOperandNo() != 0) return false;
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (U.getOperandNo() != 0) return false;
- // If the source of the memcpy/move is not a constant global, reject it.
- if (!pointsToConstantGlobal(MI->getSource()))
- return false;
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!pointsToConstantGlobal(MI->getSource()))
+ return false;
- // Otherwise, the transform is safe. Remember the copy instruction.
- TheCopy = MI;
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
}
return true;
}
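
The restructuring above turns self-recursion through bitcasts and GEPs into an
explicit worklist, with the old IsOffset parameter carried in each entry. A
minimal sketch of the same pattern on a toy user graph (hypothetical Node type,
not LLVM's use lists):

    #include <cassert>
    #include <utility>
    #include <vector>

    struct Node {
      bool AddsOffset = false;
      std::vector<Node *> Users;
    };

    // The bool that used to be a parameter of the recursive call now travels
    // with each worklist entry instead.
    static bool reachesLeafWithOffset(Node *Root) {
      std::vector<std::pair<Node *, bool>> Worklist;
      Worklist.push_back({Root, false});
      while (!Worklist.empty()) {
        auto [N, IsOffset] = Worklist.back();
        Worklist.pop_back();
        for (Node *U : N->Users) {
          bool NowOffset = IsOffset || U->AddsOffset;
          if (U->Users.empty() && NowOffset)
            return true; // A leaf was reached through an offsetting step.
          Worklist.push_back({U, NowOffset});
        }
      }
      return false;
    }

    int main() {
      Node Leaf;
      Node Gep{/*AddsOffset=*/true, {&Leaf}};
      Node Root{false, {&Gep}};
      assert(reachesLeafWithOffset(&Root));
      return 0;
    }
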
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 9996ebc..6c6e7d8 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -203,8 +203,11 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Value *X;
Constant *C1;
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Add = Builder->CreateMul(X, Op1);
- return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, Op1));
+ Value *Mul = Builder->CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder->CreateMul(X, Op1), Mul);
}
}
}
@@ -990,6 +993,10 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
}
if (Constant *RHS = dyn_cast<Constant>(Op1)) {
+ // X/INT_MIN -> X == INT_MIN
+ if (RHS->isMinSignedValue())
+ return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType());
+
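
The fold works because no other value has a magnitude large enough for the
quotient to be nonzero: X sdiv INT_MIN is 1 exactly when X == INT_MIN and 0
otherwise, i.e. zext(icmp eq X, INT_MIN). Brute-forced on i8:

    #include <cassert>

    int main() {
      for (int X = -128; X <= 127; ++X)
        assert(X / -128 == (X == -128 ? 1 : 0)); // C division truncates, as sdiv does.
      return 0;
    }
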
// -X/C --> X/-C provided the negation doesn't overflow.
if (SubOperator *Sub = dyn_cast<SubOperator>(Op0))
if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap())
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 9a41e4b..06c9e29 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -31,13 +31,18 @@ MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) {
ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition());
if (!ICI) return SPF_UNKNOWN;
- LHS = ICI->getOperand(0);
- RHS = ICI->getOperand(1);
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+
+ LHS = CmpLHS;
+ RHS = CmpRHS;
// (icmp X, Y) ? X : Y
- if (SI->getTrueValue() == ICI->getOperand(0) &&
- SI->getFalseValue() == ICI->getOperand(1)) {
- switch (ICI->getPredicate()) {
+ if (TrueVal == CmpLHS && FalseVal == CmpRHS) {
+ switch (Pred) {
default: return SPF_UNKNOWN; // Equality.
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE: return SPF_UMAX;
@@ -51,18 +56,35 @@ MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) {
}
// (icmp X, Y) ? Y : X
- if (SI->getTrueValue() == ICI->getOperand(1) &&
- SI->getFalseValue() == ICI->getOperand(0)) {
- switch (ICI->getPredicate()) {
- default: return SPF_UNKNOWN; // Equality.
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_UGE: return SPF_UMIN;
- case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE: return SPF_SMIN;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_ULE: return SPF_UMAX;
- case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE: return SPF_SMAX;
+ if (TrueVal == CmpRHS && FalseVal == CmpLHS) {
+ switch (Pred) {
+ default: return SPF_UNKNOWN; // Equality.
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE: return SPF_UMIN;
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE: return SPF_SMIN;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_ULE: return SPF_UMAX;
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE: return SPF_SMAX;
+ }
+ }
+
+ if (ConstantInt *C1 = dyn_cast<ConstantInt>(CmpRHS)) {
+ if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) ||
+ (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) {
+
+ // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
+ // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
+ if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) {
+ return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS;
+ }
+
+ // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
+ // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
+ if (Pred == ICmpInst::ICMP_SLT && (C1->isZero() || C1->isOne())) {
+ return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS;
+ }
}
}
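
The recognized shapes (and their NABS duals) differ only in how zero is
attributed, which is why both the 0/-1 and 0/1 constants are accepted. A quick
check on plain ints:

    #include <cassert>

    int main() {
      for (int X = -10; X <= 10; ++X) {
        int Abs = X < 0 ? -X : X;
        assert((X > 0 ? X : -X) == Abs);  // ABS:  (X >s 0)  ? X : -X
        assert((X > -1 ? X : -X) == Abs); // ABS:  (X >s -1) ? X : -X
        assert((X < 0 ? -X : X) == Abs);  // ABS:  (X <s 0)  ? -X : X
        assert((X < 1 ? -X : X) == Abs);  // ABS:  (X <s 1)  ? -X : X
        assert((X > 0 ? -X : X) == -Abs); // NABS: (X >s 0)  ? -X : X
      }
      return 0;
    }
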
@@ -365,7 +387,15 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
/// 1. The icmp predicate is inverted
/// 2. The select operands are reversed
/// 3. The magnitude of C2 and C1 are flipped
-static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
+///
+/// This also tries to turn
+/// --- Single bit tests:
+/// if ((x & C) == 0) x |= C to x |= C
+/// if ((x & C) != 0) x ^= C to x &= ~C
+/// if ((x & C) == 0) x ^= C to x |= C
+/// if ((x & C) != 0) x &= ~C to x &= ~C
+/// if ((x & C) == 0) x &= ~C to nothing
+static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal,
Value *FalseVal,
InstCombiner::BuilderTy *Builder) {
const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
@@ -384,6 +414,25 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
return nullptr;
const APInt *C2;
+ if (match(TrueVal, m_Specific(X))) {
+ // if ((X & C) != 0) X ^= C becomes X &= ~C
+ if (match(FalseVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2)
+ return Builder->CreateAnd(X, ~(*C1));
+ // if ((X & C) != 0) X &= ~C becomes X &= ~C
+ if (match(FalseVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2))
+ return FalseVal;
+ } else if (match(FalseVal, m_Specific(X))) {
+ // if ((X & C) == 0) X ^= C becomes X |= C
+ if (match(TrueVal, m_Xor(m_Specific(X), m_APInt(C2))) && C1 == C2)
+ return Builder->CreateOr(X, *C1);
+ // if ((X & C) == 0) X &= ~C becomes nothing
+ if (match(TrueVal, m_And(m_Specific(X), m_APInt(C2))) && *C1 == ~(*C2))
+ return X;
+ // if ((X & C) == 0) X |= C becomes X |= C
+ if (match(TrueVal, m_Or(m_Specific(X), m_APInt(C2))) && C1 == C2)
+ return TrueVal;
+ }
+
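
All the single-bit folds from the doc comment above hold because C is matched
with m_Power2, so exactly one bit is involved. An exhaustive standalone check
over 8-bit values:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned Xi = 0; Xi < 256; ++Xi) {
        uint8_t X = (uint8_t)Xi;
        for (int B = 0; B < 8; ++B) {
          uint8_t C = (uint8_t)(1u << B), NotC = (uint8_t)~C;
          bool BitClear = (X & C) == 0;
          // if ((X & C) == 0) X |= C   is just   X |= C
          assert((uint8_t)(BitClear ? (X | C) : X) == (uint8_t)(X | C));
          // if ((X & C) != 0) X ^= C   clears the bit:  X &= ~C
          assert((uint8_t)(!BitClear ? (X ^ C) : X) == (uint8_t)(X & NotC));
          // if ((X & C) == 0) X ^= C   sets the bit:    X |= C
          assert((uint8_t)(BitClear ? (X ^ C) : X) == (uint8_t)(X | C));
          // if ((X & C) == 0) X &= ~C  is a no-op.
          assert((uint8_t)(BitClear ? (X & NotC) : X) == X);
        }
      }
      return 0;
    }
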
bool OrOnTrueVal = false;
bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
if (!OrOnFalseVal)
@@ -677,6 +726,22 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
}
}
}
+
+ // ABS(ABS(X)) -> ABS(X)
+ // NABS(NABS(X)) -> NABS(X)
+ if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
+ return ReplaceInstUsesWith(Outer, Inner);
+ }
+
+ // ABS(NABS(X)) -> ABS(X)
+ // NABS(ABS(X)) -> NABS(X)
+ if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
+ (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
+ SelectInst *SI = cast<SelectInst>(Inner);
+ Value *NewSI = Builder->CreateSelect(
+ SI->getCondition(), SI->getFalseValue(), SI->getTrueValue());
+ return ReplaceInstUsesWith(Outer, NewSI);
+ }
return nullptr;
}
@@ -981,7 +1046,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// TODO.
// ABS(-X) -> ABS(X)
- // ABS(ABS(X)) -> ABS(X)
}
// See if we can fold the select into a phi node if the condition is a select.
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index cc6665c..2495747 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -789,11 +789,6 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
// have a sign-extend idiom.
Value *X;
if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) {
- // If the left shift is just shifting out partial signbits, delete the
- // extension.
- if (cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap())
- return ReplaceInstUsesWith(I, X);
-
// If the input is an extension from the shifted amount value, e.g.
// %x = zext i8 %A to i32
// %y = shl i32 %x, 24
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 8c5e202..cb16584 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -144,7 +144,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
// If the operand is the PHI induction variable:
if (PHIInVal == PHIUser) {
// Scalarize the binary operation. Its first operand is the
- // scalar PHI and the second operand is extracted from the other
+ // scalar PHI, and the second operand is extracted from the other
// vector operand.
BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
@@ -361,7 +361,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
- // Okay, we can handle this if the vector we are insertinting into is
+ // We can handle this if the vector we are inserting into is
// transitively ok.
if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
// If so, update the mask to reflect the inserted undef.
@@ -376,7 +376,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
// This must be extracting from either LHS or RHS.
if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
- // Okay, we can handle this if the vector we are insertinting into is
+ // We can handle this if the vector we are inserting into is
// transitively ok.
if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
// If so, update the mask to reflect the inserted value.
@@ -403,7 +403,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
/// We are building a shuffle to create V, which is a sequence of insertelement,
/// extractelement pairs. If PermittedRHS is set, then we must either use it or
-/// not rely on the second vector source. Return an std::pair containing the
+/// not rely on the second vector source. Return a std::pair containing the
/// left and right vectors of the proposed shuffle (or 0), and set the Mask
/// parameter as required.
///
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4c36887..08e2446 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -42,6 +42,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
@@ -395,6 +396,127 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
return false;
}
+/// This function returns the identity value for the given opcode, which can be
+/// used to factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
+static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
+ if (isa<Constant>(V))
+ return nullptr;
+
+ if (OpCode == Instruction::Mul)
+ return ConstantInt::get(V->getType(), 1);
+
+ // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc.
+
+ return nullptr;
+}
+
+/// This function extracts the operands of a binary op for factorization under
+/// the distributive laws. It also models SHL as MUL, e.g. SHL(X, 2) ==> MUL(X, 4).
+static Instruction::BinaryOps
+getBinOpsForFactorization(BinaryOperator *Op, Value *&LHS, Value *&RHS) {
+ if (!Op)
+ return Instruction::BinaryOpsEnd;
+
+ if (Op->getOpcode() == Instruction::Shl) {
+ if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
+ // The multiplier is really 1 << CST.
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
+ LHS = Op->getOperand(0);
+ return Instruction::Mul;
+ }
+ }
+
+ // TODO: We can add other conversions e.g. shr => div etc.
+
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ return Op->getOpcode();
+}
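
Treating X << C as X * (1 << C) is what lets shifts participate in the
distributive-law factoring below, e.g. (X << 2) + X becomes X*4 + X*1 and
folds to X * 5. A small standalone check (unsigned to keep the shift well
defined):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X < 1000; ++X)
        assert(((X << 2) + X) == X * 5u);
      return 0;
    }
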
+
+/// This tries to simplify binary operations by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
+ const DataLayout *DL, BinaryOperator &I,
+ Instruction::BinaryOps InnerOpcode, Value *A,
+ Value *B, Value *C, Value *D) {
+
+ // If any of A, B, C, D are null, we cannot factor I; return early.
+ // Checking A and C should be enough.
+ if (!A || !C || !B || !D)
+ return nullptr;
+
+ Value *SimplifiedInst = nullptr;
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ // Does "X op' Y" always equal "Y op' X"?
+ bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
+
+ // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
+ if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+ // commutative case, "(A op' B) op (C op' A)"?
+ if (A == C || (InnerCommutative && A == D)) {
+ if (A != C)
+ std::swap(C, D);
+ // Consider forming "A op' (B op D)".
+ // If "B op D" simplifies then it can be formed with no cost.
+ Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL);
+ // If "B op D" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder->CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
+ if (V) {
+ SimplifiedInst = Builder->CreateBinOp(InnerOpcode, A, V);
+ }
+ }
+
+ // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+ if (!SimplifiedInst && RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+ // commutative case, "(A op' B) op (B op' D)"?
+ if (B == D || (InnerCommutative && B == C)) {
+ if (B != D)
+ std::swap(C, D);
+ // Consider forming "(A op C) op' B".
+ // If "A op C" simplifies then it can be formed with no cost.
+ Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL);
+
+ // If "A op C" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder->CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
+ if (V) {
+ SimplifiedInst = Builder->CreateBinOp(InnerOpcode, V, B);
+ }
+ }
+
+ if (SimplifiedInst) {
+ ++NumFactor;
+ SimplifiedInst->takeName(&I);
+
+ // Check if we can add NSW flag to SimplifiedInst. If so, set NSW flag.
+ // TODO: Check for NUW.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
+ if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
+ bool HasNSW = false;
+ if (isa<OverflowingBinaryOperator>(&I))
+ HasNSW = I.hasNoSignedWrap();
+
+ if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
+ if (isa<OverflowingBinaryOperator>(Op0))
+ HasNSW &= Op0->hasNoSignedWrap();
+
+ if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
+ if (isa<OverflowingBinaryOperator>(Op1))
+ HasNSW &= Op1->hasNoSignedWrap();
+ BO->setHasNoSignedWrap(HasNSW);
+ }
+ }
+ }
+ return SimplifiedInst;
+}
+
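For illustration only (not part of the patch): a minimal scalar check of the factorizations the helpers above enable, combining the SHL-as-MUL view with the identity value for a bare operand.

#include <cassert>

int main() {
  int X = 7;
  // getBinOpsForFactorization views shl(X, 2) as mul(X, 1 << 2).
  assert((X << 2) == X * 4);
  // getIdentityValue views the bare X as mul(X, 1), so
  // (X << 2) + X factors as X*4 + X*1 == X * (4 + 1).
  assert(((X << 2) + X) == X * 5);
  return 0;
}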
/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
/// which some other binary operation distributes over either by factorizing
/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
@@ -404,65 +526,33 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
- Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); // op
// Factorization.
- if (Op0 && Op1 && Op0->getOpcode() == Op1->getOpcode()) {
- // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
- // a common term.
- Value *A = Op0->getOperand(0), *B = Op0->getOperand(1);
- Value *C = Op1->getOperand(0), *D = Op1->getOperand(1);
- Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
+ Instruction::BinaryOps LHSOpcode = getBinOpsForFactorization(Op0, A, B);
+ Instruction::BinaryOps RHSOpcode = getBinOpsForFactorization(Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (LHSOpcode == RHSOpcode) {
+ if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D))
+ return V;
+ }
- // Does "X op' Y" always equal "Y op' X"?
- bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
-
- // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
- if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
- // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
- // commutative case, "(A op' B) op (C op' A)"?
- if (A == C || (InnerCommutative && A == D)) {
- if (A != C)
- std::swap(C, D);
- // Consider forming "A op' (B op D)".
- // If "B op D" simplifies then it can be formed with no cost.
- Value *V = SimplifyBinOp(TopLevelOpcode, B, D, DL);
- // If "B op D" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && Op0->hasOneUse() && Op1->hasOneUse())
- V = Builder->CreateBinOp(TopLevelOpcode, B, D, Op1->getName());
- if (V) {
- ++NumFactor;
- V = Builder->CreateBinOp(InnerOpcode, A, V);
- V->takeName(&I);
- return V;
- }
- }
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS,
+ getIdentityValue(LHSOpcode, RHS)))
+ return V;
- // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
- if (RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
- // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
- // commutative case, "(A op' B) op (B op' D)"?
- if (B == D || (InnerCommutative && B == C)) {
- if (B != D)
- std::swap(C, D);
- // Consider forming "(A op C) op' B".
- // If "A op C" simplifies then it can be formed with no cost.
- Value *V = SimplifyBinOp(TopLevelOpcode, A, C, DL);
- // If "A op C" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && Op0->hasOneUse() && Op1->hasOneUse())
- V = Builder->CreateBinOp(TopLevelOpcode, A, C, Op0->getName());
- if (V) {
- ++NumFactor;
- V = Builder->CreateBinOp(InnerOpcode, V, B);
- V->takeName(&I);
- return V;
- }
- }
- }
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS,
+ getIdentityValue(RHSOpcode, LHS), C, D))
+ return V;
// Expansion.
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
// The instruction has the form "(A op' B) op C". See if expanding it out
// to "(A op C) op' (B op C)" results in simplifications.
@@ -1030,6 +1120,12 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
return nullptr;
}
+ // If Op is zero then Val = Op * Scale.
+ if (match(Op, m_Zero())) {
+ NoSignedWrap = true;
+ return Op;
+ }
+
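A worked note on the early exit above (not from the patch): zero descales to itself for any Scale, since Val = Op * Scale holds with Op == 0, and the multiplication 0 * Scale can never signed-wrap, so NoSignedWrap is set unconditionally. A trivial check:

#include <cassert>

int main() {
  long long Scale = 123456789LL, Op = 0;
  assert(Op * Scale == Op);  // Val == Op * Scale holds, with no wrap
  return 0;
}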
// We know that we can successfully descale, so from here on we can safely
// modify the IR. Op holds the descaled version of the deepest term in the
// expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
@@ -1106,6 +1202,11 @@ static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
if (!Inst.getType()->isVectorTy()) return nullptr;
+ // It may not be safe to reorder shuffles and things like div, urem, etc.
+ // because we may trap when executing those ops on unknown vector elements.
+ // See PR20059.
+ if (!isSafeToSpeculativelyExecute(&Inst, DL)) return nullptr;
+
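An illustrative scalar analogue of the hazard this guard avoids (hypothetical code, not from the patch); with vector ops, unselected shuffle lanes play the role of the untaken branch:

// Original: the division executes only when the guard holds.
int pick(bool use_div, int x, int d) {
  return use_div ? x / d : x;
}
// Speculating "x / d" ahead of the guard would divide even when
// use_div is false and d == 0, trapping -- the same reason shuffles
// must not be reordered past div/urem on unknown elements.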
unsigned VWidth = cast<VectorType>(Inst.getType())->getNumElements();
Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
@@ -1138,7 +1239,9 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
if (isa<ShuffleVectorInst>(RHS)) Shuffle = cast<ShuffleVectorInst>(RHS);
if (isa<Constant>(LHS)) C1 = cast<Constant>(LHS);
if (isa<Constant>(RHS)) C1 = cast<Constant>(RHS);
- if (Shuffle && C1 && isa<UndefValue>(Shuffle->getOperand(1)) &&
+ if (Shuffle && C1 &&
+ (isa<ConstantVector>(C1) || isa<ConstantDataVector>(C1)) &&
+ isa<UndefValue>(Shuffle->getOperand(1)) &&
Shuffle->getType() == Shuffle->getOperand(0)->getType()) {
SmallVector<int, 16> ShMask = Shuffle->getShuffleMask();
// Find constant C2 that has property:
@@ -1220,6 +1323,91 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (MadeChange) return &GEP;
}
+ // Check to see if the inputs to the PHI node are getelementptr instructions.
+ if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
+ GetElementPtrInst *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (!Op1)
+ return nullptr;
+
+    int DI = -1;
+
+ for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
+ GetElementPtrInst *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+ return nullptr;
+
+ // Keep track of the type as we walk the GEP.
+ Type *CurTy = Op1->getOperand(0)->getType()->getScalarType();
+
+ for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+ if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
+ return nullptr;
+
+ if (Op1->getOperand(J) != Op2->getOperand(J)) {
+ if (DI == -1) {
+          // We have not seen any differences in the GEPs feeding the
+          // PHI yet, so we record this one if it is allowed to be a
+          // variable.
+
+          // The first two arguments can vary for any GEP; the rest have to be
+          // static for struct slots.
+ if (J > 1 && CurTy->isStructTy())
+ return nullptr;
+
+ DI = J;
+ } else {
+ // The GEP is different by more than one input. While this could be
+ // extended to support GEPs that vary by more than one variable it
+ // doesn't make sense since it greatly increases the complexity and
+ // would result in an R+R+R addressing mode which no backend
+ // directly supports and would need to be broken into several
+ // simpler instructions anyway.
+ return nullptr;
+ }
+ }
+
+ // Sink down a layer of the type for the next iteration.
+ if (J > 0) {
+ if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) {
+ CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
+ } else {
+ CurTy = nullptr;
+ }
+ }
+ }
+ }
+
+ GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+
+ if (DI == -1) {
+ // All the GEPs feeding the PHI are identical. Clone one down into our
+ // BB so that it can be merged with the current GEP.
+ GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+ NewGEP);
+ } else {
+ // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+ // into the current block so it can be merged, and create a new PHI to
+ // set that index.
+ Instruction *InsertPt = Builder->GetInsertPoint();
+ Builder->SetInsertPoint(PN);
+ PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
+ Builder->SetInsertPoint(InsertPt);
+
+ for (auto &I : PN->operands())
+ NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
+ PN->getIncomingBlock(I));
+
+ NewGEP->setOperand(DI, NewPN);
+ GEP.getParent()->getInstList().insert(GEP.getParent()->getFirstNonPHI(),
+ NewGEP);
+ NewGEP->setOperand(DI, NewPN);
+ }
+
+ GEP.setOperand(0, NewGEP);
+ PtrOp = NewGEP;
+ }
+
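A sketch of the source pattern this new combine targets (illustrative C++, not from the patch): two GEPs that feed a PHI and differ in exactly one index become a single GEP whose varying index is itself a PHI.

struct S { int a[16]; };

int load(S *p, bool c, int i, int j) {
  // Both arms compute &p->a[<index>] and only the last GEP index
  // differs, so the code above emits one GEP fed by a phi of {i, j}.
  int *q = c ? &p->a[i] : &p->a[j];
  return *q;
}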
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
@@ -2014,7 +2202,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// Simplify the list of clauses, e.g. by removing repeated catch clauses
// (these are often created by inlining).
bool MakeNewInstruction = false; // If true, recreate using the following:
- SmallVector<Value *, 16> NewClauses; // - Clauses for the new instruction;
+ SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
@@ -2022,8 +2210,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
bool isLastClause = i + 1 == e;
if (LI.isCatch(i)) {
// A catch clause.
- Value *CatchClause = LI.getClause(i);
- Constant *TypeInfo = cast<Constant>(CatchClause->stripPointerCasts());
+ Constant *CatchClause = LI.getClause(i);
+ Constant *TypeInfo = CatchClause->stripPointerCasts();
// If we already saw this clause, there is no point in having a second
// copy of it.
@@ -2052,7 +2240,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// equal (for example if one represents a C++ class, and the other some
// class derived from it).
assert(LI.isFilter(i) && "Unsupported landingpad clause!");
- Value *FilterClause = LI.getClause(i);
+ Constant *FilterClause = LI.getClause(i);
ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
unsigned NumTypeInfos = FilterType->getNumElements();
@@ -2096,8 +2284,8 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// catch-alls. If so, the filter can be discarded.
bool SawCatchAll = false;
for (unsigned j = 0; j != NumTypeInfos; ++j) {
- Value *Elt = Filter->getOperand(j);
- Constant *TypeInfo = cast<Constant>(Elt->stripPointerCasts());
+ Constant *Elt = Filter->getOperand(j);
+ Constant *TypeInfo = Elt->stripPointerCasts();
if (isCatchAll(Personality, TypeInfo)) {
// This element is a catch-all. Bail out, noting this fact.
SawCatchAll = true;
@@ -2202,7 +2390,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
continue;
// If Filter is a subset of LFilter, i.e. every element of Filter is also
// an element of LFilter, then discard LFilter.
- SmallVectorImpl<Value *>::iterator J = NewClauses.begin() + j;
+ SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
// If Filter is empty then it is a subset of LFilter.
if (!FElts) {
// Discard LFilter.
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 95fca75..5e5ddc1 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -16,6 +16,7 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
@@ -39,15 +40,14 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/system_error.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
#include <algorithm>
#include <string>
+#include <system_error>
using namespace llvm;
@@ -70,7 +70,7 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
static const char *const kAsanModuleCtorName = "asan.module_ctor";
static const char *const kAsanModuleDtorName = "asan.module_dtor";
-static const int kAsanCtorAndCtorPriority = 1;
+static const int kAsanCtorAndDtorPriority = 1;
static const char *const kAsanReportErrorTemplate = "__asan_report_";
static const char *const kAsanReportLoadN = "__asan_report_load_n";
static const char *const kAsanReportStoreN = "__asan_report_store_n";
@@ -79,7 +79,7 @@ static const char *const kAsanUnregisterGlobalsName =
"__asan_unregister_globals";
static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *const kAsanInitName = "__asan_init_v3";
+static const char *const kAsanInitName = "__asan_init_v4";
static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init";
static const char *const kAsanCovName = "__sanitizer_cov";
static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
@@ -128,9 +128,8 @@ static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb",
// This flag may need to be replaced with -f[no]asan-stack.
static cl::opt<bool> ClStack("asan-stack",
cl::desc("Handle stack memory"), cl::Hidden, cl::init(true));
-// This flag may need to be replaced with -f[no]asan-use-after-return.
static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
- cl::desc("Check return-after-free"), cl::Hidden, cl::init(false));
+ cl::desc("Check return-after-free"), cl::Hidden, cl::init(true));
// This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals",
cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
@@ -142,16 +141,13 @@ static cl::opt<int> ClCoverageBlockThreshold("asan-coverage-block-threshold",
"are more than this number of blocks."),
cl::Hidden, cl::init(1500));
static cl::opt<bool> ClInitializers("asan-initialization-order",
- cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
+ cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true));
static cl::opt<bool> ClInvalidPointerPairs("asan-detect-invalid-pointer-pair",
cl::desc("Instrument <, <=, >, >=, - with pointer operands"),
cl::Hidden, cl::init(false));
static cl::opt<unsigned> ClRealignStack("asan-realign-stack",
cl::desc("Realign stack to the value of this flag (power of two)"),
cl::Hidden, cl::init(32));
-static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
- cl::desc("File containing the list of objects to ignore "
- "during instrumentation"), cl::Hidden);
static cl::opt<int> ClInstrumentationWithCallsThreshold(
"asan-instrumentation-with-call-threshold",
cl::desc("If the function being instrumented contains more than "
@@ -216,29 +212,87 @@ STATISTIC(NumOptimizedAccessesToGlobalVar,
"Number of optimized accesses to global vars");
namespace {
-/// A set of dynamically initialized globals extracted from metadata.
-class SetOfDynamicallyInitializedGlobals {
+/// Frontend-provided metadata for global variables.
+class GlobalsMetadata {
public:
- void Init(Module& M) {
- // Clang generates metadata identifying all dynamically initialized globals.
- NamedMDNode *DynamicGlobals =
- M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
- if (!DynamicGlobals)
+ GlobalsMetadata() : inited_(false) {}
+ void init(Module& M) {
+ assert(!inited_);
+ inited_ = true;
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
+ if (!Globals)
return;
- for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
- MDNode *MDN = DynamicGlobals->getOperand(i);
- assert(MDN->getNumOperands() == 1);
- Value *VG = MDN->getOperand(0);
- // The optimizer may optimize away a global entirely, in which case we
- // cannot instrument access to it.
- if (!VG)
+ for (auto MDN : Globals->operands()) {
+ // Format of the metadata node for the global:
+ // {
+ // global,
+ // source_location,
+ // i1 is_dynamically_initialized,
+ // i1 is_blacklisted
+ // }
+ assert(MDN->getNumOperands() == 4);
+ Value *V = MDN->getOperand(0);
+ // The optimizer may optimize away a global entirely.
+ if (!V)
continue;
- DynInitGlobals.insert(cast<GlobalVariable>(VG));
+ GlobalVariable *GV = cast<GlobalVariable>(V);
+ if (Value *Loc = MDN->getOperand(1)) {
+ GlobalVariable *GVLoc = cast<GlobalVariable>(Loc);
+ // We may already know the source location for GV, if it was merged
+ // with another global.
+ if (SourceLocation.insert(std::make_pair(GV, GVLoc)).second)
+ addSourceLocationGlobal(GVLoc);
+ }
+ ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(2));
+ if (IsDynInit->isOne())
+ DynInitGlobals.insert(GV);
+ ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(3));
+ if (IsBlacklisted->isOne())
+ BlacklistedGlobals.insert(GV);
}
}
- bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; }
+
+ GlobalVariable *getSourceLocation(GlobalVariable *G) const {
+ auto Pos = SourceLocation.find(G);
+ return (Pos != SourceLocation.end()) ? Pos->second : nullptr;
+ }
+
+ /// Check if the global is dynamically initialized.
+ bool isDynInit(GlobalVariable *G) const {
+ return DynInitGlobals.count(G);
+ }
+
+ /// Check if the global was blacklisted.
+ bool isBlacklisted(GlobalVariable *G) const {
+ return BlacklistedGlobals.count(G);
+ }
+
+  /// Check if the global was generated to describe the source location of
+  /// another global (we don't want to instrument them).
+ bool isSourceLocationGlobal(GlobalVariable *G) const {
+ return LocationGlobals.count(G);
+ }
+
private:
- SmallSet<GlobalValue*, 32> DynInitGlobals;
+ bool inited_;
+ DenseMap<GlobalVariable*, GlobalVariable*> SourceLocation;
+ DenseSet<GlobalVariable*> DynInitGlobals;
+ DenseSet<GlobalVariable*> BlacklistedGlobals;
+ DenseSet<GlobalVariable*> LocationGlobals;
+
+ void addSourceLocationGlobal(GlobalVariable *SourceLocGV) {
+    // A source location global is a struct with the following layout:
+ // {
+ // filename,
+ // i32 line_number,
+ // i32 column_number,
+ // }
+ LocationGlobals.insert(SourceLocGV);
+ ConstantStruct *Contents =
+ cast<ConstantStruct>(SourceLocGV->getInitializer());
+ GlobalVariable *FilenameGV = cast<GlobalVariable>(Contents->getOperand(0));
+ LocationGlobals.insert(FilenameGV);
+ }
};
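A hedged sketch of how a frontend might emit one entry in the four-operand llvm.asan.globals format parsed above (the helper name is hypothetical; the calls are the 3.5-era Value-based metadata API):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static void addAsanGlobalEntry(Module &M, GlobalVariable *GV,
                               GlobalVariable *SourceLocGV, bool IsDynInit,
                               bool IsBlacklisted) {
  LLVMContext &C = M.getContext();
  Value *Ops[] = {GV, SourceLocGV,
                  ConstantInt::get(Type::getInt1Ty(C), IsDynInit),
                  ConstantInt::get(Type::getInt1Ty(C), IsBlacklisted)};
  M.getOrInsertNamedMetadata("llvm.asan.globals")
      ->addOperand(MDNode::get(C, Ops));
}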
/// This struct defines the shadow mapping using the rule:
@@ -306,16 +360,7 @@ static size_t RedzoneSizeForScale(int MappingScale) {
/// AddressSanitizer: instrument the code in module to find memory bugs.
struct AddressSanitizer : public FunctionPass {
- AddressSanitizer(bool CheckInitOrder = true,
- bool CheckUseAfterReturn = false,
- bool CheckLifetime = false,
- StringRef BlacklistFile = StringRef())
- : FunctionPass(ID),
- CheckInitOrder(CheckInitOrder || ClInitializers),
- CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn),
- CheckLifetime(CheckLifetime || ClCheckLifetime),
- BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
- : BlacklistFile) {}
+ AddressSanitizer() : FunctionPass(ID) {}
const char *getPassName() const override {
return "AddressSanitizerFunctionPass";
}
@@ -344,11 +389,6 @@ struct AddressSanitizer : public FunctionPass {
bool InjectCoverage(Function &F, const ArrayRef<BasicBlock*> AllBlocks);
void InjectCoverageAtBlock(Function &F, BasicBlock &BB);
- bool CheckInitOrder;
- bool CheckUseAfterReturn;
- bool CheckLifetime;
- SmallString<64> BlacklistFile;
-
LLVMContext *C;
const DataLayout *DL;
int LongSize;
@@ -359,7 +399,6 @@ struct AddressSanitizer : public FunctionPass {
Function *AsanHandleNoReturnFunc;
Function *AsanCovFunction;
Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
- std::unique_ptr<SpecialCaseList> BL;
// This array is indexed by AccessIsWrite and log2(AccessSize).
Function *AsanErrorCallback[2][kNumberOfAccessSizes];
Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes];
@@ -368,19 +407,14 @@ struct AddressSanitizer : public FunctionPass {
*AsanMemoryAccessCallbackSized[2];
Function *AsanMemmove, *AsanMemcpy, *AsanMemset;
InlineAsm *EmptyAsm;
- SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+ GlobalsMetadata GlobalsMD;
friend struct FunctionStackPoisoner;
};
class AddressSanitizerModule : public ModulePass {
public:
- AddressSanitizerModule(bool CheckInitOrder = true,
- StringRef BlacklistFile = StringRef())
- : ModulePass(ID),
- CheckInitOrder(CheckInitOrder || ClInitializers),
- BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
- : BlacklistFile) {}
+ AddressSanitizerModule() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
const char *getPassName() const override {
@@ -390,17 +424,15 @@ class AddressSanitizerModule : public ModulePass {
private:
void initializeCallbacks(Module &M);
+ bool InstrumentGlobals(IRBuilder<> &IRB, Module &M);
bool ShouldInstrumentGlobal(GlobalVariable *G);
+ void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
size_t MinRedzoneSizeForGlobal() const {
return RedzoneSizeForScale(Mapping.Scale);
}
- bool CheckInitOrder;
- SmallString<64> BlacklistFile;
-
- std::unique_ptr<SpecialCaseList> BL;
- SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+ GlobalsMetadata GlobalsMD;
Type *IntptrTy;
LLVMContext *C;
const DataLayout *DL;
@@ -497,7 +529,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
/// \brief Collect lifetime intrinsic calls to check for use-after-scope
/// errors.
void visitIntrinsicInst(IntrinsicInst &II) {
- if (!ASan.CheckLifetime) return;
+ if (!ClCheckLifetime) return;
Intrinsic::ID ID = II.getIntrinsicID();
if (ID != Intrinsic::lifetime_start &&
ID != Intrinsic::lifetime_end)
@@ -552,20 +584,16 @@ char AddressSanitizer::ID = 0;
INITIALIZE_PASS(AddressSanitizer, "asan",
"AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
false, false)
-FunctionPass *llvm::createAddressSanitizerFunctionPass(
- bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime,
- StringRef BlacklistFile) {
- return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn,
- CheckLifetime, BlacklistFile);
+FunctionPass *llvm::createAddressSanitizerFunctionPass() {
+ return new AddressSanitizer();
}
char AddressSanitizerModule::ID = 0;
INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
"AddressSanitizer: detects use-after-free and out-of-bounds bugs."
"ModulePass", false, false)
-ModulePass *llvm::createAddressSanitizerModulePass(
- bool CheckInitOrder, StringRef BlacklistFile) {
- return new AddressSanitizerModule(CheckInitOrder, BlacklistFile);
+ModulePass *llvm::createAddressSanitizerModulePass() {
+ return new AddressSanitizerModule();
}
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -682,7 +710,7 @@ bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
// If a global variable does not have dynamic initialization we don't
// have to instrument it. However, if a global does not have initializer
// at all, we assume it has dynamic initializer (in other TU).
- return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G);
+ return G->hasInitializer() && !GlobalsMD.isDynInit(G);
}
void
@@ -706,7 +734,7 @@ void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) {
if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
// If initialization order checking is disabled, a simple access to a
// dynamically initialized global is always valid.
- if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) {
+ if (!ClInitializers || GlobalIsLinkerInitialized(G)) {
NumOptimizedAccessesToGlobalVar++;
return;
}
@@ -851,48 +879,36 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Crash->setDebugLoc(OrigIns->getDebugLoc());
}
-void AddressSanitizerModule::createInitializerPoisonCalls(
- Module &M, GlobalValue *ModuleName) {
- // We do all of our poisoning and unpoisoning within a global constructor.
- // These are called _GLOBAL__(sub_)?I_.*.
- // TODO: Consider looking through the functions in
- // M.getGlobalVariable("llvm.global_ctors") instead of using this stringly
- // typed approach.
- Function *GlobalInit = nullptr;
- for (auto &F : M.getFunctionList()) {
- StringRef FName = F.getName();
-
- const char kGlobalPrefix[] = "_GLOBAL__";
- if (!FName.startswith(kGlobalPrefix))
- continue;
- FName = FName.substr(strlen(kGlobalPrefix));
-
- const char kOptionalSub[] = "sub_";
- if (FName.startswith(kOptionalSub))
- FName = FName.substr(strlen(kOptionalSub));
-
- if (FName.startswith("I_")) {
- GlobalInit = &F;
- break;
- }
- }
- // If that function is not present, this TU contains no globals, or they have
- // all been optimized away
- if (!GlobalInit)
- return;
-
+void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
+ GlobalValue *ModuleName) {
// Set up the arguments to our poison/unpoison functions.
- IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
+ IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt());
// Add a call to poison all external globals before the given function starts.
Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
// Add calls to unpoison all globals before each return instruction.
- for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
- I != E; ++I) {
- if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) {
+ for (auto &BB : GlobalInit.getBasicBlockList())
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void AddressSanitizerModule::createInitializerPoisonCalls(
+ Module &M, GlobalValue *ModuleName) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+ for (Use &OP : CA->operands()) {
+ if (isa<ConstantAggregateZero>(OP))
+ continue;
+ ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+ // Must have a function or null ptr.
+ // (CS->getOperand(0) is the init priority.)
+ if (Function* F = dyn_cast<Function>(CS->getOperand(1))) {
+ if (F->getName() != kAsanModuleCtorName)
+ poisonOneInitializer(*F, ModuleName);
}
}
}
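For orientation (a hedged note, not from the patch): each operand of @llvm.global_ctors walked above is a constant struct whose first field is the i32 priority and whose second is the constructor, which is why the loop inspects CS->getOperand(1). A rough C++ analogue of one entry:

#include <cstdint>

struct CtorEntry {
  int32_t Priority;  // CS->getOperand(0) above
  void (*Ctor)();    // CS->getOperand(1); may not be a Function, hence dyn_cast
};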
@@ -901,16 +917,20 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
Type *Ty = cast<PointerType>(G->getType())->getElementType();
DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
- if (BL->isIn(*G)) return false;
+ if (GlobalsMD.isBlacklisted(G)) return false;
+ if (GlobalsMD.isSourceLocationGlobal(G)) return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
if (GlobalWasGeneratedByAsan(G)) return false; // Our own global.
// Touch only those globals that will not be defined in other modules.
- // Don't handle ODR type linkages since other modules may be built w/o asan.
+ // Don't handle ODR linkage types and COMDATs since other modules may be built
+ // without ASan.
if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
G->getLinkage() != GlobalVariable::PrivateLinkage &&
G->getLinkage() != GlobalVariable::InternalLinkage)
return false;
+ if (G->hasComdat())
+ return false;
// Two problems with thread-locals:
// - The address of the main thread's copy can't be computed at link-time.
// - Need to poison all copies, not just the main thread's one.
@@ -1001,39 +1021,16 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
// This function replaces all global variables with new variables that have
// trailing redzones. It also creates a function that poisons
// redzones and inserts this function into llvm.global_ctors.
-bool AddressSanitizerModule::runOnModule(Module &M) {
- if (!ClGlobals) return false;
-
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- return false;
- DL = &DLP->getDataLayout();
-
- BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
- if (BL->isIn(M)) return false;
- C = &(M.getContext());
- int LongSize = DL->getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- Mapping = getShadowMapping(M, LongSize);
- initializeCallbacks(M);
- DynamicallyInitializedGlobals.Init(M);
+bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
+ GlobalsMD.init(M);
SmallVector<GlobalVariable *, 16> GlobalsToChange;
- for (Module::GlobalListType::iterator G = M.global_begin(),
- E = M.global_end(); G != E; ++G) {
- if (ShouldInstrumentGlobal(G))
- GlobalsToChange.push_back(G);
+ for (auto &G : M.globals()) {
+ if (ShouldInstrumentGlobal(&G))
+ GlobalsToChange.push_back(&G);
}
- Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
- assert(CtorFunc);
- IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
-
- Function *CovFunc = M.getFunction(kAsanCovName);
- int nCov = CovFunc ? CovFunc->getNumUses() : 0;
- IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov));
-
size_t n = GlobalsToChange.size();
if (n == 0) return false;
@@ -1044,10 +1041,11 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
// const char *name;
// const char *module_name;
// size_t has_dynamic_init;
+ // void *source_location;
// We initialize an array of such structures and pass it to a run-time call.
- StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
- IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, NULL);
+ StructType *GlobalStructTy =
+ StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+ IntptrTy, IntptrTy, NULL);
SmallVector<Constant *, 16> Initializers(n);
bool HasDynamicallyInitializedGlobals = false;
@@ -1075,11 +1073,6 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- // Determine whether this global should be poisoned in initialization.
- bool GlobalHasDynamicInitializer =
- DynamicallyInitializedGlobals.Contains(G);
- // Don't check initialization order if this global is blacklisted.
- GlobalHasDynamicInitializer &= !BL->isIn(*G, "init");
StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
Constant *NewInitializer = ConstantStruct::get(
@@ -1108,18 +1101,21 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
NewGlobal->takeName(G);
G->eraseFromParent();
+ bool GlobalHasDynamicInitializer = GlobalsMD.isDynInit(G);
+ GlobalVariable *SourceLoc = GlobalsMD.getSourceLocation(G);
+
Initializers[i] = ConstantStruct::get(
- GlobalStructTy,
- ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
+ GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
ConstantInt::get(IntptrTy, SizeInBytes),
ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
+ SourceLoc ? ConstantExpr::getPointerCast(SourceLoc, IntptrTy)
+ : ConstantInt::get(IntptrTy, 0),
NULL);
- // Populate the first and last globals declared in this TU.
- if (CheckInitOrder && GlobalHasDynamicInitializer)
+ if (ClInitializers && GlobalHasDynamicInitializer)
HasDynamicallyInitializedGlobals = true;
DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
@@ -1131,7 +1127,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
// Create calls for poisoning before initializers run and unpoisoning after.
- if (CheckInitOrder && HasDynamicallyInitializedGlobals)
+ if (HasDynamicallyInitializedGlobals)
createInitializerPoisonCalls(M, ModuleName);
IRB.CreateCall2(AsanRegisterGlobals,
IRB.CreatePointerCast(AllGlobals, IntptrTy),
@@ -1147,12 +1143,42 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
IRB.CreatePointerCast(AllGlobals, IntptrTy),
ConstantInt::get(IntptrTy, n));
- appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndCtorPriority);
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
DEBUG(dbgs() << M);
return true;
}
+bool AddressSanitizerModule::runOnModule(Module &M) {
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ if (!DLP)
+ return false;
+ DL = &DLP->getDataLayout();
+ C = &(M.getContext());
+ int LongSize = DL->getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ Mapping = getShadowMapping(M, LongSize);
+ initializeCallbacks(M);
+
+ bool Changed = false;
+
+ Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
+ assert(CtorFunc);
+ IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+
+ if (ClCoverage > 0) {
+ Function *CovFunc = M.getFunction(kAsanCovName);
+ int nCov = CovFunc ? CovFunc->getNumUses() : 0;
+ IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov));
+ Changed = true;
+ }
+
+ if (ClGlobals)
+ Changed |= InstrumentGlobals(IRB, M);
+
+ return Changed;
+}
+
void AddressSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
// Create __asan_report* callbacks.
@@ -1216,8 +1242,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
report_fatal_error("data layout missing");
DL = &DLP->getDataLayout();
- BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
- DynamicallyInitializedGlobals.Init(M);
+ GlobalsMD.init(M);
C = &(M.getContext());
LongSize = DL->getPointerSizeInBits();
@@ -1236,7 +1261,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
Mapping = getShadowMapping(M, LongSize);
- appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
return true;
}
@@ -1267,7 +1292,9 @@ void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) {
break;
}
+ DebugLoc EntryLoc = IP->getDebugLoc().getFnDebugLoc(*C);
IRBuilder<> IRB(IP);
+ IRB.SetCurrentDebugLocation(EntryLoc);
Type *Int8Ty = IRB.getInt8Ty();
GlobalVariable *Guard = new GlobalVariable(
*F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
@@ -1279,10 +1306,10 @@ void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) {
Instruction *Ins = SplitBlockAndInsertIfThen(
Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
IRB.SetInsertPoint(Ins);
+ IRB.SetCurrentDebugLocation(EntryLoc);
// We pass &F to __sanitizer_cov. We could avoid this and rely on
// GET_CALLER_PC, but having the PC of the first instruction is just nice.
- Instruction *Call = IRB.CreateCall(AsanCovFunction);
- Call->setDebugLoc(IP->getDebugLoc());
+ IRB.CreateCall(AsanCovFunction);
StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
Store->setAtomic(Monotonic);
Store->setAlignment(1);
@@ -1316,14 +1343,13 @@ bool AddressSanitizer::InjectCoverage(Function &F,
(unsigned)ClCoverageBlockThreshold < AllBlocks.size()) {
InjectCoverageAtBlock(F, F.getEntryBlock());
} else {
- for (size_t i = 0, n = AllBlocks.size(); i < n; i++)
- InjectCoverageAtBlock(F, *AllBlocks[i]);
+ for (auto BB : AllBlocks)
+ InjectCoverageAtBlock(F, *BB);
}
return true;
}
bool AddressSanitizer::runOnFunction(Function &F) {
- if (BL->isIn(F)) return false;
if (&F == AsanCtorFunction) return false;
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
@@ -1350,29 +1376,28 @@ bool AddressSanitizer::runOnFunction(Function &F) {
unsigned Alignment;
// Fill the set of memory operations to instrument.
- for (Function::iterator FI = F.begin(), FE = F.end();
- FI != FE; ++FI) {
- AllBlocks.push_back(FI);
+ for (auto &BB : F) {
+ AllBlocks.push_back(&BB);
TempsToInstrument.clear();
int NumInsnsPerBB = 0;
- for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
- BI != BE; ++BI) {
- if (LooksLikeCodeInBug11395(BI)) return false;
- if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite, &Alignment)) {
+ for (auto &Inst : BB) {
+ if (LooksLikeCodeInBug11395(&Inst)) return false;
+ if (Value *Addr =
+ isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) {
if (ClOpt && ClOptSameTemp) {
if (!TempsToInstrument.insert(Addr))
continue; // We've seen this temp in the current BB.
}
} else if (ClInvalidPointerPairs &&
- isInterestingPointerComparisonOrSubtraction(BI)) {
- PointerComparisonsOrSubtracts.push_back(BI);
+ isInterestingPointerComparisonOrSubtraction(&Inst)) {
+ PointerComparisonsOrSubtracts.push_back(&Inst);
continue;
- } else if (isa<MemIntrinsic>(BI)) {
+ } else if (isa<MemIntrinsic>(Inst)) {
// ok, take it.
} else {
- if (isa<AllocaInst>(BI))
+ if (isa<AllocaInst>(Inst))
NumAllocas++;
- CallSite CS(BI);
+ CallSite CS(&Inst);
if (CS) {
// A call inside BB.
TempsToInstrument.clear();
@@ -1381,7 +1406,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
}
continue;
}
- ToInstrument.push_back(BI);
+ ToInstrument.push_back(&Inst);
NumInsnsPerBB++;
if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB)
break;
@@ -1406,8 +1431,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// Instrument.
int NumInstrumented = 0;
- for (size_t i = 0, n = ToInstrument.size(); i != n; i++) {
- Instruction *Inst = ToInstrument[i];
+ for (auto Inst : ToInstrument) {
if (ClDebugMin < 0 || ClDebugMax < 0 ||
(NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment))
@@ -1423,14 +1447,13 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// We must unpoison the stack before every NoReturn call (throw, _exit, etc).
// See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
- for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) {
- Instruction *CI = NoReturnCalls[i];
+ for (auto CI : NoReturnCalls) {
IRBuilder<> IRB(CI);
IRB.CreateCall(AsanHandleNoReturnFunc);
}
- for (size_t i = 0, n = PointerComparisonsOrSubtracts.size(); i != n; i++) {
- instrumentPointerComparisonOrSubtraction(PointerComparisonsOrSubtracts[i]);
+ for (auto Inst : PointerComparisonsOrSubtracts) {
+ instrumentPointerComparisonOrSubtraction(Inst);
NumInstrumented++;
}
@@ -1543,12 +1566,10 @@ void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
}
static DebugLoc getFunctionEntryDebugLocation(Function &F) {
- BasicBlock::iterator I = F.getEntryBlock().begin(),
- E = F.getEntryBlock().end();
- for (; I != E; ++I)
- if (!isa<AllocaInst>(I))
- break;
- return I->getDebugLoc();
+ for (const auto &Inst : F.getEntryBlock())
+ if (!isa<AllocaInst>(Inst))
+ return Inst.getDebugLoc();
+ return DebugLoc();
}
void FunctionStackPoisoner::poisonStack() {
@@ -1562,8 +1583,7 @@ void FunctionStackPoisoner::poisonStack() {
SmallVector<ASanStackVariableDescription, 16> SVD;
SVD.reserve(AllocaVec.size());
- for (size_t i = 0, n = AllocaVec.size(); i < n; i++) {
- AllocaInst *AI = AllocaVec[i];
+ for (AllocaInst *AI : AllocaVec) {
ASanStackVariableDescription D = { AI->getName().data(),
getAllocaSizeInBytes(AI),
AI->getAlignment(), AI, 0};
@@ -1577,7 +1597,7 @@ void FunctionStackPoisoner::poisonStack() {
DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n");
uint64_t LocalStackSize = L.FrameSize;
bool DoStackMalloc =
- ASan.CheckUseAfterReturn && LocalStackSize <= kMaxStackMallocSize;
+ ClUseAfterReturn && LocalStackSize <= kMaxStackMallocSize;
Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize);
AllocaInst *MyAlloca =
@@ -1618,8 +1638,7 @@ void FunctionStackPoisoner::poisonStack() {
// Insert poison calls for lifetime intrinsics for alloca.
bool HavePoisonedAllocas = false;
- for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
- const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
+ for (const auto &APC : AllocaPoisonCallVec) {
assert(APC.InsBefore);
assert(APC.AI);
IRBuilder<> IRB(APC.InsBefore);
@@ -1628,11 +1647,10 @@ void FunctionStackPoisoner::poisonStack() {
}
// Replace Alloca instructions with base+offset.
- for (size_t i = 0, n = SVD.size(); i < n; i++) {
- AllocaInst *AI = SVD[i].AI;
+ for (const auto &Desc : SVD) {
+ AllocaInst *AI = Desc.AI;
Value *NewAllocaPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, SVD[i].Offset)),
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
AI->getType());
replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB);
AI->replaceAllUsesWith(NewAllocaPtr);
@@ -1665,8 +1683,7 @@ void FunctionStackPoisoner::poisonStack() {
poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true);
// (Un)poison the stack before all ret instructions.
- for (size_t i = 0, n = RetVec.size(); i < n; i++) {
- Instruction *Ret = RetVec[i];
+ for (auto Ret : RetVec) {
IRBuilder<> IRBRet(Ret);
// Mark the current frame as retired.
IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
@@ -1720,8 +1737,8 @@ void FunctionStackPoisoner::poisonStack() {
}
// We are done. Remove the old unused alloca instructions.
- for (size_t i = 0, n = AllocaVec.size(); i < n; i++)
- AllocaVec[i]->eraseFromParent();
+ for (auto AI : AllocaVec)
+ AI->eraseFromParent();
}
void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 7f468f7..799e14b 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -59,9 +59,9 @@
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
#include <iterator>
using namespace llvm;
@@ -120,6 +120,51 @@ static cl::opt<bool> ClDebugNonzeroLabels(
namespace {
+StringRef GetGlobalTypeString(const GlobalValue &G) {
+ // Types of GlobalVariables are always pointer types.
+ Type *GType = G.getType()->getElementType();
+ // For now we support blacklisting struct types only.
+ if (StructType *SGType = dyn_cast<StructType>(GType)) {
+ if (!SGType->isLiteral())
+ return SGType->getName();
+ }
+ return "<unknown type>";
+}
+
+class DFSanABIList {
+ std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+ DFSanABIList(SpecialCaseList *SCL) : SCL(SCL) {}
+
+ /// Returns whether either this function or its source file are listed in the
+ /// given category.
+ bool isIn(const Function &F, const StringRef Category) const {
+ return isIn(*F.getParent(), Category) ||
+ SCL->inSection("fun", F.getName(), Category);
+ }
+
+ /// Returns whether this global alias is listed in the given category.
+ ///
+ /// If GA aliases a function, the alias's name is matched as a function name
+ /// would be. Similarly, aliases of globals are matched like globals.
+ bool isIn(const GlobalAlias &GA, const StringRef Category) const {
+ if (isIn(*GA.getParent(), Category))
+ return true;
+
+ if (isa<FunctionType>(GA.getType()->getElementType()))
+ return SCL->inSection("fun", GA.getName(), Category);
+
+ return SCL->inSection("global", GA.getName(), Category) ||
+ SCL->inSection("type", GetGlobalTypeString(GA), Category);
+ }
+
+ /// Returns whether this module is listed in the given category.
+ bool isIn(const Module &M, const StringRef Category) const {
+ return SCL->inSection("src", M.getModuleIdentifier(), Category);
+ }
+};
+
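An illustrative ABI list in the SpecialCaseList text format these queries consume (all entries hypothetical; the section names mirror the "fun", "global", "type", and "src" lookups above):

fun:main=uninstrumented
fun:memcpy=functional
fun:getenv=discard
fun:sprintf=custom
global:g_flag=uninstrumented
type:MyStruct=uninstrumented
src:third_party/*=skip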
class DataFlowSanitizer : public ModulePass {
friend struct DFSanFunction;
friend class DFSanVisitor;
@@ -190,7 +235,7 @@ class DataFlowSanitizer : public ModulePass {
Constant *DFSanSetLabelFn;
Constant *DFSanNonzeroLabelFn;
MDNode *ColdCallWeights;
- std::unique_ptr<SpecialCaseList> ABIList;
+ DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
AttributeSet ReadOnlyNoneAttrs;
@@ -395,11 +440,11 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
}
bool DataFlowSanitizer::isInstrumented(const Function *F) {
- return !ABIList->isIn(*F, "uninstrumented");
+ return !ABIList.isIn(*F, "uninstrumented");
}
bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
- return !ABIList->isIn(*GA, "uninstrumented");
+ return !ABIList.isIn(*GA, "uninstrumented");
}
DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
@@ -407,11 +452,11 @@ DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
}
DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
- if (ABIList->isIn(*F, "functional"))
+ if (ABIList.isIn(*F, "functional"))
return WK_Functional;
- if (ABIList->isIn(*F, "discard"))
+ if (ABIList.isIn(*F, "discard"))
return WK_Discard;
- if (ABIList->isIn(*F, "custom"))
+ if (ABIList.isIn(*F, "custom"))
return WK_Custom;
return WK_Warning;
@@ -500,7 +545,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
if (!DL)
return false;
- if (ABIList->isIn(M, "skip"))
+ if (ABIList.isIn(M, "skip"))
return false;
if (!GetArgTLSPtr) {
@@ -557,7 +602,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
++i;
// Don't stop on weak. We assume people aren't playing games with the
// instrumentedness of overridden weak aliases.
- if (Function *F = dyn_cast<Function>(GA->getAliasee())) {
+ if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
if (GAInst && FInst) {
addGlobalNamePrefix(GA);
@@ -567,7 +612,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
// below will take care of instrumenting it.
Function *NewF =
buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
- GA->replaceAllUsesWith(NewF);
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
NewF->takeName(GA);
GA->eraseFromParent();
FnsToInstrument.push_back(NewF);
diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
index 18bda1a..f2f1738 100644
--- a/lib/Transforms/Instrumentation/DebugIR.cpp
+++ b/lib/Transforms/Instrumentation/DebugIR.cpp
@@ -354,7 +354,10 @@ private:
std::string getTypeName(Type *T) {
std::string TypeName;
raw_string_ostream TypeStream(TypeName);
- T->print(TypeStream);
+ if (T)
+ T->print(TypeStream);
+ else
+ TypeStream << "Printing <null> Type";
TypeStream.flush();
return TypeName;
}
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 8330a9b..cfeb62e 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -211,6 +211,7 @@ namespace {
class GCOVLines : public GCOVRecord {
public:
void addLine(uint32_t Line) {
+ assert(Line != 0 && "Line zero is not a valid real line number.");
Lines.push_back(Line);
}
@@ -453,10 +454,17 @@ static bool functionHasLines(Function *F) {
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
for (BasicBlock::iterator I = BB->begin(), IE = BB->end();
I != IE; ++I) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
const DebugLoc &Loc = I->getDebugLoc();
if (Loc.isUnknown()) continue;
- if (Loc.getLine() != 0)
- return true;
+
+      // Skip artificial lines, such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+
+ return true;
}
}
return false;
@@ -515,8 +523,16 @@ void GCOVProfiler::emitProfileNotes() {
uint32_t Line = 0;
for (BasicBlock::iterator I = BB->begin(), IE = BB->end();
I != IE; ++I) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
const DebugLoc &Loc = I->getDebugLoc();
if (Loc.isUnknown()) continue;
+
+      // Skip artificial lines, such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+
if (Line == Loc.getLine()) continue;
Line = Loc.getLine();
if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue;
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b8e632e..496ab48 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -10,8 +10,6 @@
/// This file is a part of MemorySanitizer, a detector of uninitialized
/// reads.
///
-/// Status: early prototype.
-///
/// The algorithm of the tool is similar to Memcheck
/// (http://goo.gl/QKbem). We associate a few shadow bits with every
/// byte of the application memory, poison the shadow of the malloc-ed
@@ -117,7 +115,6 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
using namespace llvm;
@@ -178,10 +175,6 @@ static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
cl::desc("print out instructions with default strict semantics"),
cl::Hidden, cl::init(false));
-static cl::opt<std::string> ClBlacklistFile("msan-blacklist",
- cl::desc("File containing the list of functions where MemorySanitizer "
- "should not report bugs"), cl::Hidden);
-
static cl::opt<int> ClInstrumentationWithCallThreshold(
"msan-instrumentation-with-call-threshold",
cl::desc(
@@ -211,13 +204,11 @@ namespace {
/// uninitialized reads.
class MemorySanitizer : public FunctionPass {
public:
- MemorySanitizer(int TrackOrigins = 0,
- StringRef BlacklistFile = StringRef())
+ MemorySanitizer(int TrackOrigins = 0)
: FunctionPass(ID),
TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
DL(nullptr),
WarningFn(nullptr),
- BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile),
WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
const char *getPassName() const override { return "MemorySanitizer"; }
bool runOnFunction(Function &F) override;
@@ -282,10 +273,6 @@ class MemorySanitizer : public FunctionPass {
MDNode *ColdCallWeights;
/// \brief Branch weights for origin store.
MDNode *OriginStoreWeights;
- /// \brief Path to blacklist file.
- SmallString<64> BlacklistFile;
- /// \brief The blacklist.
- std::unique_ptr<SpecialCaseList> BL;
/// \brief An empty volatile inline asm that prevents callback merge.
InlineAsm *EmptyAsm;
@@ -305,9 +292,8 @@ INITIALIZE_PASS(MemorySanitizer, "msan",
"MemorySanitizer: detects uninitialized reads.",
false, false)
-FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins,
- StringRef BlacklistFile) {
- return new MemorySanitizer(TrackOrigins, BlacklistFile);
+FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) {
+ return new MemorySanitizer(TrackOrigins);
}
/// \brief Create a non-const global initialized with the given string.
@@ -431,7 +417,6 @@ bool MemorySanitizer::doInitialization(Module &M) {
report_fatal_error("data layout missing");
DL = &DLP->getDataLayout();
- BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
C = &(M.getContext());
unsigned PtrSize = DL->getPointerSizeInBits(/* AddressSpace */0);
switch (PtrSize) {
@@ -526,7 +511,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// The following flags disable parts of MSan instrumentation based on
// blacklist contents and command-line options.
bool InsertChecks;
- bool LoadShadow;
+ bool PropagateShadow;
bool PoisonStack;
bool PoisonUndef;
bool CheckReturnValue;
@@ -544,11 +529,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
: F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
- bool SanitizeFunction = !MS.BL->isIn(F) && F.getAttributes().hasAttribute(
- AttributeSet::FunctionIndex,
- Attribute::SanitizeMemory);
+ bool SanitizeFunction = F.getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::SanitizeMemory);
InsertChecks = SanitizeFunction;
- LoadShadow = SanitizeFunction;
+ PropagateShadow = SanitizeFunction;
PoisonStack = SanitizeFunction && ClPoisonStack;
PoisonUndef = SanitizeFunction && ClPoisonUndef;
// FIXME: Consider using SpecialCaseList to specify a list of functions that
@@ -585,7 +569,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
IRB.CreateCall3(Fn, ConvertedShadow2,
IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
- updateOrigin(Origin, IRB));
+ Origin);
} else {
Value *Cmp = IRB.CreateICmpNE(
ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp");
@@ -599,26 +583,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void materializeStores(bool InstrumentWithCalls) {
- for (size_t i = 0, n = StoreList.size(); i < n; i++) {
- StoreInst &I = *dyn_cast<StoreInst>(StoreList[i]);
+ for (auto Inst : StoreList) {
+ StoreInst &SI = *dyn_cast<StoreInst>(Inst);
- IRBuilder<> IRB(&I);
- Value *Val = I.getValueOperand();
- Value *Addr = I.getPointerOperand();
- Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
+ IRBuilder<> IRB(&SI);
+ Value *Val = SI.getValueOperand();
+ Value *Addr = SI.getPointerOperand();
+ Value *Shadow = SI.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
StoreInst *NewSI =
- IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment());
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, SI.getAlignment());
DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
(void)NewSI;
- if (ClCheckAccessAddress) insertShadowCheck(Addr, &I);
+ if (ClCheckAccessAddress) insertShadowCheck(Addr, &SI);
- if (I.isAtomic()) I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering()));
if (MS.TrackOrigins) {
- unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
+ unsigned Alignment = std::max(kMinOriginAlignment, SI.getAlignment());
storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment,
InstrumentWithCalls);
}
@@ -662,18 +646,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void materializeChecks(bool InstrumentWithCalls) {
- for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
- Instruction *OrigIns = InstrumentationList[i].OrigIns;
- Value *Shadow = InstrumentationList[i].Shadow;
- Value *Origin = InstrumentationList[i].Origin;
+ for (const auto &ShadowData : InstrumentationList) {
+ Instruction *OrigIns = ShadowData.OrigIns;
+ Value *Shadow = ShadowData.Shadow;
+ Value *Origin = ShadowData.Origin;
materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
}
DEBUG(dbgs() << "DONE:\n" << F);
}
void materializeIndirectCalls() {
- for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) {
- CallSite CS = IndirectCallList[i];
+ for (auto &CS : IndirectCallList) {
Instruction *I = CS.getInstruction();
BasicBlock *B = I->getParent();
IRBuilder<> IRB(I);
@@ -732,15 +715,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Finalize PHI nodes.
- for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) {
- PHINode *PN = ShadowPHINodes[i];
+ for (PHINode *PN : ShadowPHINodes) {
PHINode *PNS = cast<PHINode>(getShadow(PN));
PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
size_t NumValues = PN->getNumIncomingValues();
for (size_t v = 0; v < NumValues; v++) {
PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
- if (PNO)
- PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+ if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
}
}
@@ -874,7 +855,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// \brief Set SV to be the shadow value for V.
void setShadow(Value *V, Value *SV) {
assert(!ShadowMap.count(V) && "Values may only have one shadow");
- ShadowMap[V] = SV;
+ ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
}
/// \brief Set Origin to be the origin value for V.
@@ -926,6 +907,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// This function either returns the value set earlier with setShadow,
/// or extracts it from ParamTLS (for function arguments).
Value *getShadow(Value *V) {
+ if (!PropagateShadow) return getCleanShadow(V);
if (Instruction *I = dyn_cast<Instruction>(V)) {
// For instructions the shadow is already stored in the map.
Value *Shadow = ShadowMap[V];
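The early return added above makes the policy uniform: when PropagateShadow is off, every queried shadow is clean. At the source level the effect looks roughly like the hedged illustration below; the attribute spelling is Clang's, shown only to indicate a function that lacks the sanitize_memory IR attribute.

    // Hedged illustration, not part of the patch.
    __attribute__((no_sanitize_memory))
    int passthrough(int x) {
      // With PropagateShadow == false, every value this function defines,
      // including the return value written to RetvalTLS, carries a clean
      // shadow, so poison can no longer leak out of an opted-out function.
      return x;
    }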
@@ -950,22 +932,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Function *F = A->getParent();
IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
unsigned ArgOffset = 0;
- for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
- AI != AE; ++AI) {
- if (!AI->getType()->isSized()) {
+ for (auto &FArg : F->args()) {
+ if (!FArg.getType()->isSized()) {
DEBUG(dbgs() << "Arg is not sized\n");
continue;
}
- unsigned Size = AI->hasByValAttr()
- ? MS.DL->getTypeAllocSize(AI->getType()->getPointerElementType())
- : MS.DL->getTypeAllocSize(AI->getType());
- if (A == AI) {
- Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset);
- if (AI->hasByValAttr()) {
+ unsigned Size = FArg.hasByValAttr()
+ ? MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType())
+ : MS.DL->getTypeAllocSize(FArg.getType());
+ if (A == &FArg) {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (FArg.hasByValAttr()) {
// ByVal pointer itself has clean shadow. We copy the actual
// argument shadow to the underlying memory.
// Figure out maximal valid memcpy alignment.
- unsigned ArgAlign = AI->getParamAlignment();
+ unsigned ArgAlign = FArg.getParamAlignment();
if (ArgAlign == 0) {
Type *EltType = A->getType()->getPointerElementType();
ArgAlign = MS.DL->getABITypeAlignment(EltType);
@@ -980,10 +961,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
} else {
*ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
}
- DEBUG(dbgs() << " ARG: " << *AI << " ==> " <<
+ DEBUG(dbgs() << " ARG: " << FArg << " ==> " <<
**ShadowPtr << "\n");
if (MS.TrackOrigins) {
- Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset);
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
}
}
@@ -1093,7 +1075,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(I.getNextNode());
Type *ShadowTy = getShadowTy(&I);
Value *Addr = I.getPointerOperand();
- if (LoadShadow) {
+ if (PropagateShadow) {
Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
setShadow(&I,
IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
@@ -1108,7 +1090,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
I.setOrdering(addAcquireOrdering(I.getOrdering()));
if (MS.TrackOrigins) {
- if (LoadShadow) {
+ if (PropagateShadow) {
unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
setOrigin(&I,
IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment));
@@ -1320,10 +1302,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (!Origin) {
Origin = OpOrigin;
} else {
- Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
- Value *Cond = IRB.CreateICmpNE(FlatShadow,
- MSV->getCleanShadow(FlatShadow));
- Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+ // No point in adding something that might result in a 0 origin value.
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) {
+ Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
+ Value *Cond =
+ IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
+ Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ }
}
}
return *this;
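To make the guarded select chain above concrete, here is a minimal C++ model of the rule it implements (a hypothetical standalone helper, not part of the patch). In the pass the zero-origin case is detected on a compile-time constant; the model folds that into a runtime check for brevity.

    // Hedged sketch: 0 stands for a clean (constant-null) origin id.
    static unsigned chainOrigin(unsigned Origin, unsigned OpOrigin,
                                bool OpShadowNonZero) {
      if (OpOrigin == 0)
        return Origin;                            // clean origin: no select
      return OpShadowNonZero ? OpOrigin : Origin; // the CreateSelect above
    }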
@@ -1411,13 +1397,61 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SC.Done(&I);
}
+ // \brief Handle multiplication by constant.
+ //
+ // Handle a special case of multiplication by constant that may have one or
+ // more zeros in the lower bits. This makes the corresponding number of lower
+ // of the result zero as well. We model it by shifting the other operand
+ // shadow left by the required number of bits. Effectively, we transform
+ // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
+ // We use multiplication by 2**N instead of shift to cover the case of
+ // multiplication by 0, which may occur in some elements of a vector operand.
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
+ Value *OtherArg) {
+ Constant *ShadowMul;
+ Type *Ty = ConstArg->getType();
+ if (Ty->isVectorTy()) {
+ unsigned NumElements = Ty->getVectorNumElements();
+ Type *EltTy = Ty->getSequentialElementType();
+ SmallVector<Constant *, 16> Elements;
+ for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+ ConstantInt *Elt =
+ dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx));
+ APInt V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ Elements.push_back(ConstantInt::get(EltTy, V2));
+ }
+ ShadowMul = ConstantVector::get(Elements);
+ } else {
+ ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg);
+ APInt V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ ShadowMul = ConstantInt::get(Elt->getType(), V2);
+ }
+
+ IRBuilder<> IRB(&I);
+ setShadow(&I,
+ IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
+ setOrigin(&I, getOrigin(OtherArg));
+ }
+
+ void visitMul(BinaryOperator &I) {
+ Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+ Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+ if (constOp0 && !constOp1)
+ handleMulByConstant(I, constOp0, I.getOperand(1));
+ else if (constOp1 && !constOp0)
+ handleMulByConstant(I, constOp1, I.getOperand(0));
+ else
+ handleShadowOr(I);
+ }
+
void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
void visitSub(BinaryOperator &I) { handleShadowOr(I); }
void visitXor(BinaryOperator &I) { handleShadowOr(I); }
- void visitMul(BinaryOperator &I) { handleShadowOr(I); }
void handleDiv(Instruction &I) {
IRBuilder<> IRB(&I);
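For a concrete sense of what handleMulByConstant above computes, here is a standalone C++ sketch (hypothetical helper, not part of the patch) of the scalar shadow-multiplier logic, using plain integers in place of APInt:

    #include <cstdint>
    // For C = A * 2**B (B = trailing zero bits of C), the shadow of the
    // other operand is multiplied by 2**B. C == 0 yields a multiplier of
    // 0, matching "the result is always a fully defined zero".
    uint64_t shadowMulForConstant(uint64_t C) {
      unsigned B = 0;
      while (B < 64 && ((C >> B) & 1) == 0) ++B; // countTrailingZeros
      return B == 64 ? 0 : (uint64_t{1} << B);
    }

For example, multiplying by 24 (= 3 * 2**3) multiplies the operand shadow by 8: the low three bits of the product are always zero, hence always initialized.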
@@ -1723,7 +1757,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Addr = I.getArgOperand(0);
Type *ShadowTy = getShadowTy(&I);
- if (LoadShadow) {
+ if (PropagateShadow) {
Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
// We don't know the pointer alignment (could be unaligned SSE load!).
// Have to assume the worst case.
@@ -1736,7 +1770,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(Addr, &I);
if (MS.TrackOrigins) {
- if (LoadShadow)
+ if (PropagateShadow)
setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB)));
else
setOrigin(&I, getCleanOrigin());
@@ -1946,6 +1980,120 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // \brief Get an X86_MMX-sized vector type.
+ Type *getMMXVectorTy(unsigned EltSizeInBits) {
+ const unsigned X86_MMXSizeInBits = 64;
+ return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+ X86_MMXSizeInBits / EltSizeInBits);
+ }
+
+ // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // intrinsic.
+ Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+ switch (id) {
+ case llvm::Intrinsic::x86_sse2_packsswb_128:
+ case llvm::Intrinsic::x86_sse2_packuswb_128:
+ return llvm::Intrinsic::x86_sse2_packsswb_128;
+
+ case llvm::Intrinsic::x86_sse2_packssdw_128:
+ case llvm::Intrinsic::x86_sse41_packusdw:
+ return llvm::Intrinsic::x86_sse2_packssdw_128;
+
+ case llvm::Intrinsic::x86_avx2_packsswb:
+ case llvm::Intrinsic::x86_avx2_packuswb:
+ return llvm::Intrinsic::x86_avx2_packsswb;
+
+ case llvm::Intrinsic::x86_avx2_packssdw:
+ case llvm::Intrinsic::x86_avx2_packusdw:
+ return llvm::Intrinsic::x86_avx2_packssdw;
+
+ case llvm::Intrinsic::x86_mmx_packsswb:
+ case llvm::Intrinsic::x86_mmx_packuswb:
+ return llvm::Intrinsic::x86_mmx_packsswb;
+
+ case llvm::Intrinsic::x86_mmx_packssdw:
+ return llvm::Intrinsic::x86_mmx_packssdw;
+ default:
+ llvm_unreachable("unexpected intrinsic id");
+ }
+ }
+
+ // \brief Instrument vector pack intrinsic.
+ //
+ // This function instruments intrinsics like x86_mmx_packsswb, that
+ // packs elements of 2 input vectors into half as many bits with saturation.
+ // Shadow is propagated with the signed variant of the same intrinsic applied
+ // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+ // EltSizeInBits is used only for x86mmx arguments.
+ void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+ assert(I.getNumArgOperands() == 2);
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ IRBuilder<> IRB(&I);
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ assert(isX86_MMX || S1->getType()->isVectorTy());
+
+ // SExt and ICmpNE below must apply to individual elements of input vectors.
+ // In case of x86mmx arguments, cast them to appropriate vector types and
+ // back.
+ Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+ if (isX86_MMX) {
+ S1 = IRB.CreateBitCast(S1, T);
+ S2 = IRB.CreateBitCast(S2, T);
+ }
+ Value *S1_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S1, llvm::Constant::getNullValue(T)), T);
+ Value *S2_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S2, llvm::Constant::getNullValue(T)), T);
+ if (isX86_MMX) {
+ Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+ S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+ S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+ }
+
+ Function *ShadowFn = Intrinsic::getDeclaration(
+ F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+ Value *S = IRB.CreateCall2(ShadowFn, S1_ext, S2_ext, "_msprop_vector_pack");
+ if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
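A hedged per-lane model of the pack shadow rule above, sketched in plain C++ (not part of the patch): an output byte is fully poisoned iff any bit of its source element's shadow is set, since saturation can make every output bit depend on every input bit.

    #include <cstdint>
    // Applying the *signed* pack to sext(S != 0) realizes exactly this
    // mapping: signed saturation takes 0xFFFF (-1) to 0xFF and 0 to 0.
    // An unsigned pack would clamp -1 to 0 and silently drop the poison,
    // which is why even unsigned packs are modeled with the signed variant.
    uint8_t packShadowLane(uint16_t SrcShadow) {
      return SrcShadow != 0 ? 0xFF : 0x00;
    }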
+ // \brief Instrument sum-of-absolute-differences intrinsic.
+ void handleVectorSadIntrinsic(IntrinsicInst &I) {
+ const unsigned SignificantBitsPerResultElement = 16;
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+ unsigned ZeroBitsPerResultElement =
+ ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
+
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
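Why the shift by 48 bits above is sound, as a hedged C++ model of one 64-bit psadbw result element (illustration only): the sum of eight absolute byte differences is at most 8 * 255 = 2040, which fits in 16 bits, so only the low 16 bits of each result element can ever be non-zero.

    #include <cstdint>
    uint64_t sadShadowElement(bool AnyInputBitPoisoned) {
      // 64 significant bits minus 16 possible result bits = 48 zero bits.
      return AnyInputBitPoisoned ? (~0ULL >> 48) : 0; // 0xFFFF or 0
    }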
+ // \brief Instrument multiply-add intrinsic.
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+ unsigned EltSizeInBits = 0) {
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case llvm::Intrinsic::bswap:
@@ -2062,6 +2210,47 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// case llvm::Intrinsic::x86_sse2_psll_dq_bs:
// case llvm::Intrinsic::x86_sse2_psrl_dq_bs:
+ case llvm::Intrinsic::x86_sse2_packsswb_128:
+ case llvm::Intrinsic::x86_sse2_packssdw_128:
+ case llvm::Intrinsic::x86_sse2_packuswb_128:
+ case llvm::Intrinsic::x86_sse41_packusdw:
+ case llvm::Intrinsic::x86_avx2_packsswb:
+ case llvm::Intrinsic::x86_avx2_packssdw:
+ case llvm::Intrinsic::x86_avx2_packuswb:
+ case llvm::Intrinsic::x86_avx2_packusdw:
+ handleVectorPackIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_packsswb:
+ case llvm::Intrinsic::x86_mmx_packuswb:
+ handleVectorPackIntrinsic(I, 16);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_packssdw:
+ handleVectorPackIntrinsic(I, 32);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_psad_bw:
+ case llvm::Intrinsic::x86_sse2_psad_bw:
+ case llvm::Intrinsic::x86_avx2_psad_bw:
+ handleVectorSadIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_sse2_pmadd_wd:
+ case llvm::Intrinsic::x86_avx2_pmadd_wd:
+ case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+ case llvm::Intrinsic::x86_avx2_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I);
+ break;
+
+ case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I, 8);
+ break;
+
+ case llvm::Intrinsic::x86_mmx_pmadd_wd:
+ handleVectorPmaddIntrinsic(I, 16);
+ break;
+
default:
if (!handleUnknownIntrinsic(I))
visitInstruction(I);
@@ -2083,12 +2272,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return;
}
- // Allow only tail calls with the same types, otherwise
- // we may have a false positive: shadow for a non-void RetVal
- // will get propagated to a void RetVal.
- if (Call->isTailCall() && Call->getType() != Call->getParent()->getType())
- Call->setTailCall(false);
-
assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
// We are going to insert code that relies on the fact that the callee
@@ -2211,6 +2394,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitPHINode(PHINode &I) {
IRBuilder<> IRB(&I);
+ if (!PropagateShadow) {
+ setShadow(&I, getCleanShadow(&I));
+ return;
+ }
+
ShadowPHINodes.push_back(&I);
setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
"_msphi_s"));
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8fe9bca..89386a6 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -40,14 +40,11 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
using namespace llvm;
#define DEBUG_TYPE "tsan"
-static cl::opt<std::string> ClBlacklistFile("tsan-blacklist",
- cl::desc("Blacklist file"), cl::Hidden);
static cl::opt<bool> ClInstrumentMemoryAccesses(
"tsan-instrument-memory-accesses", cl::init(true),
cl::desc("Instrument memory accesses"), cl::Hidden);
@@ -76,11 +73,7 @@ namespace {
/// ThreadSanitizer: instrument the code in module to find races.
struct ThreadSanitizer : public FunctionPass {
- ThreadSanitizer(StringRef BlacklistFile = StringRef())
- : FunctionPass(ID),
- DL(nullptr),
- BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
- : BlacklistFile) { }
+ ThreadSanitizer() : FunctionPass(ID), DL(nullptr) {}
const char *getPassName() const override;
bool runOnFunction(Function &F) override;
bool doInitialization(Module &M) override;
@@ -98,8 +91,6 @@ struct ThreadSanitizer : public FunctionPass {
const DataLayout *DL;
Type *IntptrTy;
- SmallString<64> BlacklistFile;
- std::unique_ptr<SpecialCaseList> BL;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
Function *TsanFuncEntry;
@@ -129,8 +120,8 @@ const char *ThreadSanitizer::getPassName() const {
return "ThreadSanitizer";
}
-FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) {
- return new ThreadSanitizer(BlacklistFile);
+FunctionPass *llvm::createThreadSanitizerPass() {
+ return new ThreadSanitizer();
}
static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
@@ -228,7 +219,6 @@ bool ThreadSanitizer::doInitialization(Module &M) {
if (!DLP)
report_fatal_error("data layout missing");
DL = &DLP->getDataLayout();
- BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -322,7 +312,6 @@ static bool isAtomic(Instruction *I) {
bool ThreadSanitizer::runOnFunction(Function &F) {
if (!DL) return false;
- if (BL->isIn(F)) return false;
initializeCallbacks(*F.getParent());
SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> AllLoadsAndStores;
@@ -331,22 +320,20 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
SmallVector<Instruction*, 8> MemIntrinCalls;
bool Res = false;
bool HasCalls = false;
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
// Traverse all instructions, collect loads/stores/returns, check for calls.
- for (Function::iterator FI = F.begin(), FE = F.end();
- FI != FE; ++FI) {
- BasicBlock &BB = *FI;
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end();
- BI != BE; ++BI) {
- if (isAtomic(BI))
- AtomicAccesses.push_back(BI);
- else if (isa<LoadInst>(BI) || isa<StoreInst>(BI))
- LocalLoadsAndStores.push_back(BI);
- else if (isa<ReturnInst>(BI))
- RetVec.push_back(BI);
- else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
- if (isa<MemIntrinsic>(BI))
- MemIntrinCalls.push_back(BI);
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (isAtomic(&Inst))
+ AtomicAccesses.push_back(&Inst);
+ else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ LocalLoadsAndStores.push_back(&Inst);
+ else if (isa<ReturnInst>(Inst))
+ RetVec.push_back(&Inst);
+ else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
HasCalls = true;
chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
}
@@ -358,21 +345,22 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
// FIXME: many of these accesses do not need to be checked for races
// (e.g. variables that do not escape, etc).
- // Instrument memory accesses.
- if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread))
- for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
- Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
+ // Instrument memory accesses only if we want to report bugs in the function.
+ if (ClInstrumentMemoryAccesses && SanitizeFunction)
+ for (auto Inst : AllLoadsAndStores) {
+ Res |= instrumentLoadOrStore(Inst);
}
- // Instrument atomic memory accesses.
+ // Instrument atomic memory accesses in any case (they can be used to
+ // implement synchronization).
if (ClInstrumentAtomics)
- for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) {
- Res |= instrumentAtomic(AtomicAccesses[i]);
+ for (auto Inst : AtomicAccesses) {
+ Res |= instrumentAtomic(Inst);
}
- if (ClInstrumentMemIntrinsics)
- for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) {
- Res |= instrumentMemIntrinsic(MemIntrinCalls[i]);
+ if (ClInstrumentMemIntrinsics && SanitizeFunction)
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(Inst);
}
// Instrument function entry/exit points if there were instrumented accesses.
@@ -382,8 +370,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
IRB.getInt32(0));
IRB.CreateCall(TsanFuncEntry, ReturnAddress);
- for (size_t i = 0, n = RetVec.size(); i < n; ++i) {
- IRBuilder<> IRBRet(RetVec[i]);
+ for (auto RetInst : RetVec) {
+ IRBuilder<> IRBRet(RetInst);
IRBRet.CreateCall(TsanFuncExit);
}
Res = true;
@@ -543,8 +531,14 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
createOrdering(&IRB, CASI->getSuccessOrdering()),
createOrdering(&IRB, CASI->getFailureOrdering())};
- CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
- ReplaceInstWithInst(I, C);
+ CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
+ Value *Success = IRB.CreateICmpEQ(C, CASI->getCompareOperand());
+
+ Value *Res = IRB.CreateInsertValue(UndefValue::get(CASI->getType()), C, 0);
+ Res = IRB.CreateInsertValue(Res, Success, 1);
+
+ I->replaceAllUsesWith(Res);
+ I->eraseFromParent();
} else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
Function *F = FI->getSynchScope() == SingleThread ?
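Since cmpxchg now yields a { value, success } pair in IR, the instrumentation can no longer substitute the runtime call's scalar result directly; the hunk above rebuilds the pair. A minimal C++ model of the emitted sequence (a sketch, with hypothetical naming; the real code uses icmp plus two insertvalue instructions):

    #include <utility>
    template <typename T>
    std::pair<T, bool> instrumentedCAS(T OldFromRuntime, T Expected) {
      // OldFromRuntime models the return of the __tsan_atomic*_compare_
      // exchange_val callback; success is recomputed by comparison.
      return {OldFromRuntime, OldFromRuntime == Expected};
    }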
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index 079cc86..5e22de6 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk
@@ -8,11 +8,11 @@ transforms_scalar_SRC_FILES := \
DCE.cpp \
DeadStoreElimination.cpp \
EarlyCSE.cpp \
- GlobalMerge.cpp \
GVN.cpp \
IndVarSimplify.cpp \
JumpThreading.cpp \
LICM.cpp \
+ LoadCombine.cpp \
LoopDeletion.cpp \
LoopIdiomRecognize.cpp \
LoopInstSimplify.cpp \
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 3ad1488..2dcfa23 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -8,10 +8,10 @@ add_llvm_library(LLVMScalarOpts
EarlyCSE.cpp
FlattenCFGPass.cpp
GVN.cpp
- GlobalMerge.cpp
IndVarSimplify.cpp
JumpThreading.cpp
LICM.cpp
+ LoadCombine.cpp
LoopDeletion.cpp
LoopIdiomRecognize.cpp
LoopInstSimplify.cpp
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 6d07ddd..106eba0 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -1464,6 +1464,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
continue;
}
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(
+ DepBB, Constant::getNullValue(LI->getType())));
+ continue;
+ }
+
if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
// Reject loads and stores that are to the same address but are of
// different types if we have to.
@@ -1791,6 +1798,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
case LLVMContext::MD_fpmath:
ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
break;
+ case LLVMContext::MD_invariant_load:
+ // Only set the !invariant.load if it is present in both instructions.
+ ReplInst->setMetadata(Kind, IMD);
+ break;
}
}
}
@@ -1988,6 +1999,15 @@ bool GVN::processLoad(LoadInst *L) {
}
}
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ L->replaceAllUsesWith(Constant::getNullValue(L->getType()));
+ markInstructionForDeletion(L);
+ ++NumGVNLoad;
+ return true;
+ }
+
return false;
}
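A minimal source-level example of what the two GVN hunks above enable (a sketch; it assumes no store intervenes between the calloc and the load on the path MemDep analyzes):

    #include <cstdlib>
    int firstElement() {
      int *P = static_cast<int *>(std::calloc(4, sizeof(int)));
      // calloc zero-initializes its allocation, so GVN can fold this load
      // to 0 once the dependency walk reaches the calloc call.
      return P ? P[0] : 0;
    }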
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 230a381..6e50d33 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -158,6 +158,15 @@ bool JumpThreading::runOnFunction(Function &F) {
TLI = &getAnalysis<TargetLibraryInfo>();
LVI = &getAnalysis<LazyValueInfo>();
+ // Remove unreachable blocks from the function, as they may cause an
+ // infinite loop. We thread across a branch whenever it looks profitable,
+ // and threading one branch can create further opportunities. If those
+ // opportunities form a cycle, i.e. some jump threading undoes earlier
+ // threading on the path, we would loop forever. We normally prevent this
+ // by refusing to thread across back edges, but that safeguard does not
+ // cover unreachable blocks, which may contain a cycle with no back edge.
+ removeUnreachableBlocks(F);
+
FindLoopHeaders(F);
bool Changed, EverChanged = false;
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0a8d16f..abcceb2 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -192,6 +192,14 @@ namespace {
SmallVectorImpl<BasicBlock*> &ExitBlocks,
SmallVectorImpl<Instruction*> &InsertPts,
PredIteratorCache &PIC);
+
+ /// \brief Create a copy of the instruction in the exit block and patch up
+ /// SSA.
+ /// PN is a user of I in ExitBlock that can be used to get the number and
+ /// list of predecessors fast.
+ Instruction *CloneInstructionInExitBlock(Instruction &I,
+ BasicBlock &ExitBlock,
+ PHINode &PN);
};
}
@@ -531,6 +539,35 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
return true;
}
+Instruction *LICM::CloneInstructionInExitBlock(Instruction &I,
+ BasicBlock &ExitBlock,
+ PHINode &PN) {
+ Instruction *New = I.clone();
+ ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+ if (!I.getName().empty()) New->setName(I.getName() + ".le");
+
+ // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+ // particularly cheap because we can rip off the PHI node that we're
+ // replacing for the number and blocks of the predecessors.
+ // OPT: If this shows up in a profile, we can instead finish sinking all
+ // invariant instructions, and then walk their operands to re-establish
+ // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+ // sinking bottom-up.
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+ ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+ if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+ if (!OLoop->contains(&PN)) {
+ PHINode *OpPN =
+ PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+ OInst->getName() + ".lcssa", ExitBlock.begin());
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+ *OI = OpPN;
+ }
+ return New;
+}
+
/// sink - When an instruction is found to only be used outside of the loop,
/// this function moves it to the exit blocks and patches up SSA form as needed.
/// This method is guaranteed to remove the original instruction from its
@@ -550,6 +587,9 @@ void LICM::sink(Instruction &I) {
SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
#endif
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
// If this instruction is only used outside of the loop, then all users are
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
@@ -561,30 +601,13 @@ void LICM::sink(Instruction &I) {
assert(ExitBlockSet.count(ExitBlock) &&
"The LCSSA PHI is not in an exit block!");
- Instruction *New = I.clone();
- ExitBlock->getInstList().insert(ExitBlock->getFirstInsertionPt(), New);
- if (!I.getName().empty())
- New->setName(I.getName() + ".le");
-
- // Build LCSSA PHI nodes for any in-loop operands. Note that this is
- // particularly cheap because we can rip off the PHI node that we're
- // replacing for the number and blocks of the predecessors.
- // OPT: If this shows up in a profile, we can instead finish sinking all
- // invariant instructions, and then walk their operands to re-establish
- // LCSSA. That will eliminate creating PHI nodes just to nuke them when
- // sinking bottom-up.
- for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
- ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(*OI))
- if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
- if (!OLoop->contains(PN)) {
- PHINode *OpPN = PHINode::Create(
- OInst->getType(), PN->getNumIncomingValues(),
- OInst->getName() + ".lcssa", ExitBlock->begin());
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- OpPN->addIncoming(OInst, PN->getIncomingBlock(i));
- *OI = OpPN;
- }
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] =
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
@@ -616,7 +639,7 @@ void LICM::hoist(Instruction &I) {
///
bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
// If it is not a trapping instruction, it is always safe to hoist.
- if (isSafeToSpeculativelyExecute(&Inst))
+ if (isSafeToSpeculativelyExecute(&Inst, DL))
return true;
return isGuaranteedToExecute(Inst);
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
new file mode 100644
index 0000000..846aa70
--- /dev/null
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -0,0 +1,268 @@
+//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation combines adjacent loads.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-combine"
+
+STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
+STATISTIC(NumLoadsCombined, "Number of loads combined");
+
+namespace {
+struct PointerOffsetPair {
+ Value *Pointer;
+ uint64_t Offset;
+};
+
+struct LoadPOPPair {
+ LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O)
+ : Load(L), POP(P), InsertOrder(O) {}
+ LoadPOPPair() {}
+ LoadInst *Load;
+ PointerOffsetPair POP;
+ /// \brief The new load needs to be created before the first load in IR order.
+ unsigned InsertOrder;
+};
+
+class LoadCombine : public BasicBlockPass {
+ LLVMContext *C;
+ const DataLayout *DL;
+
+public:
+ LoadCombine()
+ : BasicBlockPass(ID),
+ C(nullptr), DL(nullptr) {
+ initializeLoadCombinePass(*PassRegistry::getPassRegistry());
+ }
+ bool doInitialization(Function &) override;
+ bool runOnBasicBlock(BasicBlock &BB) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ const char *getPassName() const override { return "LoadCombine"; }
+ static char ID;
+
+ typedef IRBuilder<true, TargetFolder> BuilderTy;
+
+private:
+ BuilderTy *Builder;
+
+ PointerOffsetPair getPointerOffsetPair(LoadInst &);
+ bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &);
+ bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &);
+ bool combineLoads(SmallVectorImpl<LoadPOPPair> &);
+};
+}
+
+bool LoadCombine::doInitialization(Function &F) {
+ DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
+ C = &F.getContext();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ if (!DLP) {
+ DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n");
+ return false;
+ }
+ DL = &DLP->getDataLayout();
+ return true;
+}
+
+PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
+ PointerOffsetPair POP;
+ POP.Pointer = LI.getPointerOperand();
+ POP.Offset = 0;
+ while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
+ unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType());
+ APInt Offset(BitWidth, 0);
+ if (GEP->accumulateConstantOffset(*DL, Offset))
+ POP.Offset += Offset.getZExtValue();
+ else
+ // Can't handle GEPs with variable indices.
+ return POP;
+ POP.Pointer = GEP->getPointerOperand();
+ } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer))
+ POP.Pointer = BC->getOperand(0);
+ }
+ return POP;
+}
+
+bool LoadCombine::combineLoads(
+ DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) {
+ bool Combined = false;
+ for (auto &Loads : LoadMap) {
+ if (Loads.second.size() < 2)
+ continue;
+ std::sort(Loads.second.begin(), Loads.second.end(),
+ [](const LoadPOPPair &A, const LoadPOPPair &B) {
+ return A.POP.Offset < B.POP.Offset;
+ });
+ if (aggregateLoads(Loads.second))
+ Combined = true;
+ }
+ return Combined;
+}
+
+/// \brief Try to aggregate loads from a sorted list of loads to be combined.
+///
+/// It is guaranteed that no writes occur between any of the loads. All loads
+/// have the same base pointer. There are at least two loads.
+bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+ assert(Loads.size() >= 2 && "Insufficient loads!");
+ LoadInst *BaseLoad = nullptr;
+ SmallVector<LoadPOPPair, 8> AggregateLoads;
+ bool Combined = false;
+ uint64_t PrevOffset = -1ull;
+ uint64_t PrevSize = 0;
+ for (auto &L : Loads) {
+ if (PrevOffset == -1ull) {
+ BaseLoad = L.Load;
+ PrevOffset = L.POP.Offset;
+ PrevSize = DL->getTypeStoreSize(L.Load->getType());
+ AggregateLoads.push_back(L);
+ continue;
+ }
+ if (L.Load->getAlignment() > BaseLoad->getAlignment())
+ continue;
+ if (L.POP.Offset > PrevOffset + PrevSize) {
+ // No other load will be combinable
+ if (combineLoads(AggregateLoads))
+ Combined = true;
+ AggregateLoads.clear();
+ PrevOffset = -1;
+ continue;
+ }
+ if (L.POP.Offset != PrevOffset + PrevSize)
+ // This load overlaps the previous one: its offset starts inside the last load.
+ // FIXME: We may want to handle this case.
+ continue;
+ PrevOffset = L.POP.Offset;
+ PrevSize = DL->getTypeStoreSize(L.Load->getType());
+ AggregateLoads.push_back(L);
+ }
+ if (combineLoads(AggregateLoads))
+ Combined = true;
+ return Combined;
+}
+
+/// \brief Given a list of combinable loads, combine the maximum number of them.
+bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+ // Remove loads from the end while the size is not a power of 2.
+ unsigned TotalSize = 0;
+ for (const auto &L : Loads)
+ TotalSize += L.Load->getType()->getPrimitiveSizeInBits();
+ while (TotalSize != 0 && !isPowerOf2_32(TotalSize))
+ TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits();
+ if (Loads.size() < 2)
+ return false;
+
+ DEBUG({
+ dbgs() << "***** Combining Loads ******\n";
+ for (const auto &L : Loads) {
+ dbgs() << L.POP.Offset << ": " << *L.Load << "\n";
+ }
+ });
+
+ // Find first load. This is where we put the new load.
+ LoadPOPPair FirstLP;
+ FirstLP.InsertOrder = -1u;
+ for (const auto &L : Loads)
+ if (L.InsertOrder < FirstLP.InsertOrder)
+ FirstLP = L;
+
+ unsigned AddressSpace =
+ FirstLP.POP.Pointer->getType()->getPointerAddressSpace();
+
+ Builder->SetInsertPoint(FirstLP.Load);
+ Value *Ptr = Builder->CreateConstGEP1_64(
+ Builder->CreatePointerCast(Loads[0].POP.Pointer,
+ Builder->getInt8PtrTy(AddressSpace)),
+ Loads[0].POP.Offset);
+ LoadInst *NewLoad = new LoadInst(
+ Builder->CreatePointerCast(
+ Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize),
+ Ptr->getType()->getPointerAddressSpace())),
+ Twine(Loads[0].Load->getName()) + ".combined", false,
+ Loads[0].Load->getAlignment(), FirstLP.Load);
+
+ for (const auto &L : Loads) {
+ Builder->SetInsertPoint(L.Load);
+ Value *V = Builder->CreateExtractInteger(
+ *DL, NewLoad, cast<IntegerType>(L.Load->getType()),
+ L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
+ L.Load->replaceAllUsesWith(V);
+ }
+
+ NumLoadsCombined = NumLoadsCombined + Loads.size();
+ return true;
+}
+
+bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
+ if (skipOptnoneFunction(BB) || !DL)
+ return false;
+
+ IRBuilder<true, TargetFolder>
+ TheBuilder(BB.getContext(), TargetFolder(DL));
+ Builder = &TheBuilder;
+
+ DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
+
+ bool Combined = false;
+ unsigned Index = 0;
+ for (auto &I : BB) {
+ if (I.mayWriteToMemory() || I.mayThrow()) {
+ if (combineLoads(LoadMap))
+ Combined = true;
+ LoadMap.clear();
+ continue;
+ }
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ continue;
+ ++NumLoadsAnalyzed;
+ if (!LI->isSimple() || !LI->getType()->isIntegerTy())
+ continue;
+ auto POP = getPointerOffsetPair(*LI);
+ if (!POP.Pointer)
+ continue;
+ LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+ }
+ if (combineLoads(LoadMap))
+ Combined = true;
+ return Combined;
+}
+
+void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+}
+
+char LoadCombine::ID = 0;
+
+BasicBlockPass *llvm::createLoadCombinePass() {
+ return new LoadCombine();
+}
+
+INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false,
+ false)
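As a rough illustration of the pattern the new pass targets (subject to its power-of-two total-size check and to there being no intervening writes or throwing instructions in the block):

    #include <cstdint>
    uint64_t sumAdjacent(const uint32_t *P) {
      uint32_t Lo = P[0]; // base + 0
      uint32_t Hi = P[1]; // base + 4: contiguous with the first load
      // LoadCombine can rewrite the two i32 loads as a single i64 load
      // followed by two "combine.extract" shift/trunc operations.
      return static_cast<uint64_t>(Lo) + Hi;
    }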
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 26a83df..a12f5a7 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -112,7 +112,7 @@ namespace {
/// the variable involved in the comparison is returned. This function will
/// be called to see if the precondition and postcondition of the loop
/// are in desirable form.
- Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
+ Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const;
/// Return true iff the idiom is detected in the loop, and 1) \p CntInst
/// is set to the instruction counting the population bit. 2) \p CntPhi
@@ -122,7 +122,7 @@ namespace {
(Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
/// Insert ctpop intrinsic function and some obviously dead instructions.
- void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
+ void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var);
/// Create llvm.ctpop.* intrinsic function.
CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 8b5e036..b6fbb16 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -924,8 +924,10 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
// them, and this matching fails. As an exception, we allow the alias
// set tracker to handle regular (simple) load/store dependencies.
if (FutureSideEffects &&
- ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
- (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
+ ((!isSimpleLoadStore(J1) &&
+ !isSafeToSpeculativelyExecute(J1, DL)) ||
+ (!isSimpleLoadStore(J2) &&
+ !isSafeToSpeculativelyExecute(J2, DL)))) {
DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
" vs. " << *J2 <<
" (side effects prevent reordering)\n");
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index fc28fd2..00c0f88 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -18,8 +18,10 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -36,7 +38,8 @@ UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
static cl::opt<unsigned>
UnrollCount("unroll-count", cl::init(0), cl::Hidden,
- cl::desc("Use this unroll count for all loops, for testing purposes"));
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
static cl::opt<bool>
UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
@@ -47,6 +50,11 @@ static cl::opt<bool>
UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden,
cl::desc("Unroll loops with run-time trip counts"));
+static cl::opt<unsigned>
+PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(enable) or "
+ "unroll_count pragma."));
+
namespace {
class LoopUnroll : public LoopPass {
public:
@@ -109,6 +117,66 @@ namespace {
// For now, recreate dom info, if loop is unrolled.
AU.addPreserved<DominatorTreeWrapperPass>();
}
+
+ // Fill in the UnrollingPreferences parameter with values from the
+ // TargetTransformationInfo.
+ void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ UP.Threshold = CurrentThreshold;
+ UP.OptSizeThreshold = OptSizeUnrollThreshold;
+ UP.PartialThreshold = CurrentThreshold;
+ UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
+ UP.Count = CurrentCount;
+ UP.MaxCount = UINT_MAX;
+ UP.Partial = CurrentAllowPartial;
+ UP.Runtime = CurrentRuntime;
+ TTI.getUnrollingPreferences(L, UP);
+ }
+
+ // Select and return an unroll count based on parameters from
+ // user, unroll preferences, unroll pragmas, or a heuristic.
+ // SetExplicitly is set to true if the unroll count is set by
+ // the user or a pragma rather than selected heuristically.
+ unsigned
+ selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma,
+ unsigned PragmaCount,
+ const TargetTransformInfo::UnrollingPreferences &UP,
+ bool &SetExplicitly);
+
+
+ // Select threshold values used to limit unrolling based on a
+ // total unrolled size. Parameters Threshold and PartialThreshold
+ // are set to the maximum unrolled size for fully and partially
+ // unrolled loops respectively.
+ void selectThresholds(const Loop *L, bool HasPragma,
+ const TargetTransformInfo::UnrollingPreferences &UP,
+ unsigned &Threshold, unsigned &PartialThreshold) {
+ // Determine the current unrolling threshold. While this is
+ // normally set from UnrollThreshold, it is overridden to a
+ // smaller value if the current function is marked as
+ // optimize-for-size, and the unroll threshold was not user
+ // specified.
+ Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+ PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold;
+ if (!UserThreshold &&
+ L->getHeader()->getParent()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize)) {
+ Threshold = UP.OptSizeThreshold;
+ PartialThreshold = UP.PartialOptSizeThreshold;
+ }
+ if (HasPragma) {
+ // If the loop has an unrolling pragma, we want to be more
+ // aggressive with unrolling limits. Set thresholds to at
+ // least the PragmaUnrollThreshold value, which is larger than the
+ // default limits.
+ if (Threshold != NoThreshold)
+ Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold);
+ if (PartialThreshold != NoThreshold)
+ PartialThreshold =
+ std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold);
+ }
+ }
};
}
@@ -151,6 +219,103 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
return LoopSize;
}
+// Returns the value associated with the given metadata node name (for
+// example, "llvm.loop.unroll.count"). If no such named metadata node
+// exists, then nullptr is returned.
+static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
+ StringRef Name) {
+ MDNode *LoopID = L->getLoopID();
+ if (!LoopID) return nullptr;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD) continue;
+
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S) continue;
+
+ if (Name.equals(S->getString())) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll hint metadata should have two operands.");
+ return cast<ConstantInt>(MD->getOperand(1));
+ }
+ }
+ return nullptr;
+}
+
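The metadata walked above is what Clang's loop-hint pragmas of this period lower to. A hedged source-level example, assuming the unroll_count pragma spelling:

    void zeroArray(int *A, int N) {
      // Lowers to loop metadata containing a "llvm.loop.unroll.count" node
      // whose second operand is i32 4 -- what GetUnrollMetadataValue reads.
    #pragma clang loop unroll_count(4)
      for (int I = 0; I < N; ++I)
        A[I] = 0;
    }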
+// Returns true if the loop has an unroll(enable) pragma.
+static bool HasUnrollEnablePragma(const Loop *L) {
+ const ConstantInt *EnableValue =
+ GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
+ return (EnableValue && EnableValue->getZExtValue());
+}
+
+// Returns true if the loop has an unroll(disable) pragma.
+static bool HasUnrollDisablePragma(const Loop *L) {
+ const ConstantInt *EnableValue =
+ GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
+ return (EnableValue && !EnableValue->getZExtValue());
+}
+
+// If loop has an unroll_count pragma return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned UnrollCountPragmaValue(const Loop *L) {
+ const ConstantInt *CountValue =
+ GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
+ if (CountValue) {
+ unsigned Count = CountValue->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
+unsigned LoopUnroll::selectUnrollCount(
+ const Loop *L, unsigned TripCount, bool HasEnablePragma,
+ unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
+ bool &SetExplicitly) {
+ SetExplicitly = true;
+
+ // User-specified count (either as a command-line option or
+ // constructor parameter) has highest precedence.
+ unsigned Count = UserCount ? CurrentCount : 0;
+
+ // If there is no user-specified count, unroll pragmas have the next
+ // highest precedence.
+ if (Count == 0) {
+ if (PragmaCount) {
+ Count = PragmaCount;
+ } else if (HasEnablePragma) {
+ // unroll(enable) pragma without an unroll_count pragma
+ // indicates to unroll loop fully.
+ Count = TripCount;
+ }
+ }
+
+ if (Count == 0)
+ Count = UP.Count;
+
+ if (Count == 0) {
+ SetExplicitly = false;
+ if (TripCount == 0)
+ // Runtime trip count.
+ Count = UnrollRuntimeCount;
+ else
+ // Conservative heuristic: if we know the trip count, see if we can
+ // completely unroll (subject to the threshold, checked below); otherwise
+ // try to find greatest modulo of the trip count which is still under
+ // threshold value.
+ Count = TripCount;
+ }
+ if (TripCount && Count > TripCount)
+ return TripCount;
+ return Count;
+}
+
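Condensed, the precedence that selectUnrollCount implements looks like the sketch below (illustration only; the real code additionally clamps the result to TripCount and records whether the count was set explicitly):

    unsigned pickUnrollCount(unsigned UserCount, unsigned PragmaCount,
                             bool EnablePragma, unsigned TTICount,
                             unsigned TripCount, unsigned RuntimeDefault) {
      if (UserCount)    return UserCount;   // -unroll-count option
      if (PragmaCount)  return PragmaCount; // unroll_count pragma
      if (EnablePragma) return TripCount;   // unroll(enable): full unroll
      if (TTICount)     return TTICount;    // target preference
      return TripCount ? TripCount : RuntimeDefault; // heuristic fallback
    }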
bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipOptnoneFunction(L))
return false;
@@ -162,33 +327,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
BasicBlock *Header = L->getHeader();
DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
<< "] Loop %" << Header->getName() << "\n");
- (void)Header;
- TargetTransformInfo::UnrollingPreferences UP;
- UP.Threshold = CurrentThreshold;
- UP.OptSizeThreshold = OptSizeUnrollThreshold;
- UP.PartialThreshold = CurrentThreshold;
- UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
- UP.Count = CurrentCount;
- UP.MaxCount = UINT_MAX;
- UP.Partial = CurrentAllowPartial;
- UP.Runtime = CurrentRuntime;
- TTI.getUnrollingPreferences(L, UP);
-
- // Determine the current unrolling threshold. While this is normally set
- // from UnrollThreshold, it is overridden to a smaller value if the current
- // function is marked as optimize-for-size, and the unroll threshold was
- // not user specified.
- unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
- unsigned PartialThreshold =
- UserThreshold ? CurrentThreshold : UP.PartialThreshold;
- if (!UserThreshold &&
- Header->getParent()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize)) {
- Threshold = UP.OptSizeThreshold;
- PartialThreshold = UP.PartialOptSizeThreshold;
+ if (HasUnrollDisablePragma(L)) {
+ return false;
}
+ bool HasEnablePragma = HasUnrollEnablePragma(L);
+ unsigned PragmaCount = UnrollCountPragmaValue(L);
+ bool HasPragma = HasEnablePragma || PragmaCount > 0;
+
+ TargetTransformInfo::UnrollingPreferences UP;
+ getUnrollingPreferences(L, TTI, UP);
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
@@ -202,79 +350,117 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
}
- bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime;
-
- // Use a default unroll-count if the user doesn't specify a value
- // and the trip count is a run-time value. The default is different
- // for run-time or compile-time trip count loops.
- unsigned Count = UserCount ? CurrentCount : UP.Count;
- if (Runtime && Count == 0 && TripCount == 0)
- Count = UnrollRuntimeCount;
+ // Select an initial unroll count. This may be reduced later based
+ // on size thresholds.
+ bool CountSetExplicitly;
+ unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount,
+ UP, CountSetExplicitly);
+
+ unsigned NumInlineCandidates;
+ bool notDuplicatable;
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI);
+ DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
+ if (notDuplicatable) {
+ DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return false;
+ }
+ if (NumInlineCandidates != 0) {
+ DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return false;
+ }
- if (Count == 0) {
- // Conservative heuristic: if we know the trip count, see if we can
- // completely unroll (subject to the threshold, checked below); otherwise
- // try to find greatest modulo of the trip count which is still under
- // threshold value.
- if (TripCount == 0)
- return false;
- Count = TripCount;
+ unsigned Threshold, PartialThreshold;
+ selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold);
+
+ // Given Count, TripCount and thresholds determine the type of
+ // unrolling which is to be performed.
+ enum { Full = 0, Partial = 1, Runtime = 2 };
+ int Unrolling;
+ if (TripCount && Count == TripCount) {
+ if (Threshold != NoThreshold && UnrolledSize > Threshold) {
+ DEBUG(dbgs() << " Too large to fully unroll with count: " << Count
+ << " because size: " << UnrolledSize << ">" << Threshold
+ << "\n");
+ Unrolling = Partial;
+ } else {
+ Unrolling = Full;
+ }
+ } else if (TripCount && Count < TripCount) {
+ Unrolling = Partial;
+ } else {
+ Unrolling = Runtime;
}
- // Enforce the threshold.
- if (Threshold != NoThreshold && PartialThreshold != NoThreshold) {
- unsigned NumInlineCandidates;
- bool notDuplicatable;
- unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates,
- notDuplicatable, TTI);
- DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
- if (notDuplicatable) {
- DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
+ // Reduce count based on the type of unrolling and the threshold values.
+ unsigned OriginalCount = Count;
+ bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
+ if (Unrolling == Partial) {
+ bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
+ if (!AllowPartial && !CountSetExplicitly) {
+ DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
return false;
}
- if (NumInlineCandidates != 0) {
- DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) {
+ // Reduce unroll count to be modulo of TripCount for partial unrolling.
+ Count = PartialThreshold / LoopSize;
+ while (Count != 0 && TripCount % Count != 0)
+ Count--;
+ }
+ } else if (Unrolling == Runtime) {
+ if (!AllowRuntime && !CountSetExplicitly) {
+ DEBUG(dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
return false;
}
- uint64_t Size = (uint64_t)LoopSize*Count;
- if (TripCount != 1 &&
- (Size > Threshold || (Count != TripCount && Size > PartialThreshold))) {
- if (Size > Threshold)
- DEBUG(dbgs() << " Too large to fully unroll with count: " << Count
- << " because size: " << Size << ">" << Threshold << "\n");
-
- bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
- if (!AllowPartial && !(Runtime && TripCount == 0)) {
- DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- return false;
- }
- if (TripCount) {
- // Reduce unroll count to be modulo of TripCount for partial unrolling
- Count = PartialThreshold / LoopSize;
- while (Count != 0 && TripCount%Count != 0)
- Count--;
- }
- else if (Runtime) {
- // Reduce unroll count to be a lower power-of-two value
- while (Count != 0 && Size > PartialThreshold) {
- Count >>= 1;
- Size = LoopSize*Count;
- }
- }
- if (Count > UP.MaxCount)
- Count = UP.MaxCount;
- if (Count < 2) {
- DEBUG(dbgs() << " could not unroll partially\n");
- return false;
+ // Repeatedly halve the unroll count until the unrolled size fits
+ // under the partial threshold.
+ while (Count != 0 && UnrolledSize > PartialThreshold) {
+ Count >>= 1;
+ UnrolledSize = LoopSize * Count;
+ }
+ if (Count > UP.MaxCount)
+ Count = UP.MaxCount;
+ DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
+ }
+
+ if (HasPragma) {
+ // Emit optimization remarks if we are unable to unroll the loop
+ // as directed by a pragma.
+ DebugLoc LoopLoc = L->getStartLoc();
+ Function *F = Header->getParent();
+ LLVMContext &Ctx = F->getContext();
+ if (HasEnablePragma && PragmaCount == 0) {
+ if (TripCount && Count != TripCount) {
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "Unable to fully unroll loop as directed by unroll(enable) pragma "
+ "because unrolled size is too large.");
+ } else if (!TripCount) {
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "Unable to fully unroll loop as directed by unroll(enable) pragma "
+ "because loop has a runtime trip count.");
}
- DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
+ } else if (PragmaCount > 0 && Count != OriginalCount) {
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because unrolled size is too large.");
}
}
+ if (Unrolling != Full && Count < 2) {
+ // Partial unrolling by 1 is a nop. For full unrolling, a factor
+ // of 1 makes sense because loop control can be eliminated.
+ return false;
+ }
+
// Unroll the loop.
- if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, this, &LPM))
+ if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM))
return false;
return true;
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 4251ac4..3314e1e 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -32,7 +32,10 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
Builder.CreateStore(Res, Ptr);
- CXI->replaceAllUsesWith(Orig);
+ Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+ CXI->replaceAllUsesWith(Res);
CXI->eraseFromParent();
return true;
}
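A C++ model of the complete lowering after this change (a sketch, not a definitive rendering): the pass emits a load, compare, select, and store, then packs the pair to match cmpxchg's new { value, success } result type.

    #include <utility>
    template <typename T>
    std::pair<T, bool> loweredCmpXchg(T *Ptr, T Cmp, T Val) {
      T Orig = *Ptr;             // non-atomic: the pass assumes no races
      bool Equal = (Orig == Cmp);
      *Ptr = Equal ? Val : Orig; // the CreateSelect + CreateStore above
      return {Orig, Equal};      // the two CreateInsertValue calls
    }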
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 986d6a4..ea2cf7c 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1368,11 +1368,10 @@ Value *Reassociate::OptimizeXor(Instruction *I,
Value *Reassociate::OptimizeAdd(Instruction *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand lists looking for X and -X pairs. If we find any, we
- // can simplify the expression. X+-X == 0. While we're at it, scan for any
+ // can simplify expressions like X+-X == 0 and X+~X == -1. While we're at
+ // it, scan for any
// duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
- //
- // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1".
- //
+
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *TheOp = Ops[i].Op;
// Check to see if we've seen this operand before. If so, we factor all
@@ -1412,19 +1411,28 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
continue;
}
- // Check for X and -X in the operand list.
- if (!BinaryOperator::isNeg(TheOp))
+ // Check for X and -X or X and ~X in the operand list.
+ if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp))
continue;
- Value *X = BinaryOperator::getNegArgument(TheOp);
+ Value *X = nullptr;
+ if (BinaryOperator::isNeg(TheOp))
+ X = BinaryOperator::getNegArgument(TheOp);
+ else if (BinaryOperator::isNot(TheOp))
+ X = BinaryOperator::getNotArgument(TheOp);
+
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX == i)
continue;
// Remove X and -X from the operand list.
- if (Ops.size() == 2)
+ if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp))
return Constant::getNullValue(X->getType());
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+ return Constant::getAllOnesValue(X->getType());
+
Ops.erase(Ops.begin()+i);
if (i < FoundX)
--FoundX;
@@ -1434,6 +1442,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
++NumAnnihil;
--i; // Revisit element.
e -= 2; // Removed two elements.
+
+ // If we removed X and ~X, append -1 (X + ~X) to the operand list.
+ if (BinaryOperator::isNot(TheOp)) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
}
// Scan the operand list, checking to see if there are any common factors
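The new fold rests on the two's complement identity ~X = -X - 1, so X + ~X = -1 at any bit width. A quick illustrative check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t X : {0, 1, -1, 42, INT32_MIN, INT32_MAX}) {
        // ~X == -X - 1 in two's complement, so X + ~X == -1.
        assert(X + ~X == -1);
      }
    }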
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index feeb231..90c3520 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -494,7 +494,9 @@ private:
void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
- void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); }
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ markAnythingOverdefined(&I);
+ }
void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 04bf4f8..8c7f253 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1032,11 +1032,6 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
UserTy = SI->getValueOperand()->getType();
}
- if (!UserTy || (Ty && Ty != UserTy))
- TyIsCommon = false; // Give up on anything but an iN type.
- else
- Ty = UserTy;
-
if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
// If the type is larger than the partition, skip it. We only encounter
// this for split integer operations where we want to use the type of the
@@ -1051,6 +1046,13 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
ITy = UserITy;
}
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
}
return TyIsCommon ? Ty : ITy;
@@ -1128,7 +1130,7 @@ static bool isSafePHIToSpeculate(PHINode &PN,
// If this pointer is always safe to load, or if we can prove that there
// is already a load in the block, then we can move the load to the pred
// block.
- if (InVal->isDereferenceablePointer() ||
+ if (InVal->isDereferenceablePointer(DL) ||
isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
continue;
@@ -1196,8 +1198,8 @@ static bool isSafeSelectToSpeculate(SelectInst &SI,
const DataLayout *DL = nullptr) {
Value *TValue = SI.getTrueValue();
Value *FValue = SI.getFalseValue();
- bool TDerefable = TValue->isDereferenceablePointer();
- bool FDerefable = FValue->isDereferenceablePointer();
+ bool TDerefable = TValue->isDereferenceablePointer(DL);
+ bool FDerefable = FValue->isDereferenceablePointer(DL);
for (User *U : SI.users()) {
LoadInst *LI = dyn_cast<LoadInst>(U);
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 8e557aa..73c97ff 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -450,13 +450,14 @@ void SampleModuleProfile::dump() {
///
/// \returns true if the file was loaded successfully, false otherwise.
bool SampleModuleProfile::loadText() {
- std::unique_ptr<MemoryBuffer> Buffer;
- error_code EC = MemoryBuffer::getFile(Filename, Buffer);
- if (EC) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFile(Filename);
+ if (std::error_code EC = BufferOrErr.getError()) {
std::string Msg(EC.message());
M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
return false;
}
+ std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
line_iterator LineIt(*Buffer, '#');
// Read the profile of each function. Since each function may be
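This adopts the ErrorOr-returning MemoryBuffer::getFile overload in place of the old out-parameter form. The general consumption pattern looks like the sketch below (loadFile is a hypothetical wrapper; assumes LLVM headers of this vintage):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/ErrorOr.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include <memory>
    #include <system_error>

    using namespace llvm;

    bool loadFile(StringRef Filename) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
          MemoryBuffer::getFile(Filename);
      if (std::error_code EC = BufferOrErr.getError()) {
        // Report EC.message() through whatever diagnostics channel applies.
        return false;
      }
      // Take ownership only after the error check has passed.
      std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
      return Buffer->getBufferSize() != 0;
    }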
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index f8f828c..edf012d 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeSinkingPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
+ initializeLoadCombinePass(Registry);
}
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 58192fc..e2a24a7 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -1142,8 +1142,8 @@ public:
/// We can do this to a select if its only uses are loads and if the operand to
/// the select can be loaded unconditionally.
static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
- bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
- bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+ bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL);
+ bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL);
for (User *U : SI->users()) {
LoadInst *LI = dyn_cast<LoadInst>(U);
@@ -1226,7 +1226,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
// If this pointer is always safe to load, or if we can prove that there is
// already a load in the block, then we can move the load to the pred block.
- if (InVal->isDereferenceablePointer() ||
+ if (InVal->isDereferenceablePointer(DL) ||
isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL))
continue;
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index b8529e1..62f2026 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -121,41 +121,75 @@ class ConstantOffsetExtractor {
/// numeric value of the extracted constant offset (0 if failed), and a
/// new index representing the remainder (equal to the original index minus
/// the constant offset).
- /// \p Idx The given GEP index
- /// \p NewIdx The new index to replace
- /// \p DL The datalayout of the module
- /// \p IP Calculating the new index requires new instructions. IP indicates
- /// where to insert them (typically right before the GEP).
+ /// \p Idx The given GEP index
+ /// \p NewIdx The new index to replace (output)
+ /// \p DL The datalayout of the module
+ /// \p GEP The given GEP
static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL,
- Instruction *IP);
+ GetElementPtrInst *GEP);
/// Looks for a constant offset without extracting it. The meaning of the
/// arguments and the return value are the same as Extract.
- static int64_t Find(Value *Idx, const DataLayout *DL);
+ static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
private:
ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt)
: DL(Layout), IP(InsertionPt) {}
- /// Searches the expression that computes V for a constant offset. If the
- /// searching is successful, update UserChain as a path from V to the constant
- /// offset.
- int64_t find(Value *V);
- /// A helper function to look into both operands of a binary operator U.
- /// \p IsSub Whether U is a sub operator. If so, we need to negate the
- /// constant offset at some point.
- int64_t findInEitherOperand(User *U, bool IsSub);
- /// After finding the constant offset and how it is reached from the GEP
- /// index, we build a new index which is a clone of the old one except the
- /// constant offset is removed. For example, given (a + (b + 5)) and knowning
- /// the constant offset is 5, this function returns (a + b).
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+ /// V can be reassociated into the form V' + C. If the searching is
+ /// successful, returns C and update UserChain as a def-use chain from C to V;
+ /// otherwise, UserChain is empty.
///
- /// We cannot simply change the constant to zero because the expression that
- /// computes the index or its intermediate result may be used by others.
- Value *rebuildWithoutConstantOffset();
- // A helper function for rebuildWithoutConstantOffset that rebuilds the direct
- // user (U) of the constant offset (C).
- Value *rebuildLeafWithoutConstantOffset(User *U, Value *C);
- /// Returns a clone of U except the first occurrence of From with To.
- Value *cloneAndReplace(User *U, Value *From, Value *To);
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+ /// non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+ /// A helper function to look into both operands of a binary operator.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+ /// After the first step of rebuilding the GEP index without the constant
+ /// offset, distribute s/zext to the operands of all operators in UserChain.
+ /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
/// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0.
bool NoCommonBits(Value *LHS, Value *RHS) const;
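The inbounds special case in CanTraceInto leans on the fact that sext(a + b) = sext(a) + sext(b) whenever the narrow sum is non-negative and one operand is non-negative, even without nsw. An exhaustive i8 check (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int a = -128; a <= 127; ++a) {
        for (int b = 0; b <= 127; ++b) { // b plays the non-negative constant
          int8_t Sum = (int8_t)(a + b);  // the i8 add, possibly wrapping
          if (Sum < 0)
            continue; // inbounds indices are non-negative; skip the rest
          int64_t Lhs = (int64_t)Sum;                             // sext(a + b)
          int64_t Rhs = (int64_t)(int8_t)a + (int64_t)(int8_t)b;  // sext(a) + sext(b)
          assert(Lhs == Rhs);
        }
      }
    }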
@@ -163,20 +197,26 @@ class ConstantOffsetExtractor {
/// \p KnownOne Mask of all bits that are known to be one.
/// \p KnownZero Mask of all bits that are known to be zero.
void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const;
- /// Finds the first use of Used in U. Returns -1 if not found.
- static unsigned FindFirstUse(User *U, Value *Used);
- /// Returns whether OPC (sext or zext) can be distributed to the operands of
- /// BO. e.g., sext can be distributed to the operands of an "add nsw" because
- /// sext (add nsw a, b) == add nsw (sext a), (sext b).
- static bool Distributable(unsigned OPC, BinaryOperator *BO);
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
/// UserChain[2] will be the entire expression "a * b + (c + 5)".
///
- /// This path helps rebuildWithoutConstantOffset rebuild the new GEP index.
+ /// This path helps to rebuild the new GEP index.
SmallVector<User *, 8> UserChain;
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
/// The data layout of the module. Used in ComputeKnownBits.
const DataLayout *DL;
Instruction *IP; /// Insertion position of cloned instructions.
@@ -196,6 +236,15 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
AU.addRequired<DataLayoutPass>();
AU.addRequired<TargetTransformInfo>();
}
+
+ bool doInitialization(Module &M) override {
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ if (DLP == nullptr)
+ report_fatal_error("data layout missing");
+ DL = &DLP->getDataLayout();
+ return false;
+ }
+
bool runOnFunction(Function &F) override;
private:
@@ -206,8 +255,42 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
/// function only inspects the GEP without changing it. The output
/// NeedsExtraction indicates whether we can extract a non-zero constant
/// offset from any index.
- int64_t accumulateByteOffset(GetElementPtrInst *GEP, const DataLayout *DL,
- bool &NeedsExtraction);
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+ /// For each array index that is in the form of zext(a), convert it to sext(a)
+ /// if we can prove zext(a) <= max signed value of typeof(a). We prefer
+ /// sext(a) to zext(a), because in the special case where x + y >= 0 and
+ /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y),
+ /// while no such case exists for zext(x + y).
+ ///
+ /// Note that
+ /// zext(x + y) = zext(x) + zext(y)
+ /// is wrong, e.g.,
+ /// zext i32(UINT_MAX + 1) to i64 !=
+ /// (zext i32 UINT_MAX to i64) + (zext i32 1 to i64)
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in
+ /// split-gep-and-gvn.ll
+ bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP);
+
+ const DataLayout *DL;
};
} // anonymous namespace
@@ -227,181 +310,272 @@ FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
return new SeparateConstOffsetFromGEP();
}
-bool ConstantOffsetExtractor::Distributable(unsigned OPC, BinaryOperator *BO) {
- assert(OPC == Instruction::SExt || OPC == Instruction::ZExt);
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+ // Leveraging this invariant, we can trace into an sext'ed inbounds GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
// sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
// zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
if (BO->getOpcode() == Instruction::Add ||
BO->getOpcode() == Instruction::Sub) {
- return (OPC == Instruction::SExt && BO->hasNoSignedWrap()) ||
- (OPC == Instruction::ZExt && BO->hasNoUnsignedWrap());
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
}
- // sext/zext (and/or/xor A, B) == and/or/xor (sext/zext A), (sext/zext B)
- // -instcombine also leverages this invariant to do the reverse
- // transformation to reduce integer casts.
- return BO->getOpcode() == Instruction::And ||
- BO->getOpcode() == Instruction::Or ||
- BO->getOpcode() == Instruction::Xor;
+ return true;
}
-int64_t ConstantOffsetExtractor::findInEitherOperand(User *U, bool IsSub) {
- assert(U->getNumOperands() == 2);
- int64_t ConstantOffset = find(U->getOperand(0));
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
// If we found a constant offset in the left operand, stop and return that.
// This shortcut might cause us to miss opportunities of combining the
// constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
// However, such cases are probably already handled by -instcombine,
// given this pass runs after the standard optimizations.
if (ConstantOffset != 0) return ConstantOffset;
- ConstantOffset = find(U->getOperand(1));
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
// If U is a sub operator, negate the constant offset found in the right
// operand.
- return IsSub ? -ConstantOffset : ConstantOffset;
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+ return ConstantOffset;
}
-int64_t ConstantOffsetExtractor::find(Value *V) {
- // TODO(jingyue): We can even trace into integer/pointer casts, such as
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
// inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
// integers because it gives good enough results for our benchmarks.
- assert(V->getType()->isIntegerTy());
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+ // We cannot do much with Values that are not a User, such as an Argument.
User *U = dyn_cast<User>(V);
- // We cannot do much with Values that are not a User, such as BasicBlock and
- // MDNode.
- if (U == nullptr) return 0;
+ if (U == nullptr) return APInt(BitWidth, 0);
- int64_t ConstantOffset = 0;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(U)) {
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
// Hooray, we found it!
- ConstantOffset = CI->getSExtValue();
- } else if (Operator *O = dyn_cast<Operator>(U)) {
- // The GEP index may be more complicated than a simple addition of a
- // varaible and a constant. Therefore, we trace into subexpressions for more
- // hoisting opportunities.
- switch (O->getOpcode()) {
- case Instruction::Add: {
- ConstantOffset = findInEitherOperand(U, false);
- break;
- }
- case Instruction::Sub: {
- ConstantOffset = findInEitherOperand(U, true);
- break;
- }
- case Instruction::Or: {
- // If LHS and RHS don't have common bits, (LHS | RHS) is equivalent to
- // (LHS + RHS).
- if (NoCommonBits(U->getOperand(0), U->getOperand(1)))
- ConstantOffset = findInEitherOperand(U, false);
- break;
- }
- case Instruction::SExt:
- case Instruction::ZExt: {
- // We trace into sext/zext if the operator can be distributed to its
- // operand. e.g., we can transform into "sext (add nsw a, 5)" and
- // extract constant 5, because
- // sext (add nsw a, 5) == add nsw (sext a), 5
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0))) {
- if (Distributable(O->getOpcode(), BO))
- ConstantOffset = find(U->getOperand(0));
- }
- break;
- }
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) {
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
}
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
}
- // If we found a non-zero constant offset, adds it to the path for future
- // transformation (rebuildWithoutConstantOffset). Zero is a valid constant
- // offset, but doesn't help this optimization.
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization.
if (ConstantOffset != 0)
UserChain.push_back(U);
return ConstantOffset;
}
-unsigned ConstantOffsetExtractor::FindFirstUse(User *U, Value *Used) {
- for (unsigned I = 0, E = U->getNumOperands(); I < E; ++I) {
- if (U->getOperand(I) == Used)
- return I;
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in use-def order. Therefore, we apply them to V
+ // in reverse order.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone();
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
}
- return -1;
+ return Current;
}
-Value *ConstantOffsetExtractor::cloneAndReplace(User *U, Value *From,
- Value *To) {
- // Finds in U the first use of From. It is safe to ignore future occurrences
- // of From, because findInEitherOperand similarly stops searching the right
- // operand when the first operand has a non-zero constant offset.
- unsigned OpNo = FindFirstUse(U, From);
- assert(OpNo != (unsigned)-1 && "UserChain wasn't built correctly");
-
- // ConstantOffsetExtractor::find only follows Operators (i.e., Instructions
- // and ConstantExprs). Therefore, U is either an Instruction or a
- // ConstantExpr.
- if (Instruction *I = dyn_cast<Instruction>(U)) {
- Instruction *Clone = I->clone();
- Clone->setOperand(OpNo, To);
- Clone->insertBefore(IP);
- return Clone;
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (used to be s/zext) from UserChain.
+ unsigned NewSize = 0;
+ for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+ if (*I != nullptr) {
+ UserChain[NewSize] = *I;
+ NewSize++;
+ }
}
- // cast<Constant>(To) is safe because a ConstantExpr only uses Constants.
- return cast<ConstantExpr>(U)
- ->getWithOperandReplaced(OpNo, cast<Constant>(To));
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
}
-Value *ConstantOffsetExtractor::rebuildLeafWithoutConstantOffset(User *U,
- Value *C) {
- assert(U->getNumOperands() <= 2 &&
- "We didn't trace into any operator with more than 2 operands");
- // If U has only one operand which is the constant offset, removing the
- // constant offset leaves U as a null value.
- if (U->getNumOperands() == 1)
- return Constant::getNullValue(U->getType());
-
- // U->getNumOperands() == 2
- unsigned OpNo = FindFirstUse(U, C); // U->getOperand(OpNo) == C
- assert(OpNo < 2 && "UserChain wasn't built correctly");
- Value *TheOther = U->getOperand(1 - OpNo); // The other operand of U
- // If U = C - X, removing C makes U = -X; otherwise U will simply be X.
- if (!isa<SubOperator>(U) || OpNo == 1)
- return TheOther;
- if (isa<ConstantExpr>(U))
- return ConstantExpr::getNeg(cast<Constant>(TheOther));
- return BinaryOperator::CreateNeg(TheOther, "", IP);
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+ "We only traced into two types of CastInst: sext and zext");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr;
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+ // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
}
-Value *ConstantOffsetExtractor::rebuildWithoutConstantOffset() {
- assert(UserChain.size() > 0 && "you at least found a constant, right?");
- // Start with the constant and go up through UserChain, each time building a
- // clone of the subexpression but with the constant removed.
- // e.g., to build a clone of (a + (b + (c + 5)) but with the 5 removed, we
- // first c, then (b + c), and finally (a + (b + c)).
- //
- // Fast path: if the GEP index is a constant, simply returns 0.
- if (UserChain.size() == 1)
- return ConstantInt::get(UserChain[0]->getType(), 0);
-
- Value *Remainder =
- rebuildLeafWithoutConstantOffset(UserChain[1], UserChain[0]);
- for (size_t I = 2; I < UserChain.size(); ++I)
- Remainder = cloneAndReplace(UserChain[I], UserChain[I - 1], Remainder);
- return Remainder;
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+ // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
+ BO->getName(), IP);
+ }
+
+ // We can reuse BO in this case, because the new expression shares the same
+ // instruction type and BO is used at most once.
+ assert(BO->getNumUses() <= 1 &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+ BO->setOperand(OpNo, NextInChain);
+ BO->setHasNoSignedWrap(false);
+ BO->setHasNoUnsignedWrap(false);
+ // Make sure it appears after all instructions we've inserted so far.
+ BO->moveBefore(IP);
+ return BO;
}
int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx,
const DataLayout *DL,
- Instruction *IP) {
- ConstantOffsetExtractor Extractor(DL, IP);
+ GetElementPtrInst *GEP) {
+ ConstantOffsetExtractor Extractor(DL, GEP);
// Find a non-zero constant offset first.
- int64_t ConstantOffset = Extractor.find(Idx);
- if (ConstantOffset == 0)
- return 0;
- // Then rebuild a new index with the constant removed.
- NewIdx = Extractor.rebuildWithoutConstantOffset();
- return ConstantOffset;
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset != 0) {
+ // Separates the constant offset from the GEP index.
+ NewIdx = Extractor.rebuildWithoutConstOffset();
+ }
+ return ConstantOffset.getSExtValue();
}
-int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL) {
- return ConstantOffsetExtractor(DL, nullptr).find(Idx);
+int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL,
+ GetElementPtrInst *GEP) {
+ // If Idx is an index of an inbounds GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(DL, GEP)
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
}
void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne,
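Both the tracing into "or" and its later rewrite to "add" in removeConstOffset depend on a | b == a + b when the operands share no set bits. A quick illustrative check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t a = 0; a < 256; ++a)
        for (uint32_t b = 0; b < 256; ++b)
          if ((a & b) == 0)           // no common bits: each bit position has
            assert((a | b) == a + b); // at most one 1, so "or" acts like "add"
    }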
@@ -421,8 +595,64 @@ bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const {
return (LHSKnownZero | RHSKnownZero).isAllOnesValue();
}
-int64_t SeparateConstOffsetFromGEP::accumulateByteOffset(
- GetElementPtrInst *GEP, const DataLayout *DL, bool &NeedsExtraction) {
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (isa<SequentialType>(*GTI)) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+bool
+SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) {
+ if (!GEP->isInBounds())
+ return false;
+
+ // TODO: consider alloca
+ GlobalVariable *UnderlyingObject =
+ dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+ if (UnderlyingObject == nullptr)
+ return false;
+
+ uint64_t ObjectSize =
+ DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ bool Changed = false;
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E;
+ ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ if (ZExtInst *Extended = dyn_cast<ZExtInst>(*I)) {
+ unsigned SrcBitWidth =
+ cast<IntegerType>(Extended->getSrcTy())->getBitWidth();
+ // For GEP operand zext(a), if a <= max signed value of typeof(a), then
+ // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is
+ // in bounds, we know a <= ObjectSize, so the condition can be reduced
+ // to ObjectSize <= max signed value of typeof(a).
+ if (ObjectSize <=
+ APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) {
+ *I = new SExtInst(Extended->getOperand(0), Extended->getType(),
+ Extended->getName(), GEP);
+ Changed = true;
+ }
+ }
+ }
+ }
+ return Changed;
+}
+
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
NeedsExtraction = false;
int64_t AccumulativeByteOffset = 0;
gep_type_iterator GTI = gep_type_begin(*GEP);
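convertInBoundsZExtToSExt is justified by the observation that when 0 <= a <= the signed maximum of its type, the sign bit is clear and zext(a) equals sext(a). An exhaustive i8 check (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t v = 0; v <= 255; ++v) {
        uint8_t a = (uint8_t)v;
        uint64_t ZExt = (uint64_t)a;                  // zext i8 a to i64
        uint64_t SExt = (uint64_t)(int64_t)(int8_t)a; // sext i8 a to i64
        // The two agree exactly when a <= 127, i.e. the sign bit is clear.
        assert((ZExt == SExt) == (a <= 127));
      }
    }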
@@ -430,7 +660,7 @@ int64_t SeparateConstOffsetFromGEP::accumulateByteOffset(
if (isa<SequentialType>(*GTI)) {
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), DL);
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP);
if (ConstantOffset != 0) {
NeedsExtraction = true;
// A GEP may have multiple indices. We accumulate the extracted
@@ -455,31 +685,11 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
return false;
bool Changed = false;
+ Changed |= canonicalizeArrayIndicesToPointerSize(GEP);
+ Changed |= convertInBoundsZExtToSExt(GEP);
- // Shortcuts integer casts. Eliminating these explicit casts can make
- // subsequent optimizations more obvious: ConstantOffsetExtractor needn't
- // trace into these casts.
- if (GEP->isInBounds()) {
- // Doing this to inbounds GEPs is safe because their indices are guaranteed
- // to be non-negative and in bounds.
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (isa<SequentialType>(*GTI)) {
- if (Operator *O = dyn_cast<Operator>(GEP->getOperand(I))) {
- if (O->getOpcode() == Instruction::SExt ||
- O->getOpcode() == Instruction::ZExt) {
- GEP->setOperand(I, O->getOperand(0));
- Changed = true;
- }
- }
- }
- }
- }
-
- const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
bool NeedsExtraction;
- int64_t AccumulativeByteOffset =
- accumulateByteOffset(GEP, DL, NeedsExtraction);
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
if (!NeedsExtraction)
return Changed;
@@ -506,30 +716,29 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
assert(NewIdx != nullptr &&
"ConstantOffset != 0 implies NewIdx is set");
GEP->setOperand(I, NewIdx);
- // Clear the inbounds attribute because the new index may be off-bound.
- // e.g.,
- //
- // b = add i64 a, 5
- // addr = gep inbounds float* p, i64 b
- //
- // is transformed to:
- //
- // addr2 = gep float* p, i64 a
- // addr = gep float* addr2, i64 5
- //
- // If a is -4, although the old index b is in bounds, the new index a is
- // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
- // inbounds keyword is not present, the offsets are added to the base
- // address with silently-wrapping two's complement arithmetic".
- // Therefore, the final code will be a semantically equivalent.
- //
- // TODO(jingyue): do some range analysis to keep as many inbounds as
- // possible. GEPs with inbounds are more friendly to alias analysis.
- GEP->setIsInBounds(false);
- Changed = true;
}
}
}
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float* p, i64 a
+ // addr = gep float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+ // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ GEP->setIsInBounds(false);
// Offsets the base with the accumulative byte offset.
//
@@ -562,9 +771,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
Instruction *NewGEP = GEP->clone();
NewGEP->insertBefore(GEP);
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
uint64_t ElementTypeSizeOfGEP =
DL->getTypeAllocSize(GEP->getType()->getElementType());
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
// Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
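At bottom, the whole rewrite peels the accumulated constant out of the index and re-applies it as a trailing GEP, relying on &base[i + c] == &(&base[i])[c] once indices are at pointer width. A C-level sketch of that identity (not the pass itself):

    #include <cassert>
    #include <cstdint>

    int main() {
      float Buf[64] = {};
      int64_t a = 7;
      // Before: addr = gep inbounds float* Buf, i64 (a + 5)
      float *Addr = &Buf[a + 5];
      // After: addr2 = gep float* Buf, i64 a; addr = gep float* addr2, i64 5
      float *Addr2 = &Buf[a];
      assert(Addr == &Addr2[5]);
    }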
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 482c33a..7348c45 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
@@ -34,6 +35,7 @@ namespace {
DominatorTree *DT;
LoopInfo *LI;
AliasAnalysis *AA;
+ const DataLayout *DL;
public:
static char ID; // Pass identification
@@ -98,6 +100,8 @@ bool Sinking::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfo>();
AA = &getAnalysis<AliasAnalysis>();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
bool MadeChange, EverMadeChange = false;
@@ -193,7 +197,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (!isSafeToSpeculativelyExecute(Inst))
+ if (!isSafeToSpeculativelyExecute(Inst, DL))
return false;
// We don't want to sink across a critical edge if we don't dominate the
diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk
index cbd8dd0..2390027 100644
--- a/lib/Transforms/Utils/Android.mk
+++ b/lib/Transforms/Utils/Android.mk
@@ -33,7 +33,6 @@ transforms_utils_SRC_FILES := \
SimplifyIndVar.cpp \
SimplifyInstructions.cpp \
SimplifyLibCalls.cpp \
- SpecialCaseList.cpp \
UnifyFunctionExitNodes.cpp \
Utils.cpp \
ValueMapper.cpp
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index e10ca90..fcf548f 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -33,7 +33,6 @@ add_llvm_library(LLVMTransformUtils
SimplifyIndVar.cpp
SimplifyInstructions.cpp
SimplifyLibCalls.cpp
- SpecialCaseList.cpp
UnifyFunctionExitNodes.cpp
Utils.cpp
ValueMapper.cpp
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index eb67db1..3f75b3e 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -107,7 +107,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
I != E; ++I) {
GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
- if (const GlobalObject *C = I->getAliasee())
+ if (const Constant *C = I->getAliasee())
GA->setAliasee(cast<GlobalObject>(MapValue(C, VMap)));
}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index e01d0c3..f0a9f2b 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -189,6 +189,7 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split,
Invoke.getOuterResumeDest(),
InvokeArgs, CI->getName(), BB);
+ II->setDebugLoc(CI->getDebugLoc());
II->setCallingConv(CI->getCallingConv());
II->setAttributes(CI->getAttributes());
@@ -466,7 +467,13 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
DebugLoc DL = BI->getDebugLoc();
- if (!DL.isUnknown()) {
+ if (DL.isUnknown()) {
+ // If the inlined instruction has no line number, make it look as if it
+ // originates from the call location. This is important for
+ // ((__always_inline__, __nodebug__)) functions which must use caller
+ // location for all instructions in their function body.
+ BI->setDebugLoc(TheCallDL);
+ } else {
BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext()));
if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) {
LLVMContext &Ctx = BI->getContext();
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index f7787da..ef42291 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -50,6 +50,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -473,7 +474,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
/// explicit if they accepted the analysis directly and then updated it.
static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, Pass *PP) {
+ ScalarEvolution *SE, Pass *PP,
+ const DataLayout *DL) {
bool Changed = false;
ReprocessLoop:
@@ -672,7 +674,7 @@ ReprocessLoop:
// The block has now been cleared of all instructions except for
// a comparison and a conditional branch. SimplifyCFG may be able
// to fold it now.
- if (!FoldBranchToCommonDest(BI)) continue;
+ if (!FoldBranchToCommonDest(BI, DL)) continue;
// Success. The block is now dead, so remove it from the loop,
// update the dominator tree and delete it.
@@ -709,7 +711,8 @@ ReprocessLoop:
}
bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
- AliasAnalysis *AA, ScalarEvolution *SE) {
+ AliasAnalysis *AA, ScalarEvolution *SE,
+ const DataLayout *DL) {
bool Changed = false;
// Worklist maintains our depth-first queue of loops in this nest to process.
@@ -726,7 +729,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
}
while (!Worklist.empty())
- Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, SE, PP);
+ Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI,
+ SE, PP, DL);
return Changed;
}
@@ -744,6 +748,7 @@ namespace {
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
+ const DataLayout *DL;
bool runOnFunction(Function &F) override;
@@ -787,10 +792,12 @@ bool LoopSimplify::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = getAnalysisIfAvailable<ScalarEvolution>();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
// Simplify each loop nest in the function.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- Changed |= simplifyLoop(*I, DT, LI, this, AA, SE);
+ Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL);
return Changed;
}
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index d953e30..c86b82c 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/LLVMContext.h"
@@ -242,21 +243,25 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
Twine("completely unrolled loop with ") +
Twine(TripCount) + " iterations");
} else {
+ auto EmitDiag = [&](const Twine &T) {
+ emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc,
+ "unrolled loop by a factor of " + Twine(Count) +
+ T);
+ };
+
DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
<< " by " << Count);
- Twine DiagMsg("unrolled loop by a factor of " + Twine(Count));
if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
- DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip));
+ EmitDiag(" with a breakout at trip " + Twine(BreakoutTrip));
} else if (TripMultiple != 1) {
DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
- DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch");
+ EmitDiag(" with " + Twine(TripMultiple) + " trips per branch");
} else if (RuntimeTripCount) {
DEBUG(dbgs() << " with run-time trip count");
- DiagMsg.concat(" with run-time trip count");
+ EmitDiag(" with run-time trip count");
}
DEBUG(dbgs() << "!\n");
- emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg);
}
bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
@@ -485,8 +490,19 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (!OuterL && !CompletelyUnroll)
OuterL = L;
if (OuterL) {
+ DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>();
+ const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
- simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE);
+ simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL);
+
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop after
+ // deleteLoopFromQueue updates LoopInfo.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ if (!OuterL->contains(LatchLoop))
+ while (OuterL->getParentLoop() != LatchLoop)
+ OuterL = OuterL->getParentLoop();
+
formLCSSARecursively(*OuterL, *DT, SE);
}
}
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5bef091..a96c46a 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -280,17 +280,17 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
SCEVExpander Expander(*SE, "loop-unroll");
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
- Type *CountTy = TripCount->getType();
- BinaryOperator *ModVal =
- BinaryOperator::CreateURem(TripCount,
- ConstantInt::get(CountTy, Count),
- "xtraiter");
- ModVal->insertBefore(PreHeaderBR);
-
- // Check if for no extra iterations, then jump to unrolled loop
- Value *BranchVal = new ICmpInst(PreHeaderBR,
- ICmpInst::ICMP_NE, ModVal,
- ConstantInt::get(CountTy, 0), "lcmp");
+
+ IRBuilder<> B(PreHeaderBR);
+ Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+
+ // If there are no extra iterations, jump to the unrolled loop. We have to
+ // check that the trip count computation didn't overflow when adding one to
+ // the backedge taken count.
+ Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
+ Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
+ Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
+
// Branch to either the extra iterations or the unrolled loop
// We will fix up the true branch label when adding loop body copies
BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
@@ -344,6 +344,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
}
// The comparison w/ the extra iteration value and branch
+ Type *CountTy = TripCount->getType();
Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal,
ConstantInt::get(CountTy, leftOverIters),
"un.tmp");
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 9ef694c..eac693b 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -14,11 +14,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -58,16 +60,18 @@ namespace {
Low(low), High(high), BB(bb) { }
};
- typedef std::vector<CaseRange> CaseVector;
+ typedef std::vector<CaseRange> CaseVector;
typedef std::vector<CaseRange>::iterator CaseItr;
private:
void processSwitchInst(SwitchInst *SI);
- BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val,
- BasicBlock* OrigBlock, BasicBlock* Default);
- BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val,
- BasicBlock* OrigBlock, BasicBlock* Default);
- unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
+ BasicBlock *switchConvert(CaseItr Begin, CaseItr End,
+ ConstantInt *LowerBound, ConstantInt *UpperBound,
+ Value *Val, BasicBlock *OrigBlock,
+ BasicBlock *Default);
+ BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock,
+ BasicBlock *Default);
+ unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
};
/// The comparison function for sorting the switch case values in the vector.
@@ -129,15 +133,26 @@ static raw_ostream& operator<<(raw_ostream &O,
// switchConvert - Convert the switch statement into a binary lookup of
// the case values. The function recursively builds this tree.
-//
-BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
- Value* Val, BasicBlock* OrigBlock,
- BasicBlock* Default)
-{
+// LowerBound and UpperBound are used to keep track of the bounds for Val
+// that have already been checked by a block emitted by one of the previous
+// calls to switchConvert in the call stack.
+BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
+ ConstantInt *LowerBound,
+ ConstantInt *UpperBound, Value *Val,
+ BasicBlock *OrigBlock,
+ BasicBlock *Default) {
unsigned Size = End - Begin;
- if (Size == 1)
+ if (Size == 1) {
+ // Check if the Case Range is perfectly squeezed in between
+ // already checked Upper and Lower bounds. If it is, we can avoid
+ // emitting the code that checks if the value actually falls in the range
+ // because the bounds already tell us so.
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+ return Begin->BB;
+ }
return newLeafBlock(*Begin, Val, OrigBlock, Default);
+ }
unsigned Mid = Size / 2;
std::vector<CaseRange> LHS(Begin, Begin + Mid);
@@ -145,15 +160,50 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
std::vector<CaseRange> RHS(Begin + Mid, End);
DEBUG(dbgs() << "RHS: " << RHS << "\n");
- CaseRange& Pivot = *(Begin + Mid);
- DEBUG(dbgs() << "Pivot ==> "
- << cast<ConstantInt>(Pivot.Low)->getValue() << " -"
- << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+ CaseRange &Pivot = *(Begin + Mid);
+ DEBUG(dbgs() << "Pivot ==> "
+ << cast<ConstantInt>(Pivot.Low)->getValue()
+ << " -" << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+
+ // NewLowerBound here can never be the smallest representable integer,
+ // because it is computed from the pivot case range, which is never the
+ // first range; there is always a case range with a smaller value.
+ ConstantInt *NewLowerBound = cast<ConstantInt>(Pivot.Low);
+ ConstantInt *NewUpperBound;
+
+ // If we don't have a Default block then it means that we can never
+ // have a value outside of a case range, so set the UpperBound to the highest
+ // value in the LHS part of the case ranges.
+ if (Default != nullptr) {
+ // Because NewLowerBound is never the smallest representable integer
+ // it is safe here to subtract one.
+ NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
+ NewLowerBound->getValue() - 1);
+ } else {
+ CaseItr LastLHS = LHS.begin() + LHS.size() - 1;
+ NewUpperBound = cast<ConstantInt>(LastLHS->High);
+ }
- BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val,
- OrigBlock, Default);
- BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val,
- OrigBlock, Default);
+ DEBUG(dbgs() << "LHS Bounds ==> ";
+ if (LowerBound) {
+ dbgs() << cast<ConstantInt>(LowerBound)->getSExtValue();
+ } else {
+ dbgs() << "NONE";
+ }
+ dbgs() << " - " << NewUpperBound->getSExtValue() << "\n";
+ dbgs() << "RHS Bounds ==> ";
+ dbgs() << NewLowerBound->getSExtValue() << " - ";
+ if (UpperBound) {
+ dbgs() << cast<ConstantInt>(UpperBound)->getSExtValue() << "\n";
+ } else {
+ dbgs() << "NONE\n";
+ });
+
+ BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
+ NewUpperBound, Val, OrigBlock, Default);
+ BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
+ UpperBound, Val, OrigBlock, Default);
// Create a new node that checks if the value is < pivot. Go to the
// left branch if it is and right branch if not.
@@ -291,13 +341,19 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
return;
}
+ const bool DefaultIsUnreachable =
+ Default->size() == 1 && isa<UnreachableInst>(Default->getTerminator());
// Create a new, empty default block so that the new hierarchy of
// if-then statements go to this and the PHI nodes are happy.
- BasicBlock* NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
- F->getBasicBlockList().insert(Default, NewDefault);
-
- BranchInst::Create(Default, NewDefault);
-
+ // If the default block is unreachable we avoid creating one
+ // because it will never be a valid target.
+ BasicBlock *NewDefault = nullptr;
+ if (!DefaultIsUnreachable) {
+ NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+ F->getBasicBlockList().insert(Default, NewDefault);
+
+ BranchInst::Create(Default, NewDefault);
+ }
// If there is an entry in any PHI nodes for the default edge, make sure
// to update them as well.
for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
@@ -316,12 +372,31 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
DEBUG(dbgs() << "Cases: " << Cases << "\n");
(void)numCmps;
- BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val,
- OrigBlock, NewDefault);
+ ConstantInt *UpperBound = nullptr;
+ ConstantInt *LowerBound = nullptr;
+
+ // Optimize the condition where Default is an unreachable block. In this case
+ // we can make the bounds tightly fitted around the case value ranges,
+ // because we know that the value passed to the switch should always be
+ // exactly one of the case values.
+ if (DefaultIsUnreachable) {
+ CaseItr LastCase = Cases.begin() + Cases.size() - 1;
+ UpperBound = cast<ConstantInt>(LastCase->High);
+ LowerBound = cast<ConstantInt>(Cases.begin()->Low);
+ }
+ BasicBlock *SwitchBlock =
+ switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
+ OrigBlock, NewDefault);
// Branch to our shiny new if-then stuff...
BranchInst::Create(SwitchBlock, OrigBlock);
// We are now done with the switch instruction, delete it.
CurBlock->getInstList().erase(SI);
+
+ pred_iterator PI = pred_begin(Default), E = pred_end(Default);
+ // If the Default block has no more predecessors, just remove it.
+ if (PI == E) {
+ DeleteDeadBlock(Default);
+ }
}
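
The payoff of the unreachable-default handling is easiest to see in source form. A hypothetical C++ rendering of a switch over cases 1, 2 and 3 whose default is unreachable (values and the function name are made up for illustration):

```cpp
// With LowerBound = 1 and UpperBound = 3 known from the case values, the
// emitted comparison tree needs no out-of-range test and no default block.
int lowered(int v) {
  if (v < 2)
    return 10; // case 1: bounds [LowerBound, NewUpperBound] = [1, 1]
  if (v < 3)
    return 20; // case 2
  return 30;   // case 3: bounds [NewLowerBound, UpperBound] = [3, 3]
}
```
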
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 150dbdd..960b198 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -201,8 +201,8 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the
/// given instruction, which is assumed to be safe to speculate. 1 means
/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive.
-static unsigned ComputeSpeculationCost(const User *I) {
- assert(isSafeToSpeculativelyExecute(I) &&
+static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL) {
+ assert(isSafeToSpeculativelyExecute(I, DL) &&
"Instruction is not safe to speculatively execute!");
switch (Operator::getOpcode(I)) {
default:
@@ -227,6 +227,9 @@ static unsigned ComputeSpeculationCost(const User *I) {
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
+ case Instruction::BitCast:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
return 1; // These are all cheap.
case Instruction::Call:
@@ -254,7 +257,8 @@ static unsigned ComputeSpeculationCost(const User *I) {
/// CostRemaining, false is returned and CostRemaining is undefined.
static bool DominatesMergePoint(Value *V, BasicBlock *BB,
SmallPtrSet<Instruction*, 4> *AggressiveInsts,
- unsigned &CostRemaining) {
+ unsigned &CostRemaining,
+ const DataLayout *DL) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
// Non-instructions all dominate instructions, but not all constantexprs
@@ -287,10 +291,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// Okay, it looks like the instruction IS in the "condition". Check to
// see if it's a cheap instruction to unconditionally compute, and if it
// only uses stuff defined outside of the condition. If so, hoist it out.
- if (!isSafeToSpeculativelyExecute(I))
+ if (!isSafeToSpeculativelyExecute(I, DL))
return false;
- unsigned Cost = ComputeSpeculationCost(I);
+ unsigned Cost = ComputeSpeculationCost(I, DL);
if (Cost > CostRemaining)
return false;
@@ -300,7 +304,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// Okay, we can only really hoist these out if their operands do
// not take us over the cost threshold.
for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
- if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining))
+ if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL))
return false;
// Okay, it's safe to do this! Remember this instruction.
AggressiveInsts->insert(I);
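
DominatesMergePoint charges each hoisted instruction against a shared budget and recurses into its operands, so one expensive subtree exhausts the allowance for that side of the diamond. A minimal sketch of the budget recursion, using an invented `Node` type rather than LLVM's `Instruction`:

```cpp
#include <vector>

struct Node {
  unsigned Cost;               // 1 = cheap, 2 = less cheap, etc.
  std::vector<Node *> Operands;
};

// Returns true if the whole subtree fits in the remaining budget; the
// budget is mutated in place, mirroring the CostRemaining reference.
static bool fitsBudget(Node *N, unsigned &CostRemaining) {
  if (N->Cost > CostRemaining)
    return false;
  CostRemaining -= N->Cost;
  for (Node *Op : N->Operands)
    if (!fitsBudget(Op, CostRemaining))
      return false;
  return true;
}
```
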
@@ -994,7 +998,7 @@ static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
/// BB2, hoist any common code in the two blocks up into the branch block. The
/// caller of this function guarantees that BI's block dominates BB1 and BB2.
-static bool HoistThenElseCodeToIf(BranchInst *BI) {
+static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
// O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
@@ -1068,9 +1072,9 @@ HoistTerminator:
if (BB1V == BB2V)
continue;
- if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+ if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL))
return Changed;
- if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+ if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL))
return Changed;
}
}
@@ -1387,7 +1391,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
/// \endcode
///
/// \returns true if the conditional block is removed.
-static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const DataLayout *DL) {
// Be conservative for now. FP select instruction can often be expensive.
Value *BrCond = BI->getCondition();
if (isa<FCmpInst>(BrCond))
@@ -1430,13 +1435,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
return false;
// Don't hoist the instruction if it's unsafe or expensive.
- if (!isSafeToSpeculativelyExecute(I) &&
+ if (!isSafeToSpeculativelyExecute(I, DL) &&
!(HoistCondStores &&
(SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB,
EndBB))))
return false;
if (!SpeculatedStoreValue &&
- ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
+ ComputeSpeculationCost(I, DL) > PHINodeFoldingThreshold)
return false;
// Store the store speculation candidate.
@@ -1487,11 +1492,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
if (!OrigCE && !ThenCE)
continue; // Known safe and cheap.
- if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
- (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
+ if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) ||
+ (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL)))
return false;
- unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0;
- unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0;
+ unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL) : 0;
+ unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL) : 0;
if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold)
return false;
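
In source terms, what SpeculativelyExecuteBB produces when the checks pass is unconditional execution of the then-block plus a select in place of the phi. A hypothetical before/after in C++:

```cpp
// Before: "if (c) t = x + 1;" with a phi merging t and y afterwards.
// After speculation (safe, cheap, under the folding threshold):
int speculate(bool c, int x, int y) {
  int t = x + 1;    // hoisted from the conditional block
  return c ? t : y; // a select replaces the phi
}
```
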
@@ -1738,9 +1743,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) {
}
if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
- MaxCostVal0) ||
+ MaxCostVal0, DL) ||
!DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
- MaxCostVal1))
+ MaxCostVal1, DL))
return false;
}
@@ -1958,7 +1963,7 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
/// predecessor branches to us and one of our successors, fold the block into
/// the predecessor and use logical operations to pick the right destination.
-bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
+bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL) {
BasicBlock *BB = BI->getParent();
Instruction *Cond = nullptr;
@@ -2010,7 +2015,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
Instruction *BonusInst = nullptr;
if (&*FrontIt != Cond &&
FrontIt->hasOneUse() && FrontIt->user_back() == Cond &&
- isSafeToSpeculativelyExecute(FrontIt)) {
+ isSafeToSpeculativelyExecute(FrontIt, DL)) {
BonusInst = &*FrontIt;
++FrontIt;
@@ -2025,7 +2030,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = Cond; ++CondIt;
- // Ingore dbg intrinsics.
+ // Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
if (&*CondIt != BI)
@@ -2340,7 +2345,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
}
// If this is a conditional branch in an empty block, and if any
- // predecessors is a conditional branch to one of our destinations,
+ // predecessor is a conditional branch to one of our destinations,
// fold the conditions into logical ops and one cond br.
BasicBlock::iterator BBI = BB->begin();
// Ignore dbg intrinsics.
@@ -2375,16 +2380,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
// Do not perform this transformation if it would require
// insertion of a large number of select instructions. For targets
// without predication/cmovs, this is a big pessimization.
- BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+ // Also do not perform this transformation if any phi node in the common
+ // destination block can trap when reached by BB or PBB (PR17073). In that
+ // case, it would be unsafe to hoist the operation into a select instruction.
+
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
unsigned NumPhis = 0;
for (BasicBlock::iterator II = CommonDest->begin();
- isa<PHINode>(II); ++II, ++NumPhis)
+ isa<PHINode>(II); ++II, ++NumPhis) {
if (NumPhis > 2) // Disable this xform.
return false;
+ PHINode *PN = cast<PHINode>(II);
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
+ if (CE->canTrap())
+ return false;
+
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
+ if (CE->canTrap())
+ return false;
+ }
+
// Finally, if everything is ok, fold the branches to logical ops.
- BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
<< "AND: " << *BI->getParent());
@@ -3308,6 +3330,11 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
/// ValidLookupTableConstant - Return true if the backend will be able to handle
/// initializing an array of constants like C.
static bool ValidLookupTableConstant(Constant *C) {
+ if (C->isThreadDependent())
+ return false;
+ if (C->isDLLImportDependent())
+ return false;
+
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
return CE->isGEPWithNoNotionalOverIndexing();
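
A hypothetical source-level example of why thread-dependent constants are now rejected: a constant whose bit pattern differs per thread cannot be baked into a read-only lookup table.

```cpp
thread_local int Counter;

// Turning this switch into a constant table would have to store the
// address of Counter, which is different in every thread -- exactly
// what the isThreadDependent() check refuses.
int *select(int x) {
  switch (x) {
  case 0: return &Counter;
  case 1: return nullptr;
  default: return nullptr;
  }
}
```
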
@@ -3521,7 +3548,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M,
// Fill in any holes in the table with the default result.
if (Values.size() < TableSize) {
- assert(DefaultValue && "Need a default value to fill the lookup table holes.");
+ assert(DefaultValue &&
+ "Need a default value to fill the lookup table holes.");
assert(DefaultValue->getType() == ValueType);
for (uint64_t I = 0; I < TableSize; ++I) {
if (!TableContents[I])
@@ -3990,7 +4018,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
// branches to us and our successor, fold the comparison into the
// predecessor and use logical operations to update the incoming value
// for PHI nodes in common successor.
- if (FoldBranchToCommonDest(BI))
+ if (FoldBranchToCommonDest(BI, DL))
return SimplifyCFG(BB, TTI, DL) | true;
return false;
}
@@ -4034,7 +4062,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
- if (FoldBranchToCommonDest(BI))
+ if (FoldBranchToCommonDest(BI, DL))
return SimplifyCFG(BB, TTI, DL) | true;
// We have a conditional branch to two blocks that are only reachable
@@ -4043,24 +4071,24 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
- if (HoistThenElseCodeToIf(BI))
+ if (HoistThenElseCodeToIf(BI, DL))
return SimplifyCFG(BB, TTI, DL) | true;
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
- // execute Successor #0 if it branches to successor #1.
+ // execute Successor #0 if it branches to Successor #1.
TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL))
return SimplifyCFG(BB, TTI, DL) | true;
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
- // execute Successor #1 if it branches to successor #0.
+ // execute Successor #1 if it branches to Successor #0.
TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL))
return SimplifyCFG(BB, TTI, DL) | true;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34d8a10..cb8a41d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -209,6 +209,29 @@ namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class Report {
+ std::string Message;
+ raw_string_ostream Out;
+ Instruction *Instr;
+
+public:
+ Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
+ Out << "loop not vectorized: ";
+ }
+
+ template <typename A> Report &operator<<(const A &Value) {
+ Out << Value;
+ return *this;
+ }
+
+ Instruction *getInstr() { return Instr; }
+
+ std::string &str() { return Out.str(); }
+ operator Twine() { return Out.str(); }
+};
+
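
A standalone analog of the Report helper above, runnable outside LLVM, showing the ostream-accumulator pattern it relies on (class and message text are illustrative):

```cpp
#include <iostream>
#include <sstream>
#include <string>

// The streaming operator returns *this so a message can be composed
// inline at the emit site, just like Report() << "..." in the patch.
class MiniReport {
  std::ostringstream Out;
public:
  MiniReport() { Out << "loop not vectorized: "; }
  template <typename T> MiniReport &operator<<(const T &V) {
    Out << V;
    return *this;
  }
  std::string str() { return Out.str(); }
};

int main() {
  MiniReport R;
  R << "cannot identify array bounds (" << 3 << " pointers)";
  std::cout << R.str() << "\n";
}
```
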
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -515,10 +538,12 @@ public:
unsigned NumPredStores;
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
- DominatorTree *DT, TargetLibraryInfo *TLI)
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ Function *F)
: NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
- DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
- HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
+ DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
+ }
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -747,6 +772,16 @@ private:
/// invariant.
void collectStridedAcccess(Value *LoadOrStoreInst);
+ /// Report an analysis message to assist the user in diagnosing loops that are
+ /// not vectorized.
+ void emitAnalysis(Report &Message) {
+ DebugLoc DL = TheLoop->getStartLoc();
+ if (Instruction *I = Message.getInstr())
+ DL = I->getDebugLoc();
+ emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+ *TheFunction, DL, Message.str());
+ }
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -757,6 +792,8 @@ private:
DominatorTree *DT;
/// Target Library Info.
TargetLibraryInfo *TLI;
+ /// Parent function
+ Function *TheFunction;
// --- vectorization state --- //
@@ -906,7 +943,7 @@ public:
}
/// Return the loop vectorizer metadata prefix.
- static StringRef Prefix() { return "llvm.vectorizer."; }
+ static StringRef Prefix() { return "llvm.loop.vectorize."; }
MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
SmallVector<Value*, 2> Vals;
@@ -942,6 +979,29 @@ public:
LoopID = NewLoopID;
}
+ std::string emitRemark() const {
+ Report R;
+ R << "vectorization ";
+ switch (Force) {
+ case LoopVectorizeHints::FK_Disabled:
+ R << "is explicitly disabled";
+ break;
+ case LoopVectorizeHints::FK_Enabled:
+ R << "is explicitly enabled";
+ if (Width != 0 && Unroll != 0)
+ R << " with width " << Width << " and interleave count " << Unroll;
+ else if (Width != 0)
+ R << " with width " << Width;
+ else if (Unroll != 0)
+ R << " with interleave count " << Unroll;
+ break;
+ case LoopVectorizeHints::FK_Undefined:
+ R << "was not specified";
+ break;
+ }
+ return R.str();
+ }
+
unsigned getWidth() const { return Width; }
unsigned getUnroll() const { return Unroll; }
enum ForceKind getForce() const { return Force; }
@@ -1125,18 +1185,37 @@ struct LoopVectorize : public FunctionPass {
: "?")) << " width=" << Hints.getWidth()
<< " unroll=" << Hints.getUnroll() << "\n");
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+ // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+ // less verbose, reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "loop not vectorized: vector width and interleave count are "
+ "explicitly set to 1");
return false;
}
@@ -1151,14 +1230,19 @@ struct LoopVectorize : public FunctionPass {
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "vectorization is not beneficial and is not explicitly forced");
return false;
}
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
@@ -1167,7 +1251,6 @@ struct LoopVectorize : public FunctionPass {
// Check the function attributes to find out if this function should be
// optimized for size.
- Function *F = L->getHeader()->getParent();
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
F->hasFnAttribute(Attribute::OptimizeForSize);
@@ -1190,6 +1273,11 @@ struct LoopVectorize : public FunctionPass {
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "loop not vectorized due to NoImplicitFloat attribute");
+ emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+ L->getStartLoc(), Hints.emitRemark());
return false;
}
@@ -1208,9 +1296,14 @@ struct LoopVectorize : public FunctionPass {
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- if (UF == 1)
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+
+ if (UF == 1) {
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+ "not beneficial to vectorize and user disabled interleaving");
return false;
+ }
DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
// Report the unrolling decision.
@@ -1220,6 +1313,7 @@ struct LoopVectorize : public FunctionPass {
" (vectorization not beneficial)"));
// We decided not to vectorize, but we may want to unroll.
+
InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
Unroller.vectorize(&LVL);
} else {
@@ -1909,20 +2003,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass (may consist of multiple blocks).
- / |
- / v
- | [ ] <-- vector pre header.
- | |
- | v
- | [ ] \
- | [ ]_| <-- vector loop.
- | |
- \ v
- >[ ] <--- middle-block.
- / |
- / v
- | [ ] <--- new preheader.
+ [ ] <-- Back-edge taken count overflow check.
+ / |
+ / v
+ | [ ] <-- vector loop bypass (may consist of multiple blocks).
+ | / |
+ | / v
+ || [ ] <-- vector pre header.
+ || |
+ || v
+ || [ ] \
+ || [ ]_| <-- vector loop.
+ || |
+ | \ v
+ | >[ ] <--- middle-block.
+ | / |
+ | / v
+ -|- >[ ] <--- new preheader.
| |
| v
| [ ] \
@@ -1936,6 +2033,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(BypassBlock && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
// Some loops have a single integer induction variable, while other loops
@@ -1958,18 +2056,30 @@ void InnerLoopVectorizer::createEmptyLoop() {
IdxTy->getPrimitiveSizeInBits())
ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
- ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
+ const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
// Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(ExitCount->getType(), 1));
+ ExitCount = SE->getAddExpr(BackedgeTakeCount,
+ SE->getConstant(BackedgeTakeCount->getType(), 1));
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- BypassBlock->getTerminator());
+ // We need to test whether the backedge-taken count is uint##_max. Adding one
+ // to it will cause overflow and an incorrect loop trip count in the vector
+ // body. In case of overflow we want to directly jump to the scalar remainder
+ // loop.
+ Value *BackedgeCount =
+ Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+ BypassBlock->getTerminator());
+ if (BackedgeCount->getType()->isPointerTy())
+ BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+ "backedge.ptrcnt.to.int",
+ BypassBlock->getTerminator());
+ Instruction *CheckBCOverflow =
+ CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+ Constant::getAllOnesValue(BackedgeCount->getType()),
+ "backedge.overflow", BypassBlock->getTerminator());
// The loop index does not have to start at Zero. Find the original start
// value from the induction PHI node. If we don't have an induction variable
@@ -1980,7 +2090,18 @@ void InnerLoopVectorizer::createEmptyLoop() {
IdxTy):
ConstantInt::get(IdxTy, 0);
- assert(BypassBlock && "Invalid loop structure");
+ // We need an instruction to anchor the overflow check on. StartIdx needs to
+ // be defined before the overflow check branch because the scalar preheader
+ // is going to merge the start index, so the overflow check block needs to
+ // contain a definition of the start index.
+ Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
+ StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
+ BypassBlock->getTerminator());
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ BypassBlock->getTerminator());
+
LoopBypassBlocks.push_back(BypassBlock);
// Split the single block loop into the two loop structure described above.
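
The overflow check built above compares the backedge-taken count against the all-ones value. Modeled on plain scalar integers (a sketch, not the SCEV-based code):

```cpp
#include <cstdint>
#include <limits>

// The trip count is the backedge-taken count plus one, which wraps to
// zero when the count is already the maximum value; that case must
// branch straight to the scalar remainder loop.
static bool backedgeCountOverflows(uint64_t BTC) {
  return BTC == std::numeric_limits<uint64_t>::max(); // the all-ones test
}

static uint64_t tripCount(uint64_t BTC) {
  return BTC + 1; // only meaningful when !backedgeCountOverflows(BTC)
}
```
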
@@ -2049,29 +2170,45 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop.
- Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
- "cmp.zero");
+ Value *Cmp =
+ BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
BasicBlock *LastBypassBlock = BypassBlock;
+ // Generate code to check that the loop's trip count, computed by adding
+ // one to the backedge-taken count, will not overflow.
+ {
+ auto PastOverflowCheck =
+ std::next(BasicBlock::iterator(OverflowCheckAnchor));
+ BasicBlock *CheckBlock =
+ LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+ LoopBypassBlocks.push_back(CheckBlock);
+ Instruction *OldTerm = LastBypassBlock->getTerminator();
+ BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+ OldTerm->eraseFromParent();
+ LastBypassBlock = CheckBlock;
+ }
+
// Generate the code to check that the strides we assumed to be one are really
// one. We want the new basic block to start at the first instruction in a
// sequence of instructions that form a check.
Instruction *StrideCheck;
Instruction *FirstCheckInst;
std::tie(FirstCheckInst, StrideCheck) =
- addStrideCheck(BypassBlock->getTerminator());
+ addStrideCheck(LastBypassBlock->getTerminator());
if (StrideCheck) {
// Create a new block containing the stride check.
BasicBlock *CheckBlock =
- BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+ LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
if (ParentLoop)
ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
LoopBypassBlocks.push_back(CheckBlock);
// Replace the branch into the memory check block with a conditional branch
// for the "few elements case".
- Instruction *OldTerm = BypassBlock->getTerminator();
+ Instruction *OldTerm = LastBypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
@@ -2134,6 +2271,19 @@ void InnerLoopVectorizer::createEmptyLoop() {
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
MiddleBlock->getTerminator()) : nullptr;
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+ ScalarPH->getTerminator());
+ BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+ PHINode *BCTruncResumeVal = nullptr;
+ if (OrigPhi == OldInduction) {
+ BCTruncResumeVal =
+ PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+ ScalarPH->getTerminator());
+ BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+ }
+
Value *EndValue = nullptr;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
@@ -2150,10 +2300,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
TruncResumeVal->addIncoming(EndValue, VecBody);
+ BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+
// We know what the end value is.
EndValue = IdxEndRoundDown;
// We also know which PHI node holds it.
@@ -2199,7 +2351,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
if (OrigPhi == OldInduction)
ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
else
@@ -2209,11 +2361,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
- // The old inductions phi node in the scalar body needs the truncated value.
- if (OrigPhi == OldInduction)
- OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
- else
- OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ if (OrigPhi == OldInduction) {
+ BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
+ OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
+ } else {
+ BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+ OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+ }
}
// If we are generating a new induction variable then we also need to
@@ -2224,7 +2381,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
@@ -2494,7 +2651,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
// To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -2568,7 +2725,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part],
LoopVectorBody.back());
@@ -2626,6 +2783,13 @@ void InnerLoopVectorizer::vectorizeLoop() {
Builder.getInt32(0));
}
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
@@ -2655,7 +2819,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
+ (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
@@ -3062,9 +3226,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
scalarizeInstruction(it);
break;
default:
+ bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if (HasScalarOpd && i == 1) {
+ Args.push_back(CI->getArgOperand(i));
+ continue;
+ }
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
@@ -3112,8 +3281,8 @@ void InnerLoopVectorizer::updateAnalysis() {
}
}
- DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
- DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+ DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -3138,8 +3307,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion)
+ if (!EnableIfConversion) {
+ emitAnalysis(Report() << "if-conversion is disabled");
return false;
+ }
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
@@ -3169,16 +3340,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
BasicBlock *BB = *BI;
// We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator()))
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "loop contains a switch statement");
return false;
+ }
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes))
+ if (!blockCanBePredicated(BB, SafePointes)) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
- } else if (BB != Header && !canIfConvertPHINodes(BB))
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ emitAnalysis(Report(BB->getTerminator())
+ << "control flow cannot be substituted for a select");
return false;
-
+ }
}
// We can if-convert this loop.
@@ -3188,20 +3367,31 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
bool LoopVectorizationLegality::canVectorize() {
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
- if (!TheLoop->getLoopPreheader())
+ if (!TheLoop->getLoopPreheader()) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We can only vectorize innermost loops.
- if (TheLoop->getSubLoopsVector().size())
+ if (TheLoop->getSubLoopsVector().size()) {
+ emitAnalysis(Report() << "loop is not the innermost loop");
return false;
+ }
// We must have a single backedge.
- if (TheLoop->getNumBackEdges() != 1)
+ if (TheLoop->getNumBackEdges() != 1) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We must have a single exiting block.
- if (!TheLoop->getExitingBlock())
+ if (!TheLoop->getExitingBlock()) {
+ emitAnalysis(
+ Report() << "loop control flow is not understood by vectorizer");
return false;
+ }
// We need to have a loop header.
DEBUG(dbgs() << "LV: Found a loop: " <<
@@ -3217,6 +3407,7 @@ bool LoopVectorizationLegality::canVectorize() {
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
+ emitAnalysis(Report() << "could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
@@ -3310,6 +3501,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
+ emitAnalysis(Report(it)
+ << "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
@@ -3320,13 +3513,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (*bb != Header) {
// Check that this instruction has no outside users or is an
// identified reduction value with an outside user.
- if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
continue;
+ emitAnalysis(Report(it) << "value that could not be identified as "
+ "reduction is used outside the loop");
return false;
}
// We only allow if-converted PHIs with more than two incoming values.
if (Phi->getNumIncomingValues() != 2) {
+ emitAnalysis(Report(it)
+ << "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
@@ -3357,8 +3554,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Until we explicitly handle the case of an induction variable with
// an outside loop user we have to give up vectorizing this loop.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+ emitAnalysis(Report(it) << "use of induction value outside of the "
+ "loop is not handled by vectorizer");
return false;
+ }
continue;
}
@@ -3401,6 +3601,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
+ emitAnalysis(Report(it) << "unvectorizable operation");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}// end of PHI handling
@@ -3409,14 +3610,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// calls and we do handle certain intrinsic and libm functions.
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+ emitAnalysis(Report(it) << "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
+ // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
+ // second argument is the same (i.e. loop invariant).
+ if (CI &&
+ hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+ if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+ emitAnalysis(Report(it)
+ << "intrinsic instruction cannot be vectorized");
+ DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+ return false;
+ }
+ }
+
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ emitAnalysis(Report(it)
+ << "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
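
A source-level model of the new powi/cttz/ctlz restriction (using std::pow as a stand-in for @llvm.powi; names are illustrative): the second operand becomes a single scalar shared by every vector lane, so it must be loop invariant.

```cpp
#include <cmath>

void powiAll(float *A, int N, int Exp) {
  for (int i = 0; i < N; ++i)
    A[i] = static_cast<float>(std::pow(A[i], Exp)); // one exponent for all
                                                    // lanes: vectorizable
}
```

Had the exponent been `i` instead of `Exp`, the operand would vary per iteration and the loop would be rejected with the "intrinsic instruction cannot be vectorized" remark above.
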
@@ -3424,8 +3640,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Check that the stored type is vectorizable.
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T))
+ if (!VectorType::isValidElementType(T)) {
+ emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
return false;
+ }
if (EnableMemAccessVersioning)
collectStridedAcccess(ST);
}
@@ -3436,8 +3654,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+ emitAnalysis(Report(it) << "value cannot be used outside the loop");
return false;
+ }
} // next instr.
@@ -3445,8 +3665,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- if (Inductions.empty())
+ if (Inductions.empty()) {
+ emitAnalysis(Report()
+ << "loop induction variable could not be identified");
return false;
+ }
}
return true;
@@ -4353,8 +4576,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
continue;
LoadInst *Ld = dyn_cast<LoadInst>(it);
- if (!Ld) return false;
- if (!Ld->isSimple() && !IsAnnotatedParallel) {
+ if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+ emitAnalysis(Report(Ld)
+ << "read with atomic ordering or volatile read");
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
@@ -4367,8 +4591,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Save 'store' instructions. Abort if other instructions write to memory.
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
- if (!St) return false;
+ if (!St) {
+ emitAnalysis(Report(it) << "instruction cannot be vectorized");
+ return false;
+ }
if (!St->isSimple() && !IsAnnotatedParallel) {
+ emitAnalysis(Report(St)
+ << "write with atomic ordering or volatile write");
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
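
What the new "atomic ordering or volatile" remarks refer to, in hypothetical source terms: a volatile access is not a simple load or store, so the loop stays scalar unless it is annotated parallel.

```cpp
int sumVolatile(volatile int *A, int N) {
  int S = 0;
  for (int i = 0; i < N; ++i)
    S += A[i]; // non-simple (volatile) load: vectorization is refused
  return S;
}
```
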
@@ -4405,6 +4634,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
Value* Ptr = ST->getPointerOperand();
if (isUniform(Ptr)) {
+ emitAnalysis(
+ Report(ST)
+ << "write to a loop invariant address could not be vectorized");
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
@@ -4483,6 +4715,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
if (NeedRTCheck && !CanDoRT) {
+ emitAnalysis(Report() << "cannot identify array bounds");
DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
"the array bounds.\n");
PtrRtCheck.reset();
@@ -4513,6 +4746,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
// Check that we did not collect too many pointers or found an unsizeable
// pointer.
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+ if (!CanDoRT && NumComparisons > 0)
+ emitAnalysis(Report()
+ << "cannot check memory dependencies at runtime");
+ else
+ emitAnalysis(Report()
+ << NumComparisons << " exceeds limit of "
+ << RuntimeMemoryCheckThreshold
+ << " dependent memory operations checked at runtime");
DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
PtrRtCheck.reset();
return false;
@@ -4522,6 +4763,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}
}
+ if (!CanVecMem)
+ emitAnalysis(Report() << "unsafe dependent memory operations in loop");
+
DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
" need a runtime memory check.\n");
@@ -5774,4 +6018,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
return Builder.CreateAdd(Val, C, "induction");
}
-
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e13ba95..53a43d9 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -149,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
+/// \returns the opcode that can be combined with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+ switch (Op) {
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ default:
+ return 0;
+ }
+}
+
+/// \returns true if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+ if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+ Op == Instruction::Sub || Op == Instruction::Add)
+ return true;
+ return false;
+}
+
+/// \returns ShuffleVector instruction if instructions in \p VL form an
+/// alternating fadd/fsub or add/sub sequence
+/// (e.g. opcodes of fadd, fsub, fadd, fsub, ...).
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Opcode = I0->getOpcode();
+ unsigned AltOpcode = getAltOpcode(Opcode);
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+ return 0;
+ }
+ return Instruction::ShuffleVector;
+}
+
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || Opcode != I->getOpcode())
+ if (!I || Opcode != I->getOpcode()) {
+ if (canCombineAsAltInst(Opcode) && i == 1)
+ return isAltInst(VL);
return 0;
+ }
}
return Opcode;
}
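
The alternating-opcode test in isAltInst above can be modeled without LLVM types. A standalone sketch: even positions must carry the base opcode and odd positions its alternate (add, sub, add, sub), which is the shape an addsub-style shuffle can later absorb.

```cpp
#include <cstddef>
#include <vector>

enum Op { Add, Sub };

static bool isAlternating(const std::vector<Op> &Ops, Op Base, Op Alt) {
  for (std::size_t i = 0; i < Ops.size(); ++i)
    if (Ops[i] != ((i & 1) ? Alt : Base)) // odd index -> alternate opcode
      return false;
  return true;
}
```
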
@@ -377,6 +422,7 @@ public:
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
+
private:
struct TreeEntry;
@@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool SameTy = getSameType(VL); (void)SameTy;
+ bool isAltShuffle = false;
assert(SameTy && "Invalid types!");
if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
newTreeEntry(VL, false);
return;
}
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check that this shuffle vector refers to the alternate
+ // sequence of opcodes.
+ if (Opcode == Instruction::ShuffleVector) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Op = I0->getOpcode();
+ if (Op != Instruction::ShuffleVector)
+ isAltShuffle = true;
+ }
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
- !getSameOpcode(VL)) {
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
@@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned Opcode = getSameOpcode(VL);
-
// Check if it is safe to sink the loads or the stores.
if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
Instruction *Last = getLastInstruction(VL);
@@ -914,8 +968,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ BasicBlock *LeftBB = getSameBlock(Left);
+ BasicBlock *RightBB = getSameBlock(Right);
+ // If we have common uses on separate paths in the tree, make sure we
+ // process the one with greater common depth first.
+ // We can use block numbering to determine the subtree traversal, as the
+ // earlier user has to come between the common use and the later user.
+ if (LeftBB && RightBB && LeftBB == RightBB &&
+ getLastIndex(Right) > getLastIndex(Left)) {
+ buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1);
+ } else {
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ }
return;
}
@@ -929,6 +995,51 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
return;
}
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
case Instruction::Store: {
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
@@ -961,9 +1072,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
-
Function *Int = CI->getCalledFunction();
-
+ Value *A1I = nullptr;
+ if (hasVectorInstrinsicScalarOpd(ID, 1))
+ A1I = CI->getArgOperand(1);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
@@ -973,6 +1085,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
<< "\n");
return;
}
+ // ctlz, cttz and powi are special intrinsics whose second argument
+ // should be the same in order for them to be vectorized.
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ Value *A1J = CI2->getArgOperand(1);
+ if (A1I != A1J) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument "<< A1I<<"!=" << A1J
+ << "\n");
+ return;
+ }
+ }
}
newTreeEntry(VL, true);
@@ -987,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
return;
}
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!isAltShuffle) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
default:
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1010,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return getGatherCost(E->Scalars);
}
-
- assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
- "Invalid VL");
+ unsigned Opcode = getSameOpcode(VL);
+ assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
- unsigned Opcode = VL0->getOpcode();
switch (Opcode) {
case Instruction::PHI: {
return 0;
@@ -1121,6 +1263,20 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return VecCost - ScalarCost;
}
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+ return VecCost - ScalarCost;
+ }
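
The pricing above reduces to simple arithmetic, sketched here with invented parameter names: the scalar side pays one address add per element, the vector side a single vector add.

```cpp
// Negative return values favor vectorizing the GEP bundle.
static int gepCostDelta(int NumElements, int ScalarAddCost,
                        int VectorAddCost) {
  int ScalarCost = NumElements * ScalarAddCost;
  return VectorAddCost - ScalarCost;
}
```
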
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
@@ -1158,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return VecCallCost - ScalarCallCost;
}
+ case Instruction::ShuffleVector: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+ int ScalarCost = 0;
+ int VecCost = 0;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ if (!I)
+ break;
+ ScalarCost +=
+ TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+ }
+ // VecCost is equal to the sum of the cost of creating the two vectors
+ // and the cost of creating the shuffle.
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ VecCost =
+ TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+ Instruction *I1 = cast<Instruction>(VL[1]);
+ VecCost +=
+ TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+ VecCost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+ return VecCost - ScalarCost;
+ }
default:
llvm_unreachable("Unknown instruction");
}
@@ -1438,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}
-
- unsigned Opcode = VL0->getOpcode();
- assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+ unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
case Instruction::PHI: {
@@ -1649,12 +1829,52 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = S;
return propagateMetadata(S, E->Scalars);
}
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E->Scalars);
+
+ ValueList Op0VL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+
+ Value *Op0 = vectorizeTree(Op0VL);
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList OpVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+
+ Value *OpVec = vectorizeTree(OpVL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(Op0, OpVecs);
+ E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
+ Function *FI;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (CI && (FI = CI->getCalledFunction())) {
+ IID = (Intrinsic::ID) FI->getIntrinsicID();
+ }
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
+ // ctlz, cttz and powi are special intrinsics whose second argument is
+ // a scalar. This argument should not be vectorized.
+ if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
+ CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
CallInst *CEI = cast<CallInst>(E->Scalars[i]);
OpVL.push_back(CEI->getArgOperand(j));
@@ -1673,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->VectorizedValue = V;
return V;
}
+ case Instruction::ShuffleVector: {
+ ValueList LHSVL, RHSVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ // Create a vector of LHS op1 RHS
+ BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+ Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+ // Create a vector of LHS op2 RHS
+ Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+ BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+ Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+ // Create appropriate shuffle to take alternative operations from
+ // the vector.
+ std::vector<Constant *> Mask(E->Scalars.size());
+ unsigned e = E->Scalars.size();
+ for (unsigned i = 0; i < e; ++i) {
+ if (i & 1)
+ Mask[i] = Builder.getInt32(e + i);
+ else
+ Mask[i] = Builder.getInt32(i);
+ }
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ E->VectorizedValue = V;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
default:
llvm_unreachable("unknown inst");
}
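
A scalar model of the pattern the ShuffleVector case emits, under the same 4-wide add/sub assumption as the mask computation above: V0 holds the all-add result, V1 the all-sub result, and the mask <0, 5, 2, 7> picks even lanes from V0 and odd lanes from V1.

```cpp
#include <array>

static std::array<int, 4> addsub(const std::array<int, 4> &L,
                                 const std::array<int, 4> &R) {
  std::array<int, 4> Out;
  for (int i = 0; i < 4; ++i)
    Out[i] = (i & 1) ? L[i] - R[i]  // odd lanes take the alternate opcode
                     : L[i] + R[i]; // even lanes take the base opcode
  return Out;
}
```
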
@@ -1741,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() {
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
-
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
@@ -1925,7 +2187,6 @@ struct SLPVectorizer : public FunctionPass {
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
e = po_end(&F.getEntryBlock()); it != e; ++it) {
BasicBlock *BB = *it;
-
// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;
diff --git a/llvm-device-build.mk b/llvm-device-build.mk
index 82ef055..5d863f5 100644
--- a/llvm-device-build.mk
+++ b/llvm-device-build.mk
@@ -64,6 +64,8 @@ LOCAL_C_INCLUDES := \
$(LLVM_ROOT_PATH)/device/include \
$(LOCAL_C_INCLUDES)
+include external/libcxx/libcxx.mk
+
###########################################################
## Commands for running tblgen to compile a td file
###########################################################
diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll
new file mode 100644
index 0000000..682e4b6
--- /dev/null
+++ b/test/Analysis/BasicAA/cs-cs.ll
@@ -0,0 +1,221 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "arm-apple-ios"
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+entry:
+ %q = getelementptr i8* %p, i64 16
+ %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+ call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+ %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+ %c = add <8 x i16> %a, %b
+ ret <8 x i16> %c
+
+; CHECK-LABEL: Function: test1:
+
+; CHECK: NoAlias: i8* %p, i8* %q
+; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #1 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+}
+
+define void @test2(i8* %P, i8* %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2:
+
+; CHECK: MayAlias: i8* %P, i8* %Q
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test2a(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2a:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test2b(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ %R = getelementptr i8* %P, i64 12
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2b:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test2c(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ %R = getelementptr i8* %P, i64 11
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2c:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test2d(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ %R = getelementptr i8* %P, i64 -12
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2d:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test2e(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ %R = getelementptr i8* %P, i64 -11
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test2e:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: NoAlias: i8* %P, i8* %R
+; CHECK: NoAlias: i8* %Q, i8* %R
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %R, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+define void @test3(i8* %P, i8* %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test3:
+
+; CHECK: MayAlias: i8* %P, i8* %Q
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+}
+
+define void @test3a(i8* noalias %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test3a:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 8, i32 1, i1 false)
+}
+
+define void @test4(i8* %P, i8* noalias %Q) nounwind ssp {
+ tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test4:
+
+; CHECK: NoAlias: i8* %P, i8* %Q
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false)
+}
+
+define void @test5(i8* %P, i8* %Q, i8* %R) nounwind ssp {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+ ret void
+
+; CHECK-LABEL: Function: test5:
+
+; CHECK: MayAlias: i8* %P, i8* %Q
+; CHECK: MayAlias: i8* %P, i8* %R
+; CHECK: MayAlias: i8* %Q, i8* %R
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Analysis/CostModel/AArch64/lit.local.cfg b/test/Analysis/CostModel/AArch64/lit.local.cfg
index c420349..7184443 100644
--- a/test/Analysis/CostModel/AArch64/lit.local.cfg
+++ b/test/Analysis/CostModel/AArch64/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/Analysis/CostModel/ARM/lit.local.cfg b/test/Analysis/CostModel/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/Analysis/CostModel/ARM/lit.local.cfg
+++ b/test/Analysis/CostModel/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Analysis/CostModel/PowerPC/lit.local.cfg b/test/Analysis/CostModel/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/Analysis/CostModel/PowerPC/lit.local.cfg
+++ b/test/Analysis/CostModel/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
new file mode 100644
index 0000000..2e162f0
--- /dev/null
+++ b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -0,0 +1,347 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-ssse3 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+sse3,+ssse3 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+
+; Verify the cost model for alternate shuffles.
+
+; shufflevector instructions with illegal 64-bit vector types.
+; 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+; 64-bit packed float vectors (v2f32) are widened to type v4f32.
+
+define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) {
+ %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b) {
+ %1 = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f32':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <2 x i32> @test_v2i32_2(<2 x i32> %a, <2 x i32> %b) {
+ %1 = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i32_2':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <2 x float> @test_v2f32_2(<2 x float> %a, <2 x float> %b) {
+ %1 = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f32_2':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+; Test shuffles on packed vectors of two elements.
+
+define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
+ %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i64':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
+ %1 = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f64':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <2 x i64> @test_v2i64_2(<2 x i64> %a, <2 x i64> %b) {
+ %1 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2i64_2':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <2 x double> @test_v2f64_2(<2 x double> %a, <2 x double> %b) {
+ %1 = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v2f64_2':
+; SSE2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+; Test shuffles on packed vectors of four elements.
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i32':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x i32> @test_v4i32_2(<4 x i32> %a, <4 x i32> %b) {
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i32_2':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f32':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x float> @test_v4f32_2(<4 x float> %a, <4 x float> %b) {
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f32_2':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
+ %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x i64> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i64':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x i64> @test_v4i64_2(<4 x i64> %a, <4 x i64> %b) {
+ %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x i64> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4i64_2':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
+ %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f64':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <4 x double> @test_v4f64_2(<4 x double> %a, <4 x double> %b) {
+ %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ ret <4 x double> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v4f64_2':
+; SSE2: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+; Test shuffles on packed vectors of eight elements.
+define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
+ %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x i16> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <8 x i16> @test_v8i16_2(<8 x i16> %a, <8 x i16> %b) {
+ %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x i16> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16_2':
+; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
+ %1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i32':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <8 x i32> @test_v8i32_2(<8 x i32> %a, <8 x i32> %b) {
+ %1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x i32> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i32_2':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
+ %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8f32':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <8 x float> @test_v8f32_2(<8 x float> %a, <8 x float> %b) {
+ %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x float> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8f32_2':
+; SSE2: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 4 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+; Test shuffles on packed vectors of sixteen elements.
+define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) {
+ %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i8> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8':
+; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+
+
+define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) {
+ %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i8> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8_2':
+; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+
+
+define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
+ %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i16> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16':
+; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+
+define <16 x i16> @test_v16i16_2(<16 x i16> %a, <16 x i16> %b) {
+ %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16_2':
+; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+
+define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) {
+ %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
+ ret <32 x i8> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8':
+; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+
+
+define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) {
+ %1 = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ ret <32 x i8> %1
+}
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8_2':
+; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
+; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+
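Note: the costs checked above come from the target's shuffle cost hook. A rough sketch of the query for one of the <4 x i32> cases, assuming a TargetTransformInfo reference is already in scope and that the alternate-shuffle kind costed by this patch series is spelled SK_Alternate (3.5-era signatures; details may differ across releases):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"

    using namespace llvm;

    // Cost of an alternating shuffle of <4 x i32>, as in test_v4i32 above.
    // Sketch under 3.5-era signatures: VectorType::get(Type*, unsigned) and
    // TTI.getShuffleCost(Kind, Type*).
    unsigned alternateShuffleCost(const TargetTransformInfo &TTI,
                                  LLVMContext &Ctx) {
      Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 4);
      return TTI.getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy);
    }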
diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Analysis/CostModel/X86/lit.local.cfg
+++ b/test/Analysis/CostModel/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
index 48bec08..5a88c4c 100644
--- a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
+++ b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
@@ -8,6 +8,15 @@
; A[i][j] = 1.0;
; }
+; Inst: %val = load double* %arrayidx
+; In Loop with Header: for.j
+; AddRec: {{0,+,(%m * sizeof(double))}<%for.i>,+,sizeof(double)}<%for.j>
+; Base offset: %A
+; ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes.
+; ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{0,+,1}<nuw><nsw><%for.j>]
+
+; Inst: store double %val, double* %arrayidx
+; In Loop with Header: for.j
; AddRec: {{%A,+,(8 * %m)}<%for.i>,+,8}<%for.j>
; CHECK: Base offset: %A
; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes.
@@ -26,7 +35,8 @@ for.j:
%j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ]
%vlaarrayidx.sum = add i64 %j, %tmp
%arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum
- store double 1.0, double* %arrayidx
+ %val = load double* %arrayidx
+ store double %val, double* %arrayidx
%j.inc = add nsw i64 %j, 1
%j.exitcond = icmp eq i64 %j.inc, %m
br i1 %j.exitcond, label %for.i.inc, label %for.j
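Note: the AddRec comments above encode the row-major address of A[i][j] with 8-byte elements, which delinearization then unflattens back into i and j. The correspondence, as a worked equation:

    \[ \mathrm{addr}(i,j) \;=\; \%A + (i \cdot m + j) \cdot 8 \]

Each step of the inner loop (j) adds 8 and each step of the outer loop (i) adds 8m, which is exactly the recurrence {{%A,+,(8 * %m)}<%for.i>,+,8}<%for.j>.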
diff --git a/test/Assembler/addrspacecast-alias.ll b/test/Assembler/addrspacecast-alias.ll
index 052a141..d751659 100644
--- a/test/Assembler/addrspacecast-alias.ll
+++ b/test/Assembler/addrspacecast-alias.ll
@@ -3,5 +3,5 @@
; Test that global aliases are allowed to be constant addrspacecast
@i = internal addrspace(1) global i8 42
-@ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i
-; CHECK: @ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i
+@ia = alias internal addrspacecast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(3)*)
+; CHECK: @ia = alias internal addrspacecast (i8 addrspace(2)* addrspace(1)* bitcast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(1)*) to i8 addrspace(2)* addrspace(3)*)
diff --git a/test/Assembler/alias-addrspace.ll b/test/Assembler/alias-addrspace.ll
deleted file mode 100644
index 6d378e4..0000000
--- a/test/Assembler/alias-addrspace.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: not llvm-as %s 2>&1 | FileCheck %s
-
-@foo = global i32 42
-@bar = alias internal addrspace(1) i32* @foo
-
-CHECK: error: A type is required if addrspace is given
diff --git a/test/Assembler/alias-to-alias.ll b/test/Assembler/alias-to-alias.ll
deleted file mode 100644
index 1ea99bb..0000000
--- a/test/Assembler/alias-to-alias.ll
+++ /dev/null
@@ -1,5 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-; CHECK: Alias must point to function or variable
-
-@b1 = alias i32* @c1
-@c1 = alias i32* @b1
diff --git a/test/Assembler/alias-to-alias2.ll b/test/Assembler/alias-to-alias2.ll
deleted file mode 100644
index a8a0196..0000000
--- a/test/Assembler/alias-to-alias2.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-; CHECK: error: Alias is pointed by alias b1
-
-@g = global i32 42
-
-@b1 = alias i32* @c1
-@c1 = alias i32* @g
diff --git a/test/Assembler/alias-type.ll b/test/Assembler/alias-type.ll
deleted file mode 100644
index ead3e95..0000000
--- a/test/Assembler/alias-type.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: not llvm-as %s 2>&1 | FileCheck %s
-
-@foo = global i32 42
-@bar = alias i32 @foo
-
-CHECK: error: An alias must have pointer type
diff --git a/test/Assembler/atomic.ll b/test/Assembler/atomic.ll
index a2ae58e..d7ccd99 100644
--- a/test/Assembler/atomic.ll
+++ b/test/Assembler/atomic.ll
@@ -16,6 +16,8 @@ define void @f(i32* %x) {
cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
+ ; CHECK: cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic
+ cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic
; CHECK: atomicrmw add i32* %x, i32 10 seq_cst
atomicrmw add i32* %x, i32 10 seq_cst
; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic
diff --git a/test/Assembler/invalid-comdat.ll b/test/Assembler/invalid-comdat.ll
new file mode 100644
index 0000000..987e1e1
--- /dev/null
+++ b/test/Assembler/invalid-comdat.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@v = global i32 0, comdat $v
+; CHECK: use of undefined comdat '$v'
diff --git a/test/Assembler/invalid-comdat2.ll b/test/Assembler/invalid-comdat2.ll
new file mode 100644
index 0000000..ed656ef
--- /dev/null
+++ b/test/Assembler/invalid-comdat2.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+$v = comdat any
+$v = comdat any
+; CHECK: redefinition of comdat '$v'
diff --git a/test/Assembler/upgrade-loop-metadata.ll b/test/Assembler/upgrade-loop-metadata.ll
new file mode 100644
index 0000000..f664bdf
--- /dev/null
+++ b/test/Assembler/upgrade-loop-metadata.ll
@@ -0,0 +1,41 @@
+; Test to make sure loop vectorizer metadata is automatically upgraded.
+;
+; Run using opt as well to ensure that the metadata is upgraded when parsing
+; assembly.
+;
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: opt -S < %s | FileCheck %s
+
+define void @_Z28loop_with_vectorize_metadatav() {
+entry:
+ %i = alloca i32, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 16
+ br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1
+
+for.body: ; preds = %for.cond
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %1 = load i32* %i, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; CHECK: !{metadata !"llvm.loop.vectorize.unroll", i32 4}
+; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
+; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"}
+!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4}
+!2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 4}
+!3 = metadata !{metadata !"llvm.vectorizer.width", i32 8}
+!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
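Note: the upgrade renames the llvm.vectorizer.* strings to llvm.loop.vectorize.*, as the CHECK lines above expect. A sketch of the name mapping only (the in-tree upgrade rewrites MDNode operands rather than bare strings):

    #include <string>

    // Maps an old loop-vectorizer metadata name into the new namespace,
    // e.g. "llvm.vectorizer.width" -> "llvm.loop.vectorize.width".
    static std::string upgradeLoopMetadataName(const std::string &Name) {
      const std::string OldPrefix = "llvm.vectorizer.";
      if (Name.compare(0, OldPrefix.size(), OldPrefix) != 0)
        return Name; // not an old-style name; leave untouched
      return "llvm.loop.vectorize." + Name.substr(OldPrefix.size());
    }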
diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml
index 26cd129..0a2283a 100644
--- a/test/Bindings/Ocaml/target.ml
+++ b/test/Bindings/Ocaml/target.ml
@@ -46,7 +46,7 @@ let test_target_data () =
let layout = "e-p:32:32-f64:32:64-v64:32:64-v128:32:128-n32-S32" in
let dl = DL.of_string layout in
let sty = struct_type context [| i32_type; i64_type |] in
-
+
assert_equal (DL.as_string dl) layout;
assert_equal (DL.byte_order dl) Endian.Little;
assert_equal (DL.pointer_size dl) 4;
@@ -86,7 +86,8 @@ let test_target_machine () =
assert_equal (TM.triple machine) (Target.default_triple ());
assert_equal (TM.cpu machine) "";
assert_equal (TM.features machine) "";
- ignore (TM.data_layout machine)
+ ignore (TM.data_layout machine);
+ TM.set_verbose_asm true machine
(*===-- Code Emission -----------------------------------------------------===*)
diff --git a/test/Bindings/llvm-c/lit.local.cfg b/test/Bindings/llvm-c/lit.local.cfg
index d83ebee..75b22c0 100644
--- a/test/Bindings/llvm-c/lit.local.cfg
+++ b/test/Bindings/llvm-c/lit.local.cfg
@@ -1,5 +1,4 @@
-targets = set(config.root.targets_to_build.split())
-if not "X86" in targets:
+if not "X86" in config.root.targets:
config.unsupported = True
-if not "ARM" in targets:
+if not "ARM" in config.root.targets:
config.unsupported = True
diff --git a/test/Bitcode/atomic.ll b/test/Bitcode/atomic.ll
new file mode 100644
index 0000000..37815a7
--- /dev/null
+++ b/test/Bitcode/atomic.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+ cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ ; CHECK: cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+
+ cmpxchg volatile i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ ; CHECK: cmpxchg volatile i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+
+ cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
+ ; CHECK: cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
+
+ cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
+ ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
+
+ ret void
+}
\ No newline at end of file
diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll
index 02e1bb1..49366de 100644
--- a/test/Bitcode/attributes.ll
+++ b/test/Bitcode/attributes.ll
@@ -203,7 +203,7 @@ define void @f34()
; CHECK: define void @f34()
{
call void @nobuiltin() nobuiltin
-; CHECK: call void @nobuiltin() #24
+; CHECK: call void @nobuiltin() #25
ret void;
}
@@ -223,6 +223,12 @@ define nonnull i8* @f37(i8* nonnull %a) {
ret i8* %a
}
+define void @f38() unnamed_addr jumptable {
+; CHECK: define void @f38() unnamed_addr #24
+ call void bitcast (void (i8*)* @f36 to void ()*)()
+ unreachable
+}
+
; CHECK: attributes #0 = { noreturn }
; CHECK: attributes #1 = { nounwind }
; CHECK: attributes #2 = { readnone }
@@ -247,5 +253,5 @@ define nonnull i8* @f37(i8* nonnull %a) {
; CHECK: attributes #21 = { sspstrong }
; CHECK: attributes #22 = { minsize }
; CHECK: attributes #23 = { noinline optnone }
-; CHECK: attributes #24 = { nobuiltin }
-
+; CHECK: attributes #24 = { jumptable }
+; CHECK: attributes #25 = { nobuiltin }
diff --git a/test/Bitcode/memInstructions.3.2.ll b/test/Bitcode/memInstructions.3.2.ll
index 21c3deb..e4cb6bd 100644
--- a/test/Bitcode/memInstructions.3.2.ll
+++ b/test/Bitcode/memInstructions.3.2.ll
@@ -223,68 +223,88 @@ define void @cmpxchg(i32* %ptr,i32 %cmp,i32 %new){
entry:
;cmpxchg [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <ordering>
-; CHECK: %res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK-NEXT: %res1 = extractvalue { i32, i1 } [[TMP]], 0
%res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
-; CHECK-NEXT: %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0
%res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
-; CHECK-NEXT: %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0
%res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
-; CHECK-NEXT: %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0
%res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
-; CHECK-NEXT: %res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: %res5 = extractvalue { i32, i1 } [[TMP]], 0
%res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
-; CHECK-NEXT: %res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0
%res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
-; CHECK-NEXT: %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0
%res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
-; CHECK-NEXT: %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0
%res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
-; CHECK-NEXT: %res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: %res9 = extractvalue { i32, i1 } [[TMP]], 0
%res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
-; CHECK-NEXT: %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0
%res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
-; CHECK-NEXT: %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0
%res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
-; CHECK-NEXT: %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0
%res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
-; CHECK-NEXT: %res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: %res13 = extractvalue { i32, i1 } [[TMP]], 0
%res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
-; CHECK-NEXT: %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0
%res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
-; CHECK-NEXT: %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0
%res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
-; CHECK-NEXT: %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0
%res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
-; CHECK-NEXT: %res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: %res17 = extractvalue { i32, i1 } [[TMP]], 0
%res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
-; CHECK-NEXT: %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0
%res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
-; CHECK-NEXT: %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0
%res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
-; CHECK-NEXT: %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0
%res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
ret void
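Note: after this change cmpxchg yields a { T, i1 } pair, so upgraded readers recover the old scalar result with an extractvalue of field 0, as the rewritten CHECK lines above show. A rough IRBuilder sketch of that rewrite, using 3.5-era signatures (no alignment parameter, unscoped ordering enumerators):

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Emit the new-style cmpxchg and recover the old scalar result via
    // extractvalue { T, i1 } %pair, 0. Sketch only; signatures follow the
    // 3.5-era API and will not compile against current LLVM.
    Value *emitUpgradedCmpXchg(IRBuilder<> &B, Value *Ptr, Value *Cmp,
                               Value *New) {
      Value *Pair = B.CreateAtomicCmpXchg(Ptr, Cmp, New,
                                          SequentiallyConsistent, // success
                                          Monotonic);             // failure
      return B.CreateExtractValue(Pair, 0); // the loaded value, as before
    }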
diff --git a/test/Bitcode/old-aliases.ll b/test/Bitcode/old-aliases.ll
index 4ef47c0..7a0eea2 100644
--- a/test/Bitcode/old-aliases.ll
+++ b/test/Bitcode/old-aliases.ll
@@ -10,13 +10,13 @@
; CHECK: @v2 = global [1 x i32] zeroinitializer
@v3 = alias bitcast (i32* @v1 to i16*)
-; CHECK: @v3 = alias i16, i32* @v1
+; CHECK: @v3 = alias bitcast (i32* @v1 to i16*)
@v4 = alias getelementptr ([1 x i32]* @v2, i32 0, i32 0)
-; CHECK: @v4 = alias i32, [1 x i32]* @v2
+; CHECK: @v4 = alias getelementptr inbounds ([1 x i32]* @v2, i32 0, i32 0)
@v5 = alias i32 addrspace(2)* addrspacecast (i32 addrspace(0)* @v1 to i32 addrspace(2)*)
-; CHECK: @v5 = alias addrspace(2) i32, i32* @v1
+; CHECK: @v5 = alias addrspacecast (i32* @v1 to i32 addrspace(2)*)
@v6 = alias i16* @v3
-; CHECK: @v6 = alias i16, i32* @v1
+; CHECK: @v6 = alias i16* @v3
diff --git a/test/Bitcode/upgrade-loop-metadata.ll b/test/Bitcode/upgrade-loop-metadata.ll
new file mode 100644
index 0000000..1a45056
--- /dev/null
+++ b/test/Bitcode/upgrade-loop-metadata.ll
@@ -0,0 +1,37 @@
+; Test to make sure loop vectorizer metadata is automatically upgraded.
+;
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+define void @_Z28loop_with_vectorize_metadatav() {
+entry:
+ %i = alloca i32, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 16
+ br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1
+
+for.body: ; preds = %for.cond
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %1 = load i32* %i, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; CHECK: !{metadata !"llvm.loop.vectorize.unroll", i32 4}
+; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
+; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (trunk 211528)"}
+!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4, metadata !4}
+!2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 4}
+!3 = metadata !{metadata !"llvm.vectorizer.width", i32 8}
+!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
diff --git a/test/Bitcode/upgrade-loop-metadata.ll.bc b/test/Bitcode/upgrade-loop-metadata.ll.bc
new file mode 100644
index 0000000..3f218cb
--- /dev/null
+++ b/test/Bitcode/upgrade-loop-metadata.ll.bc
Binary files differ
diff --git a/test/Bitcode/weak-cmpxchg-upgrade.ll b/test/Bitcode/weak-cmpxchg-upgrade.ll
new file mode 100644
index 0000000..dbcd150
--- /dev/null
+++ b/test/Bitcode/weak-cmpxchg-upgrade.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; weak-cmpxchg-upgrade.ll.bc was produced by running a version of llvm-as
+; from just before the IR change on this file.
+
+define i32 @test(i32* %addr, i32 %old, i32 %new) {
+; CHECK: [[TMP:%.*]] = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst monotonic
+; CHECK: %val = extractvalue { i32, i1 } [[TMP]], 0
+ %val = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst monotonic
+ ret i32 %val
+}
diff --git a/test/Bitcode/weak-cmpxchg-upgrade.ll.bc b/test/Bitcode/weak-cmpxchg-upgrade.ll.bc
new file mode 100644
index 0000000..f713c31
--- /dev/null
+++ b/test/Bitcode/weak-cmpxchg-upgrade.ll.bc
Binary files differ
diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll
new file mode 100644
index 0000000..2df9c37
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-address-type-promotion-assertion.ll
@@ -0,0 +1,55 @@
+; RUN: llc -O3 -mcpu=cortex-a53 -mtriple=aarch64--linux-gnu %s -o - | FileCheck %s
+; PR20188: don't crash when merging sexts.
+
+; CHECK: foo:
+define void @foo() unnamed_addr align 2 {
+entry:
+ br label %invoke.cont145
+
+invoke.cont145:
+ %or.cond = and i1 undef, false
+ br i1 %or.cond, label %if.then274, label %invoke.cont145
+
+if.then274:
+ %0 = load i32* null, align 4
+ br i1 undef, label %invoke.cont291, label %if.else313
+
+invoke.cont291:
+ %idxprom.i.i.i605 = sext i32 %0 to i64
+ %arrayidx.i.i.i607 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i605
+ %idxprom.i.i.i596 = sext i32 %0 to i64
+ %arrayidx.i.i.i598 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i596
+ br label %if.end356
+
+if.else313:
+ %cmp314 = fcmp olt double undef, 0.000000e+00
+ br i1 %cmp314, label %invoke.cont317, label %invoke.cont353
+
+invoke.cont317:
+ br i1 undef, label %invoke.cont326, label %invoke.cont334
+
+invoke.cont326:
+ %idxprom.i.i.i587 = sext i32 %0 to i64
+ %arrayidx.i.i.i589 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i587
+ %sub329 = fsub fast double undef, undef
+ br label %invoke.cont334
+
+invoke.cont334:
+ %lo.1 = phi double [ %sub329, %invoke.cont326 ], [ undef, %invoke.cont317 ]
+ br i1 undef, label %invoke.cont342, label %if.end356
+
+invoke.cont342:
+ %idxprom.i.i.i578 = sext i32 %0 to i64
+ %arrayidx.i.i.i580 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i578
+ br label %if.end356
+
+invoke.cont353:
+ %idxprom.i.i.i572 = sext i32 %0 to i64
+ %arrayidx.i.i.i574 = getelementptr inbounds double* undef, i64 %idxprom.i.i.i572
+ br label %if.end356
+
+if.end356:
+ %lo.2 = phi double [ 0.000000e+00, %invoke.cont291 ], [ %lo.1, %invoke.cont342 ], [ undef, %invoke.cont353 ], [ %lo.1, %invoke.cont334 ]
+ call void null(i32 %0, double %lo.2)
+ unreachable
+}
diff --git a/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
new file mode 100644
index 0000000..ee90d19
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.9"
+
+; Check that sexts get promoted above adds.
+define void @foo(i32* nocapture %a, i32 %i) {
+entry:
+; CHECK-LABEL: _foo:
+; CHECK: add
+; CHECK-NEXT: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: str
+; CHECK-NEXT: ret
+ %add = add nsw i32 %i, 1
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+ %0 = load i32* %arrayidx, align 4
+ %add1 = add nsw i32 %i, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds i32* %a, i64 %idxprom2
+ %1 = load i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %1, %0
+ %idxprom5 = sext i32 %i to i64
+ %arrayidx6 = getelementptr inbounds i32* %a, i64 %idxprom5
+ store i32 %add4, i32* %arrayidx6, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index a2266b1..ceea8a0 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
index d1840d3..7da2d2c 100644
--- a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
+++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
@@ -2,14 +2,14 @@
; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX
; <rdar://problem/11392109>
-define hidden void @t() optsize ssp {
+define hidden void @t(i64* %addr) optsize ssp {
entry:
- store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8
+ store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* %addr, align 8
; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE
; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF]
; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff
; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}]
- unreachable
+ ret void
}
declare i64 @x(i32) optsize
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll
index a73b707..a73b707 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
+++ b/test/CodeGen/AArch64/arm64-AnInfiniteLoopInDAGCombine.ll
diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
index 1b2d543..1b2d543 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
+++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index b713f0d..ccf1371 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -101,3 +101,11 @@ define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) {
; CHECK: ldr {{q[0-9]+}}, [sp]
ret fp128 %arg1
}
+
+; Check that a VPR can be correctly passed on the stack.
+define <2 x double> @test_vreg_stack([8 x <2 x double>], <2 x double> %varg_stack) {
+entry:
+; CHECK-LABEL: test_vreg_stack:
+; CHECK: ldr {{q[0-9]+}}, [sp]
+ ret <2 x double> %varg_stack;
+}
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index e2de434..a955029 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+; REQUIRES: asserts
target triple = "arm64-apple-darwin"
; rdar://9932559
@@ -8,15 +9,15 @@ entry:
; CHECK-LABEL: i8i16callee:
; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
; They are i8, i16, i8 and i8.
-; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
-; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
-; CHECK: ldrsb {{w[0-9]+}}, [sp]
-; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: ldrsb {{w[0-9]+}}, [sp]
; FAST-LABEL: i8i16callee:
-; FAST: ldrb {{w[0-9]+}}, [sp, #5]
-; FAST: ldrb {{w[0-9]+}}, [sp, #4]
-; FAST: ldrh {{w[0-9]+}}, [sp, #2]
-; FAST: ldrb {{w[0-9]+}}, [sp]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #5]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-DAG: ldrsh {{w[0-9]+}}, [sp, #2]
+; FAST-DAG: ldrsb {{w[0-9]+}}, [sp]
%conv = sext i8 %a4 to i64
%conv3 = sext i16 %a5 to i64
%conv8 = sext i8 %b1 to i64
@@ -44,10 +45,10 @@ entry:
; CHECK: i8i16caller
; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
; They are i8, i16, i8 and i8.
-; CHECK: strb {{w[0-9]+}}, [sp, #5]
-; CHECK: strb {{w[0-9]+}}, [sp, #4]
-; CHECK: strh {{w[0-9]+}}, [sp, #2]
-; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp, #4]
+; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
+; CHECK-DAG: strb {{w[0-9]+}}, [sp]
; CHECK: bl
; FAST: i8i16caller
; FAST: strb {{w[0-9]+}}, [sp]
diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
index 34d6287..38661a5 100644
--- a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
+++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s
+; RUN: llc %s -o - -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; Check that ANDS (tst) is not merged with ADD when the immediate
; is not 0.
; <rdar://problem/16693089>
@@ -8,18 +8,18 @@ target triple = "arm64-apple-ios"
; CHECK-LABEL: tst1:
; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1
; CHECK: tst [[REG]], #0x1
-define void @tst1() {
+define void @tst1(i1 %tst, i32 %true) {
entry:
- br i1 undef, label %for.end, label %for.body
+ br i1 %tst, label %for.end, label %for.body
for.body: ; preds = %for.body, %entry
%result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ]
%i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
%and = and i32 %i.08, 1
%cmp1 = icmp eq i32 %and, 0
- %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09
+ %add2.result.0 = select i1 %cmp1, i32 %true, i32 %result.09
%inc = add nsw i32 %i.08, 1
- %cmp = icmp slt i32 %i.08, undef
+ %cmp = icmp slt i32 %i.08, %true
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.body
diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll
index ed9b569..f36e706 100644
--- a/test/CodeGen/AArch64/arm64-arith.ll
+++ b/test/CodeGen/AArch64/arm64-arith.ll
@@ -260,3 +260,11 @@ define i64 @f3(i64 %a) nounwind readnone ssp {
%res = mul nsw i64 %a, 17
ret i64 %res
}
+
+define i32 @f4(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f4:
+; CHECK-NEXT: add w0, w0, w0, lsl #1
+; CHECK-NEXT: ret
+ %res = mul i32 %a, 3
+ ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index 3b43aa1..3377849 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -13,7 +13,8 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
; CHECK: [[DONE]]:
- %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ %val = extractvalue { i128, i1 } %pair, 0
ret i128 %val
}
@@ -21,8 +22,10 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_nand:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
-; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
-; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK-DAG: and [[TMP_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: and [[TMP_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK-DAG: mvn [[SCRATCH_REGLO:x[0-9]+]], [[TMP_REGLO]]
+; CHECK-DAG: mvn [[SCRATCH_REGHI:x[0-9]+]], [[TMP_REGHI]]
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index aa9b284..b56f91d 100644
--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -10,7 +10,8 @@ define i32 @val_compare_and_swap(i32* %p) {
; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
; CHECK: [[LABEL2]]:
- %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ %pair = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -25,7 +26,8 @@ define i64 @val_compare_and_swap_64(i64* %p) {
; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
; CHECK: [[LABEL2]]:
- %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ %pair = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ %val = extractvalue { i64, i1 } %pair, 0
ret i64 %val
}
@@ -33,7 +35,8 @@ define i32 @fetch_and_nand(i32* %p) {
; CHECK-LABEL: fetch_and_nand:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
-; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8
+; CHECK: mvn [[TMP_REG:w[0-9]+]], w[[DEST_REG]]
+; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8
; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
@@ -46,8 +49,9 @@ define i64 @fetch_and_nand_64(i64* %p) {
; CHECK-LABEL: fetch_and_nand_64:
; CHECK: mov x[[ADDR:[0-9]+]], x0
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
-; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8
+; CHECK: ldaxr x[[DEST_REG:[0-9]+]], [x[[ADDR]]]
+; CHECK: mvn w[[TMP_REG:[0-9]+]], w[[DEST_REG]]
+; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8
; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
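
The bic -> and+mvn CHECK updates in the two atomic tests above track a semantic fix rather than a cosmetic one: atomicrmw nand is defined in the IR reference as ~(old & val), whereas bic computes old & ~val. A minimal sketch of the per-iteration computation the new sequence implements (one body of the ldxr/stlxr loop):

define i32 @nand_iteration(i32 %loaded, i32 %m) {
  %t = and i32 %loaded, %m     ; the and the new checks look for
  %new = xor i32 %t, -1        ; the mvn of the and result: not(and), not and-not
  ret i32 %new
}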
diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
index c109263..d0f6db0 100644
--- a/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -33,3 +33,27 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
%4 = insertelement <4 x float> %3, float %d, i32 3
ret <4 x float> %4
}
+
+define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
+; CHECK-LABEL: build_all_zero:
+; CHECK: movz w[[GREG:[0-9]+]], #0xae80
+; CHECK-NEXT: fmov s[[FREG:[0-9]+]], w[[GREG]]
+; CHECK-NEXT: mul.8h v0, v0, v[[FREG]]
+ %b = add <8 x i16> %a, <i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+ %c = mul <8 x i16> %b, <i16 -20864, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>
+ ret <8 x i16> %c
+}
+
+; The DAG combiner performs the following fold:
+; fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
+; -> (BUILD_VECTOR A, B, ..., C, D, ...)
+; This case checks that there is no assertion failure when A,B and C,D have
+; different types.
+define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) {
+; CHECK-LABEL: concat_2_build_vector:
+; CHECK: movi
+ %vshl_n = shl <4 x i16> %in0, <i16 8, i16 8, i16 8, i16 8>
+ %vshl_n2 = shl <4 x i16> %vshl_n, <i16 9, i16 9, i16 9, i16 9>
+ %shuffle.i = shufflevector <4 x i16> %vshl_n2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
deleted file mode 100644
index d862b1e..0000000
--- a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-; CHECK: fptosi_1
-; CHECK: fcvtzs.2d
-; CHECK: xtn.2s
-; CHECK: ret
-define void @fptosi_1() nounwind noinline ssp {
-entry:
- %0 = fptosi <2 x double> undef to <2 x i32>
- store <2 x i32> %0, <2 x i32>* undef, align 8
- ret void
-}
-
-; CHECK: fptoui_1
-; CHECK: fcvtzu.2d
-; CHECK: xtn.2s
-; CHECK: ret
-define void @fptoui_1() nounwind noinline ssp {
-entry:
- %0 = fptoui <2 x double> undef to <2 x i32>
- store <2 x i32> %0, <2 x i32>* undef, align 8
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
deleted file mode 100644
index daaf1e0..0000000
--- a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: f1:
-; CHECK: sshll.2d v0, v0, #0
-; CHECK-NEXT: scvtf.2d v0, v0
-; CHECK-NEXT: ret
- %conv = sitofp <2 x i32> %v to <2 x double>
- ret <2 x double> %conv
-}
-define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: f2:
-; CHECK: ushll.2d v0, v0, #0
-; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: ret
- %conv = uitofp <2 x i32> %v to <2 x double>
- ret <2 x double> %conv
-}
-
-; CHECK: autogen_SD19655
-; CHECK: scvtf
-; CHECK: ret
-define void @autogen_SD19655() {
- %T = load <2 x i64>* undef
- %F = sitofp <2 x i64> undef to <2 x float>
- store <2 x float> %F, <2 x float>* undef
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
new file mode 100644
index 0000000..7123e5e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -march=arm64 | FileCheck %s
+
+
+define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i16
+; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
+; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+ %tmp1 = load <4 x double>* %ptr
+ %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i8
+; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
+; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
+; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
+; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
+; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
+; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
+; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+ %tmp1 = load <8 x double>* %ptr
+ %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
+ ret <8 x i8> %tmp2
+}
+
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index bb14c89..5d62cfe 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-ios"
; rdar://12462006
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
index 2cf0135..6eed48b 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -1,5 +1,8 @@
; RUN: llc -mcpu=cyclone < %s | FileCheck %s
+; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on.
+; XFAIL: *
+
target datalayout = "e-i64:64-n32:64-S128"
target triple = "arm64-apple-ios"
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
index 2e4b658..ce132c6 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
@@ -13,12 +13,12 @@ target triple = "arm64-apple-ios"
; CHECK-LABEL: XX:
; CHECK: ldr
-define void @XX(%class.A* %K) {
+define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
entry:
- br i1 undef, label %if.then, label %lor.rhs.i
+ br i1 %tst, label %if.then, label %lor.rhs.i
lor.rhs.i: ; preds = %entry
- %tmp = load i32* undef, align 4
+ %tmp = load i32* %addr, align 4
%y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
%tmp1 = load i64* %y.i.i.i, align 8
%U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
@@ -30,17 +30,17 @@ lor.rhs.i: ; preds = %entry
%add16.i = add nsw i32 %add12.i, %div15.i
%rem.i.i = srem i32 %add16.i, %tmp
%idxprom = sext i32 %rem.i.i to i64
- %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom
- %tobool533 = icmp eq %class.C* undef, null
+ %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
+ %tobool533 = icmp eq %class.C* %pC, null
br i1 %tobool533, label %while.end, label %while.body
if.then: ; preds = %entry
- unreachable
+ ret i32 42
while.body: ; preds = %lor.rhs.i
- unreachable
+ ret i32 5
while.end: ; preds = %lor.rhs.i
%tmp3 = load %class.C** %arrayidx, align 8
- unreachable
+ ret i32 50
}
diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
index 17d783a..44150c2 100644
--- a/test/CodeGen/AArch64/arm64-early-ifcvt.ll
+++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+; RUN: llc < %s -stress-early-ifcvt -aarch64-atomic-cfg-tidy=0 | FileCheck %s
target triple = "arm64-apple-macosx"
; CHECK: mm2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index a3d5f6c..1152988 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -133,3 +133,16 @@ define void @t8() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false)
ret void
}
+
+define void @test_distant_memcpy(i8* %dst) {
+; ARM64-LABEL: test_distant_memcpy:
+; ARM64: mov [[ARRAY:x[0-9]+]], sp
+; ARM64: movz [[OFFSET:x[0-9]+]], #0x1f40
+; ARM64: add x[[ADDR:[0-9]+]], [[ARRAY]], [[OFFSET]]
+; ARM64: ldrb [[BYTE:w[0-9]+]], [x[[ADDR]]]
+; ARM64: strb [[BYTE]], [x0]
+ %array = alloca i8, i32 8192
+ %elem = getelementptr i8* %array, i32 8000
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %elem, i64 1, i32 1, i1 false)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index 57bbb93..b1d5010 100644
--- a/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
@lhs = global fp128 zeroinitializer, align 16
@rhs = global fp128 zeroinitializer, align 16
diff --git a/test/CodeGen/AArch64/arm64-frame-index.ll b/test/CodeGen/AArch64/arm64-frame-index.ll
index 4a91ff3..321f335 100644
--- a/test/CodeGen/AArch64/arm64-frame-index.ll
+++ b/test/CodeGen/AArch64/arm64-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios -aarch64-atomic-cfg-tidy=0 < %s | FileCheck %s
; rdar://11935841
define void @t1() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index f88bd6a..bc7ed7f 100644
--- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -122,3 +122,82 @@ define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
}
declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
+
+; Regression test for PR20057.
+;
+; Cortex-A53 machine model stalls on A53UnitFPMDS contention. Instructions that
+; are otherwise ready are jammed in the pending queue.
+; CHECK: ********** MI Scheduling **********
+; CHECK: testResourceConflict
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: BRK
+; CHECK: ********** INTERVALS **********
+define void @testResourceConflict(float* %ptr) {
+entry:
+ %add1 = fadd float undef, undef
+ %mul2 = fmul float undef, undef
+ %add3 = fadd float %mul2, undef
+ %mul4 = fmul float undef, %add3
+ %add5 = fadd float %mul4, undef
+ %sub6 = fsub float 0.000000e+00, undef
+ %sub7 = fsub float %add5, undef
+ %div8 = fdiv float 1.000000e+00, undef
+ %mul9 = fmul float %div8, %sub7
+ %mul14 = fmul float %sub6, %div8
+ %mul10 = fsub float -0.000000e+00, %mul14
+ %mul15 = fmul float undef, %div8
+ %mul11 = fsub float -0.000000e+00, %mul15
+ %mul12 = fmul float 0.000000e+00, %div8
+ %mul13 = fmul float %add1, %mul9
+ %mul21 = fmul float %add5, %mul11
+ %add22 = fadd float %mul13, %mul21
+ store float %add22, float* %ptr, align 4
+ %mul28 = fmul float %add1, %mul10
+ %mul33 = fmul float %add5, %mul12
+ %add34 = fadd float %mul33, %mul28
+ store float %add34, float* %ptr, align 4
+ %mul240 = fmul float undef, %mul9
+ %add246 = fadd float %mul240, undef
+ store float %add246, float* %ptr, align 4
+ %mul52 = fmul float undef, %mul10
+ %mul57 = fmul float undef, %mul12
+ %add58 = fadd float %mul57, %mul52
+ store float %add58, float* %ptr, align 4
+ %mul27 = fmul float 0.000000e+00, %mul9
+ %mul81 = fmul float undef, %mul10
+ %add82 = fadd float %mul27, %mul81
+ store float %add82, float* %ptr, align 4
+ call void @llvm.trap()
+ unreachable
+}
+
+declare void @llvm.trap()
+
+; Regression test for PR20057: "permanent hazard".
+; Resource contention on LDST.
+; CHECK: ********** MI Scheduling **********
+; CHECK: testLdStConflict
+; CHECK: *** Final schedule for BB#1 ***
+; CHECK: LD4Fourv2d
+; CHECK: STRQui
+; CHECK: ********** INTERVALS **********
+define void @testLdStConflict() {
+entry:
+ br label %loop
+
+loop:
+ %0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
+ %ptr = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
+ %ptr1 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
+ %ptr2 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
+ %ptr3 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
+ %ptr4 = bitcast i8* undef to <2 x i64>*
+ store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
+ br label %loop
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8*)
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
new file mode 100644
index 0000000..238474a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -0,0 +1,112 @@
+; REQUIRES: asserts
+;
+; The Cortex-A57 machine model will avoid scheduling load instructions in
+; succession because loads on the A57 have a latency of 4 cycles and they all
+; issue to the same pipeline. Instead, it will move other instructions between
+; the loads to avoid unnecessary stalls. The generic machine model schedules
+; the 4 loads consecutively for this case, causing stalls.
+;
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; CHECK: ********** MI Scheduling **********
+; CHECK: main:BB#2
+; CHECK: LDR
+; CHECK: Latency : 4
+; CHECK: *** Final schedule for BB#2 ***
+; CHECK: LDR
+; CHECK: LDR
+; CHECK-NOT: LDR
+; CHECK: {{.*}}
+; CHECK: ********** MI Scheduling **********
+
+@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
+@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ %x = alloca [8 x i32], align 4
+ %y = alloca [8 x i32], align 4
+ %i = alloca i32, align 4
+ %xx = alloca i32, align 4
+ %yy = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = bitcast [8 x i32]* %x to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
+ %1 = bitcast [8 x i32]* %y to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
+ store i32 0, i32* %xx, align 4
+ store i32 0, i32* %yy, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %2 = load i32* %i, align 4
+ %cmp = icmp slt i32 %2, 8
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %3 = load i32* %yy, align 4
+ %4 = load i32* %i, align 4
+ %idxprom = sext i32 %4 to i64
+ %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
+ %5 = load i32* %arrayidx, align 4
+ %add = add nsw i32 %5, 1
+ store i32 %add, i32* %xx, align 4
+ %6 = load i32* %xx, align 4
+ %add1 = add nsw i32 %6, 12
+ store i32 %add1, i32* %xx, align 4
+ %7 = load i32* %xx, align 4
+ %add2 = add nsw i32 %7, 23
+ store i32 %add2, i32* %xx, align 4
+ %8 = load i32* %xx, align 4
+ %add3 = add nsw i32 %8, 34
+ store i32 %add3, i32* %xx, align 4
+ %9 = load i32* %i, align 4
+ %idxprom4 = sext i32 %9 to i64
+ %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
+ %10 = load i32* %arrayidx5, align 4
+
+ %add4 = add nsw i32 %9, %add
+ %add5 = add nsw i32 %10, %add1
+ %add6 = add nsw i32 %add4, %add5
+
+ %add7 = add nsw i32 %9, %add3
+ %add8 = add nsw i32 %10, %add4
+ %add9 = add nsw i32 %add7, %add8
+
+ %add10 = add nsw i32 %9, %add6
+ %add11 = add nsw i32 %10, %add7
+ %add12 = add nsw i32 %add10, %add11
+
+ %add13 = add nsw i32 %9, %add9
+ %add14 = add nsw i32 %10, %add10
+ %add15 = add nsw i32 %add13, %add14
+
+ store i32 %add15, i32* %xx, align 4
+
+ %div = sdiv i32 %4, %5
+
+ store i32 %div, i32* %yy, align 4
+
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %11 = load i32* %i, align 4
+ %inc = add nsw i32 %11, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %12 = load i32* %xx, align 4
+ %13 = load i32* %yy, align 4
+ %add67 = add nsw i32 %12, %13
+ ret i32 %add67
+}
+
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index 97bfb5c..07373cc 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -6,9 +6,10 @@
;
; CHECK: ********** MI Scheduling **********
; CHECK: shiftable
-; CHECK: *** Final schedule for BB#0 ***
-; CHECK: ADDXrr %vreg0, %vreg2
-; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: SU(2): %vreg2<def> = SUBXri %vreg1, 20, 0
+; CHECK: Successors:
+; CHECK-NEXT: val SU(4): Latency=1 Reg=%vreg2
+; CHECK-NEXT: val SU(3): Latency=2 Reg=%vreg2
; CHECK: ********** INTERVALS **********
define i64 @shiftable(i64 %A, i64 %B) {
%tmp0 = sub i64 %B, 20
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index cfc2ebf..1cfba82 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -842,7 +842,7 @@ define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
; CHECK-LABEL: testDUP.v1i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
%b = extractelement <1 x i8> %a, i32 0
%c = insertelement <8 x i8> undef, i8 %b, i32 0
%d = insertelement <8 x i8> %c, i8 %b, i32 1
@@ -857,7 +857,7 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
; CHECK-LABEL: testDUP.v1i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
%b = extractelement <1 x i16> %a, i32 0
%c = insertelement <8 x i16> undef, i16 %b, i32 0
%d = insertelement <8 x i16> %c, i16 %b, i32 1
@@ -872,7 +872,7 @@ define <8 x i16> @testDUP.v1i16(<1 x i16> %a) {
define <4 x i32> @testDUP.v1i32(<1 x i32> %a) {
; CHECK-LABEL: testDUP.v1i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
%b = extractelement <1 x i32> %a, i32 0
%c = insertelement <4 x i32> undef, i32 %b, i32 0
%d = insertelement <4 x i32> %c, i32 %b, i32 1
@@ -1411,35 +1411,35 @@ define <16 x i8> @concat_vector_v16i8_const() {
define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK: dup v0.4h, v0.h[0]
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK: dup v0.4s, v0.s[0]
%r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK: dup v0.8b, v0.b[0]
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: dup v0.8h, v0.h[0]
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK: dup v0.16b, v0.b[0]
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
diff --git a/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index 255b90d..95c582a 100644
--- a/test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -136,8 +136,8 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d )
define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
; CHECK-LABEL: test_select_cc_v1f32:
-; CHECK: fcmp s0, s1
-; CHECK-NEXT: fcsel s0, s2, s3, eq
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK-NEXT: bsl [[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
ret <1 x float> %e
diff --git a/test/CodeGen/AArch64/arm64-shrink-v1i64.ll b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll
new file mode 100644
index 0000000..f31a570
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-shrink-v1i64.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm64 < %s
+
+; The DAGCombiner tries to perform the following shrink:
+; Convert x+y to (VT)((SmallVT)x+(SmallVT)y)
+; But it currently cannot handle vector types and would trigger an assertion
+; failure when it tried to generate an add mixing vector and scalar types.
+; This test checks that no such assertion failure occurs.
+define <1 x i64> @dotest(<1 x i64> %in0) {
+entry:
+ %0 = add <1 x i64> %in0, %in0
+ %vshl_n = shl <1 x i64> %0, <i64 32>
+ %vsra_n = ashr <1 x i64> %vshl_n, <i64 32>
+ ret <1 x i64> %vsra_n
+}
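
A scalar instance of the shrink described in the comment above, assuming VT = i64, SmallVT = i32, and that only the low 32 bits of the sum are demanded (the outer cast shown is hypothetical; the actual combine picks it based on the demanded bits):

define i64 @shrunk_add(i64 %x, i64 %y) {
  %xs = trunc i64 %x to i32    ; (SmallVT)x
  %ys = trunc i64 %y to i32    ; (SmallVT)y
  %s = add i32 %xs, %ys        ; the narrow add
  %sum = zext i32 %s to i64    ; (VT)(...) -- hypothetical outer cast
  ret i64 %sum
}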
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll
index 3949b85..3949b85 100644
--- a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
+++ b/test/CodeGen/AArch64/arm64-sqshl-uqshl-i64Contant.ll
diff --git a/test/CodeGen/AArch64/arm64-vcvt.ll b/test/CodeGen/AArch64/arm64-vcvt.ll
index 8c9e4e9..6570f0e 100644
--- a/test/CodeGen/AArch64/arm64-vcvt.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -665,19 +665,19 @@ define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
;CHECK-LABEL: autogen_SD28458:
;CHECK: fcvt
;CHECK: ret
-define void @autogen_SD28458() {
- %Tr53 = fptrunc <8 x double> undef to <8 x float>
- store <8 x float> %Tr53, <8 x float>* undef
+define void @autogen_SD28458(<8 x double> %val.f64, <8 x float>* %addr.f32) {
+ %Tr53 = fptrunc <8 x double> %val.f64 to <8 x float>
+ store <8 x float> %Tr53, <8 x float>* %addr.f32
ret void
}
;CHECK-LABEL: autogen_SD19225:
;CHECK: fcvt
;CHECK: ret
-define void @autogen_SD19225() {
- %A = load <8 x float>* undef
+define void @autogen_SD19225(<8 x double>* %addr.f64, <8 x float>* %addr.f32) {
+ %A = load <8 x float>* %addr.f32
%Tr53 = fpext <8 x float> %A to <8 x double>
- store <8 x double> %Tr53, <8 x double>* undef
+ store <8 x double> %Tr53, <8 x double>* %addr.f64
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index 82ae486..65bd50c 100644
--- a/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1313,6 +1313,15 @@ define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
ret <8 x i8> %tmp3
}
+define <8 x i8> @uqshli8b_1(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli8b_1:
+;CHECK: movi.8b [[REG:v[0-9]+]], #0x8
+;CHECK: uqshl.8b v0, v0, [[REG]]
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
+ ret <8 x i8> %tmp3
+}
+
define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli4h:
;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 6cffbde..0c300de 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
;
; Get the actual value of the overflow bit.
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 58b5d1d..26301b9 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -878,7 +878,9 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i8:
- %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %old = extractvalue { i8, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
@@ -889,8 +891,7 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -900,7 +901,9 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i16:
- %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %old = extractvalue { i16, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
@@ -911,8 +914,7 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -922,7 +924,9 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i32:
- %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %old = extractvalue { i32, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
@@ -933,8 +937,7 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
- ; As above, w1 is a reasonable guess.
-; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
+; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
@@ -944,7 +947,9 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
- %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %old = extractvalue { i64, i1 } %pair, 0
+
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll
index 1eec4cc..3a5dbdc 100644
--- a/test/CodeGen/AArch64/blockaddress.ll
+++ b/test/CodeGen/AArch64/blockaddress.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
@addr = global i8* null
diff --git a/test/CodeGen/AArch64/branch-relax-asm.ll b/test/CodeGen/AArch64/branch-relax-asm.ll
new file mode 100644
index 0000000..7409c84
--- /dev/null
+++ b/test/CodeGen/AArch64/branch-relax-asm.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -disable-block-placement -aarch64-tbz-offset-bits=4 -o - %s | FileCheck %s
+define i32 @test_asm_length(i32 %in) {
+; CHECK-LABEL: test_asm_length:
+
+ ; It would be more natural to use just one "tbnz %false" here, but if the
+ ; number of instructions in the asm is counted reasonably, that block is out
+ ; of the limited range we gave tbz. So branch relaxation has to invert the
+ ; condition.
+; CHECK: tbz w0, #0, [[TRUE:LBB[0-9]+_[0-9]+]]
+; CHECK: b [[FALSE:LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[TRUE]]:
+; CHECK: orr w0, wzr, #0x4
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: nop
+; CHECK: ret
+
+; CHECK: [[FALSE]]:
+; CHECK: ret
+
+ %val = and i32 %in, 1
+ %tst = icmp eq i32 %val, 0
+ br i1 %tst, label %true, label %false
+
+true:
+ call void asm sideeffect "nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop\0A\09nop", ""()
+ ret i32 4
+
+false:
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll
index 591f483..9524044 100644
--- a/test/CodeGen/AArch64/breg.ll
+++ b/test/CodeGen/AArch64/breg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
@stored_label = global i8* null
diff --git a/test/CodeGen/AArch64/cmpxchg-idioms.ll b/test/CodeGen/AArch64/cmpxchg-idioms.ll
new file mode 100644
index 0000000..0c008c2
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s
+
+define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_return:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: orr w0, wzr, #0x1
+; CHECK: ret
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: mov w0, wzr
+; CHECK: ret
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ %conv = zext i1 %success to i32
+ ret i32 %conv
+}
+
+define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
+; CHECK-LABEL: test_return_bool:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxrb [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1, uxtb
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+ ; FIXME: DAG combine should be able to deal with this.
+; CHECK: orr [[TMP:w[0-9]+]], wzr, #0x1
+; CHECK: eor w0, [[TMP]], #0x1
+; CHECK: ret
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: mov [[TMP:w[0-9]+]], wzr
+; CHECK: eor w0, [[TMP]], #0x1
+; CHECK: ret
+
+ %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
+ %success = extractvalue { i8, i1 } %pair, 1
+ %failure = xor i1 %success, 1
+ ret i1 %failure
+}
+
+define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_conditional:
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[LOADED:w[0-9]+]], [x0]
+; CHECK: cmp [[LOADED]], w1
+; CHECK: b.ne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [x0]
+; CHECK: cbnz [[STATUS]], [[LOOP]]
+
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: b _bar
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: b _baz
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ br i1 %success, label %true, label %false
+
+true:
+ tail call void @bar() #2
+ br label %end
+
+false:
+ tail call void @baz() #2
+ br label %end
+
+end:
+ ret void
+}
+
+declare void @bar()
+declare void @baz()
diff --git a/test/CodeGen/AArch64/compiler-ident.ll b/test/CodeGen/AArch64/compiler-ident.ll
new file mode 100644
index 0000000..0350571
--- /dev/null
+++ b/test/CodeGen/AArch64/compiler-ident.ll
@@ -0,0 +1,12 @@
+; RUN: llc -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; ModuleID = 'compiler-ident.c'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK: .ident "some LLVM version"
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"some LLVM version"}
+
diff --git a/test/CodeGen/AArch64/complex-fp-to-int.ll b/test/CodeGen/AArch64/complex-fp-to-int.ll
new file mode 100644
index 0000000..13cf762
--- /dev/null
+++ b/test/CodeGen/AArch64/complex-fp-to-int.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x i64> @test_v2f32_to_signed_v2i64(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i64:
+; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s
+; CHECK: fcvtzs.2d v0, [[VAL64]]
+
+ %val = fptosi <2 x float> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <2 x i64> @test_v2f32_to_unsigned_v2i64(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i64:
+; CHECK: fcvtl [[VAL64:v[0-9]+]].2d, v0.2s
+; CHECK: fcvtzu.2d v0, [[VAL64]]
+
+ %val = fptoui <2 x float> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <2 x i16> @test_v2f32_to_signed_v2i16(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i16:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptosi <2 x float> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i16> @test_v2f32_to_unsigned_v2i16(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i16:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptoui <2 x float> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i8> @test_v2f32_to_signed_v2i8(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_signed_v2i8:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptosi <2 x float> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <2 x i8> @test_v2f32_to_unsigned_v2i8(<2 x float> %in) {
+; CHECK-LABEL: test_v2f32_to_unsigned_v2i8:
+; CHECK: fcvtzs.2s v0, v0
+
+ %val = fptoui <2 x float> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <4 x i16> @test_v4f32_to_signed_v4i16(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_signed_v4i16:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptosi <4 x float> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <4 x i16> @test_v4f32_to_unsigned_v4i16(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_unsigned_v4i16:
+; CHECK: fcvtzu.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptoui <4 x float> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <4 x i8> @test_v4f32_to_signed_v4i8(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_signed_v4i8:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptosi <4 x float> %in to <4 x i8>
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @test_v4f32_to_unsigned_v4i8(<4 x float> %in) {
+; CHECK-LABEL: test_v4f32_to_unsigned_v4i8:
+; CHECK: fcvtzs.4s [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.4h v0, [[VAL64]]
+
+ %val = fptoui <4 x float> %in to <4 x i8>
+ ret <4 x i8> %val
+}
+
+define <2 x i32> @test_v2f64_to_signed_v2i32(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i32:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x i32> @test_v2f64_to_unsigned_v2i32(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i32:
+; CHECK: fcvtzu.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x i16> @test_v2f64_to_signed_v2i16(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i16:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i16> @test_v2f64_to_unsigned_v2i16(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i16:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i16>
+ ret <2 x i16> %val
+}
+
+define <2 x i8> @test_v2f64_to_signed_v2i8(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_signed_v2i8:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptosi <2 x double> %in to <2 x i8>
+ ret <2 x i8> %val
+}
+
+define <2 x i8> @test_v2f64_to_unsigned_v2i8(<2 x double> %in) {
+; CHECK-LABEL: test_v2f64_to_unsigned_v2i8:
+; CHECK: fcvtzs.2d [[VAL64:v[0-9]+]], v0
+; CHECK: xtn.2s v0, [[VAL64]]
+
+ %val = fptoui <2 x double> %in to <2 x i8>
+ ret <2 x i8> %val
+}
diff --git a/test/CodeGen/AArch64/complex-int-to-fp.ll b/test/CodeGen/AArch64/complex-int-to-fp.ll
new file mode 100644
index 0000000..5c943f9
--- /dev/null
+++ b/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; CHECK: autogen_SD19655
+; CHECK: scvtf
+; CHECK: ret
+define void @autogen_SD19655(<2 x i64>* %addr, <2 x float>* %addrfloat) {
+ %T = load <2 x i64>* %addr
+ %F = sitofp <2 x i64> %T to <2 x float>
+ store <2 x float> %F, <2 x float>* %addrfloat
+ ret void
+}
+
+define <2 x double> @test_signed_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i32_to_v2f64:
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], v0, #0
+; CHECK-NEXT: scvtf.2d v0, [[VAL64]]
+; CHECK-NEXT: ret
+ %conv = sitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_unsigned_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i32_to_v2f64
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], v0, #0
+; CHECK-NEXT: ucvtf.2d v0, [[VAL64]]
+; CHECK-NEXT: ret
+ %conv = uitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_signed_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i16_to_v2f64:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: scvtf.2d v0, [[VAL64]]
+
+ %conv = sitofp <2 x i16> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f64
+; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: ucvtf.2d v0, [[VAL64]]
+
+ %conv = uitofp <2 x i16> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x double> @test_signed_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i8_to_v2f64:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
+; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: scvtf.2d v0, [[VAL64]]
+
+ %conv = sitofp <2 x i8> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f64
+; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
+; CHECK: ucvtf.2d v0, [[VAL64]]
+
+ %conv = uitofp <2 x i8> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i64_to_v2f32:
+; CHECK: scvtf.2d [[VAL64:v[0-9]+]], v0
+; CHECK: fcvtn v0.2s, [[VAL64]].2d
+
+ %conv = sitofp <2 x i64> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i64_to_v2f32
+; CHECK: ucvtf.2d [[VAL64:v[0-9]+]], v0
+; CHECK: fcvtn v0.2s, [[VAL64]].2d
+
+ %conv = uitofp <2 x i64> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <2 x float> @test_signed_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i16_to_v2f32:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
+; CHECK: scvtf.2s v0, [[VAL32]]
+
+ %conv = sitofp <2 x i16> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f32
+; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ucvtf.2s v0, [[VAL32]]
+
+ %conv = uitofp <2 x i16> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <2 x float> @test_signed_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v2i8_to_v2f32:
+; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
+; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
+; CHECK: scvtf.2s v0, [[VAL32]]
+
+ %conv = sitofp <2 x i8> %v to <2 x float>
+ ret <2 x float> %conv
+}
+define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f32
+; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
+; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
+; CHECK: ucvtf.2s v0, [[VAL32]]
+
+ %conv = uitofp <2 x i8> %v to <2 x float>
+ ret <2 x float> %conv
+}
+
+define <4 x float> @test_signed_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v4i16_to_v4f32:
+; CHECK: sshll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: scvtf.4s v0, [[VAL32]]
+
+ %conv = sitofp <4 x i16> %v to <4 x float>
+ ret <4 x float> %conv
+}
+
+define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v4i16_to_v4f32
+; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: ucvtf.4s v0, [[VAL32]]
+
+ %conv = uitofp <4 x i16> %v to <4 x float>
+ ret <4 x float> %conv
+}
+
+define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_signed_v4i8_to_v4f32:
+; CHECK: shl.4h [[TMP:v[0-9]+]], v0, #8
+; CHECK: sshr.4h [[VAL16:v[0-9]+]], [[TMP]], #8
+; CHECK: sshll.4s [[VAL32:v[0-9]+]], [[VAL16]], #0
+; CHECK: scvtf.4s v0, [[VAL32]]
+
+ %conv = sitofp <4 x i8> %v to <4 x float>
+ ret <4 x float> %conv
+}
+define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
+; CHECK-LABEL: test_unsigned_v4i8_to_v4f32
+; CHECK: bic.4h v0, #0xff, lsl #8
+; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
+; CHECK: ucvtf.4s v0, [[VAL32]]
+
+ %conv = uitofp <4 x i8> %v to <4 x float>
+ ret <4 x float> %conv
+}
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 1b51928..fbea4a6 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:
diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
new file mode 100644
index 0000000..6fabdc5
--- /dev/null
+++ b/test/CodeGen/AArch64/f16-convert.ll
@@ -0,0 +1,254 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -asm-verbose=false | FileCheck %s
+
+define float @load0(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load0:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %tmp = load i16* %a, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load1(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load1:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %tmp = load i16* %a, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load2(i16* nocapture readonly %a, i32 %i) nounwind {
+; CHECK-LABEL: load2:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load3(i16* nocapture readonly %a, i32 %i) nounwind {
+; CHECK-LABEL: load3:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, w1, sxtw #1]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load4(i16* nocapture readonly %a, i64 %i) nounwind {
+; CHECK-LABEL: load4:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load5(i16* nocapture readonly %a, i64 %i) nounwind {
+; CHECK-LABEL: load5:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load6(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load6:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load7(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load7:
+; CHECK-NEXT: ldr [[HREG:h[0-9]+]], [x0, #20]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define float @load8(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load8:
+; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20]
+; CHECK-NEXT: fcvt s0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ ret float %tmp1
+}
+
+define double @load9(i16* nocapture readonly %a) nounwind {
+; CHECK-LABEL: load9:
+; CHECK-NEXT: ldur [[HREG:h[0-9]+]], [x0, #-20]
+; CHECK-NEXT: fcvt d0, [[HREG]]
+; CHECK-NEXT: ret
+
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ %tmp = load i16* %arrayidx, align 2
+ %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+ %conv = fpext float %tmp1 to double
+ ret double %conv
+}
+
+define void @store0(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store0:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ store i16 %tmp, i16* %a, align 2
+ ret void
+}
+
+define void @store1(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store1:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ store i16 %tmp, i16* %a, align 2
+ ret void
+}
+
+define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind {
+; CHECK-LABEL: store2:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind {
+; CHECK-LABEL: store3:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind {
+; CHECK-LABEL: store4:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind {
+; CHECK-LABEL: store5:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 %i
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store6(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store6:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: str h0, [x0, #20]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store7(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store7:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: str h0, [x0, #20]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store8(i16* nocapture %a, float %val) nounwind {
+; CHECK-LABEL: store8:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: stur h0, [x0, #-20]
+; CHECK-NEXT: ret
+
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+define void @store9(i16* nocapture %a, double %val) nounwind {
+; CHECK-LABEL: store9:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: stur h0, [x0, #-20]
+; CHECK-NEXT: ret
+
+ %conv = fptrunc double %val to float
+ %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+ %arrayidx = getelementptr inbounds i16* %a, i64 -10
+ store i16 %tmp, i16* %arrayidx, align 2
+ ret void
+}
+
+declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+declare float @llvm.convert.from.fp16(i16) nounwind readnone
diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
new file mode 100644
index 0000000..d02c67f
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll
@@ -0,0 +1,40 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s
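+
+; Check that FastISel selects integer multiplies directly at each width;
+; -fast-isel-abort makes llc fail outright if FastISel has to fall back to
+; SelectionDAG for any instruction.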
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @test_mul8(i8 %lhs, i8 %rhs) {
+; CHECK-LABEL: test_mul8:
+; CHECK: mul w0, w0, w1
+ %prod = mul i8 %lhs, %rhs
+ store i8 %prod, i8* @var8
+ ret void
+}
+
+define void @test_mul16(i16 %lhs, i16 %rhs) {
+; CHECK-LABEL: test_mul16:
+; CHECK: mul w0, w0, w1
+ %prod = mul i16 %lhs, %rhs
+ store i16 %prod, i16* @var16
+ ret void
+}
+
+define void @test_mul32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: test_mul32:
+; CHECK: mul w0, w0, w1
+ %prod = mul i32 %lhs, %rhs
+ store i32 %prod, i32* @var32
+ ret void
+}
+
+define void @test_mul64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: test_mul64:
+; CHECK: mul x0, x0, x1
+ %prod = mul i64 %lhs, %rhs
+ store i64 %prod, i64* @var64
+ ret void
+}
diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll
index c9b0b9f..77bbcdd 100644
--- a/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; LLVM should be able to cope with multiple uses of the same flag-setting
; instruction at different points of a routine. Either by rematerializing the
diff --git a/test/CodeGen/AArch64/funcptr_cast.ll b/test/CodeGen/AArch64/funcptr_cast.ll
new file mode 100644
index 0000000..a00b7bc
--- /dev/null
+++ b/test/CodeGen/AArch64/funcptr_cast.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
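+; Check that loading a byte through a bitcast of the function pointer @foo
+; lowers to an ordinary adrp/add address materialization plus an ldrb.
+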
+define i8 @test() {
+; CHECK-LABEL: @test
+; CHECK: adrp {{x[0-9]+}}, foo
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, :lo12:foo
+; CHECK: ldrb w0, [{{x[0-9]+}}]
+entry:
+ %0 = load i8* bitcast (void (...)* @foo to i8*), align 1
+ ret i8 %0
+}
+
+declare void @foo(...)
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
new file mode 100644
index 0000000..68aba5e
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
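+; The two internal globals below should be merged into a single
+; _MergedGlobals object, so one base address computation serves both stores.
+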
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals@PAGE
+;CHECK-APPLE-IOS-NOT: adrp
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals@PAGEOFF
+ store i32 %a1, i32* @m, align 4
+ store i32 %a2, i32* @n, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals,@object // @_MergedGlobals
+;CHECK: .local _MergedGlobals
+;CHECK: .comm _MergedGlobals,8,8
+
+;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
new file mode 100644
index 0000000..a773566
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
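+; With -global-merge-on-external, the external globals @x, @y and @z should
+; be merged into _MergedGlobals_x, with aliases emitted so each original
+; symbol still resolves to its offset within the merged object.
+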
+@x = global i32 0, align 4
+@y = global i32 0, align 4
+@z = global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS-LABEL: _f1:
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @x, align 4
+ store i32 %a2, i32* @y, align 4
+ ret void
+}
+
+define void @g1(i32 %a1, i32 %a2) {
+;CHECK-APPLE-IOS-LABEL: _g1:
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @y, align 4
+ store i32 %a2, i32* @z, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x
+;CHECK: .globl _MergedGlobals_x
+;CHECK: .align 3
+;CHECK: _MergedGlobals_x:
+;CHECK: .size _MergedGlobals_x, 12
+
+;CHECK: .globl x
+;CHECK: x = _MergedGlobals_x
+;CHECK: .globl y
+;CHECK: y = _MergedGlobals_x+4
+;CHECK: .globl z
+;CHECK: z = _MergedGlobals_x+8
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x
+;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3
+
+;CHECK-APPLE-IOS: .globl _x
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x
+;CHECK-APPLE-IOS: .globl _y
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4
+;CHECK-APPLE-IOS: .globl _z
+;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8
+;CHECK-APPLE-IOS: .subsections_via_symbols
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
new file mode 100644
index 0000000..d455d40
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+
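+; The merge should produce two objects here: the internal @z and the
+; 4000-byte @x share _MergedGlobals_x (4004 bytes in total), while @y gets
+; its own _MergedGlobals_y.
+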
+@x = global [1000 x i32] zeroinitializer, align 1
+@y = global [1000 x i32] zeroinitializer, align 1
+@z = internal global i32 1, align 4
+
+define void @f1(i32 %a1, i32 %a2, i32 %a3) {
+;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS-NOT: adrp
+;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_y@PAGE
+;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_y@PAGEOFF
+ %x3 = getelementptr inbounds [1000 x i32]* @x, i32 0, i64 3
+ %y3 = getelementptr inbounds [1000 x i32]* @y, i32 0, i64 3
+ store i32 %a1, i32* %x3, align 4
+ store i32 %a2, i32* %y3, align 4
+ store i32 %a3, i32* @z, align 4
+ ret void
+}
+
+;CHECK: .type _MergedGlobals_x,@object // @_MergedGlobals_x
+;CHECK: .globl _MergedGlobals_x
+;CHECK: .align 4
+;CHECK: _MergedGlobals_x:
+;CHECK: .size _MergedGlobals_x, 4004
+
+;CHECK: .type _MergedGlobals_y,@object // @_MergedGlobals_y
+;CHECK: .globl _MergedGlobals_y
+;CHECK: _MergedGlobals_y:
+;CHECK: .size _MergedGlobals_y, 4000
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_x ; @_MergedGlobals_x
+;CHECK-APPLE-IOS: .align 4
+;CHECK-APPLE-IOS: __MergedGlobals_x:
+;CHECK-APPLE-IOS: .long 1
+;CHECK-APPLE-IOS: .space 4000
+
+;CHECK-APPLE-IOS: .globl __MergedGlobals_y ; @_MergedGlobals_y
+;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4
+
+;CHECK: .globl x
+;CHECK: x = _MergedGlobals_x+4
+;CHECK: .globl y
+;CHECK: y = _MergedGlobals_y
+
+;CHECK-APPLE-IOS:.globl _x
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4
+;CHECK-APPLE-IOS:.globl _y
+;CHECK-APPLE-IOS: _y = __MergedGlobals_y
diff --git a/test/Transforms/GlobalMerge/AArch64/arm64.ll b/test/CodeGen/AArch64/global-merge-4.ll
index eea474a..a525ccd 100644
--- a/test/Transforms/GlobalMerge/AArch64/arm64.ll
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -1,23 +1,4 @@
-; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
-
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-; MERGE: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
-; MERGE-NOT: .zerofill __DATA,__bss,_foo,20,2
-
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
-; NO-MERGE: .zerofill __DATA,__bss,_bar,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_baz,20,2
-; NO-MERGE: .zerofill __DATA,__bss,_foo,20,2
-; NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,60,4
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -enable-global-merge -o - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
target triple = "arm64-apple-ios7.0.0"
@@ -83,6 +64,10 @@ define internal i32* @returnFoo() #1 {
ret i32* getelementptr inbounds ([5 x i32]* @foo, i64 0, i64 0)
}
+;CHECK: .type _MergedGlobals,@object // @_MergedGlobals
+;CHECK: .local _MergedGlobals
+;CHECK: .comm _MergedGlobals,60,16
+
attributes #0 = { nounwind ssp }
attributes #1 = { nounwind readnone ssp }
attributes #2 = { nounwind }
diff --git a/test/CodeGen/AArch64/global-merge.ll b/test/CodeGen/AArch64/global-merge.ll
new file mode 100644
index 0000000..aed1dc4
--- /dev/null
+++ b/test/CodeGen/AArch64/global-merge.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck --check-prefix=NO-MERGE %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -O0 -global-merge-on-external=true | FileCheck --check-prefix=NO-MERGE %s
+
+; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+; RUN: llc < %s -mtriple=aarch64-apple-ios -O0 -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-APPLE-IOS-NO-MERGE
+
+; FIXME: add O1/O2 test for aarch64-none-linux-gnu and aarch64-apple-ios
+
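+; Only the -O0 runs exist so far (see the FIXME above); they use the
+; NO-MERGE prefixes to verify that no _MergedGlobals symbol is emitted. The
+; plain CHECK lines below are staged for the future O1/O2 runs.
+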
+@m = internal global i32 0, align 4
+@n = internal global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK-LABEL: f1:
+; CHECK: adrp x{{[0-9]+}}, _MergedGlobals
+; CHECK-NOT: adrp
+
+; CHECK-APPLE-IOS-LABEL: f1:
+; CHECK-APPLE-IOS: adrp x{{[0-9]+}}, __MergedGlobals
+; CHECK-APPLE-IOS-NOT: adrp
+ store i32 %a1, i32* @m, align 4
+ store i32 %a2, i32* @n, align 4
+ ret void
+}
+
+; CHECK: .local _MergedGlobals
+; CHECK: .comm _MergedGlobals,8,8
+; NO-MERGE-NOT: .local _MergedGlobals
+
+; CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3
+; CHECK-APPLE-IOS-NO-MERGE-NOT: .zerofill __DATA,__bss,__MergedGlobals,8,3
diff --git a/test/CodeGen/AArch64/i128-fast-isel-fallback.ll b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll
new file mode 100644
index 0000000..1cffbf3
--- /dev/null
+++ b/test/CodeGen/AArch64/i128-fast-isel-fallback.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=arm64-apple-ios7.0 -mcpu=generic < %s | FileCheck %s
+
+define void @test1() {
+ %1 = sext i32 0 to i128
+ call void @test2(i128 %1)
+ ret void
+
+; The i128 is 0, so we can test that it is propagated into the x
+; registers that make up the i128 pair.
+
+; CHECK: mov x0, xzr
+; CHECK: mov x1, x0
+; CHECK: bl _test2
+
+}
+
+declare void @test2(i128)
diff --git a/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
new file mode 100644
index 0000000..645214a
--- /dev/null
+++ b/test/CodeGen/AArch64/inlineasm-ldr-pseudo.ll
@@ -0,0 +1,26 @@
+; We need to use -filetype=obj in this test because if we output assembly,
+; the current code path bypasses the parser and just writes the raw text out
+; to the Streamer. We have to actually parse the inline asm to demonstrate
+; the bug; going the asm->obj route does not show the issue.
+; RUN: llc -mtriple=aarch64 < %s -filetype=obj | llvm-objdump -arch=aarch64 -d - | FileCheck %s
+
+; CHECK-LABEL: foo:
+; CHECK: a0 79 95 d2 movz x0, #0xabcd
+; CHECK: c0 03 5f d6 ret
+define i32 @foo() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "ldr $0,=0xabcd", "=r"() nounwind
+ ret i32 %0
+}
+; CHECK-LABEL: bar:
+; CHECK: 40 00 00 58 ldr x0, #8
+; CHECK: c0 03 5f d6 ret
+; Make sure the constant pool entry comes after the return
+; CHECK-LABEL: $d.1:
+define i32 @bar() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "ldr $0,=0x10001", "=r"() nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 1dfb789..69fbd99 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
index 1ce5c95..e4f4295 100644
--- a/test/CodeGen/AArch64/ldst-opt.ll
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
; This file contains tests for the AArch64 load/store optimizer.
@@ -166,6 +166,217 @@ bar:
; Check the following transform:
;
+; add x8, x8, #16
+; ...
+; ldr X, [x8]
+; ->
+; ldr X, [x8, #16]!
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+%pre.struct.i32 = type { i32, i32, i32}
+%pre.struct.i64 = type { i32, i64, i64}
+%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>}
+%pre.struct.float = type { i32, float, float}
+%pre.struct.double = type { i32, double, double}
+
+define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond,
+ %pre.struct.i32* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-word2
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i32** %this
+ %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load i32* %retptr
+ ret i32 %ret
+}
+
+define i64 @load-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond,
+ %pre.struct.i64* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-doubleword2
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i64** %this
+ %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load i64* %retptr
+ ret i64 %ret
+}
+
+define <2 x i64> @load-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond,
+ %pre.struct.i128* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-quadword2
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i128** %this
+ %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load <2 x i64>* %retptr
+ ret <2 x i64> %ret
+}
+
+define float @load-pre-indexed-float2(%pre.struct.float** %this, i1 %cond,
+ %pre.struct.float* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-float2
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.float** %this
+ %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load float* %retptr
+ ret float %ret
+}
+
+define double @load-pre-indexed-double2(%pre.struct.double** %this, i1 %cond,
+ %pre.struct.double* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-double2
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.double** %this
+ %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ %ret = load double* %retptr
+ ret double %ret
+}
+
+; Check the following transform:
+;
+; add x8, x8, #16
+; ...
+; str X, [x8]
+; ->
+; str X, [x8, #16]!
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @store-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond,
+ %pre.struct.i32* %load2,
+ i32 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-word2
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i32** %this
+ %gep1 = getelementptr inbounds %pre.struct.i32* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i32* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store i32 %val, i32* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-doubleword2(%pre.struct.i64** %this, i1 %cond,
+ %pre.struct.i64* %load2,
+ i64 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-doubleword2
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i64** %this
+ %gep1 = getelementptr inbounds %pre.struct.i64* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i64* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store i64 %val, i64* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-quadword2(%pre.struct.i128** %this, i1 %cond,
+ %pre.struct.i128* %load2,
+ <2 x i64> %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-quadword2
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.i128** %this
+ %gep1 = getelementptr inbounds %pre.struct.i128* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.i128* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store <2 x i64> %val, <2 x i64>* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-float2(%pre.struct.float** %this, i1 %cond,
+ %pre.struct.float* %load2,
+ float %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-float2
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #4]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.float** %this
+ %gep1 = getelementptr inbounds %pre.struct.float* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.float* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store float %val, float* %retptr
+ ret void
+}
+
+define void @store-pre-indexed-double2(%pre.struct.double** %this, i1 %cond,
+ %pre.struct.double* %load2,
+ double %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-double2
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+ br i1 %cond, label %if.then, label %if.end
+if.then:
+ %load1 = load %pre.struct.double** %this
+ %gep1 = getelementptr inbounds %pre.struct.double* %load1, i64 0, i32 1
+ br label %return
+if.end:
+ %gep2 = getelementptr inbounds %pre.struct.double* %load2, i64 0, i32 2
+ br label %return
+return:
+ %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+ store double %val, double* %retptr
+ ret void
+}
+
+; Check the following transform:
+;
; ldr X, [x20]
; ...
; add x20, x20, #32
@@ -294,8 +505,263 @@ exit:
ret void
}
+; Check the following transform:
+;
+; str X, [x20]
+; ...
+; add x20, x20, #32
+; ->
+; str X, [x20], #32
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-word
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr i32* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i32* %iv2, i64 -1
+ %load = load i32* %gep2
+ call void @use-word(i32 %load)
+ store i32 %val, i32* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i32* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-doubleword(i64* %array, i64 %count, i64 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-doubleword
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr i64* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i64* %iv2, i64 -1
+ %load = load i64* %gep2
+ call void @use-doubleword(i64 %load)
+ store i64 %val, i64* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i64* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-quadword(<2 x i64>* %array, i64 %count, <2 x i64> %val) nounwind {
+; CHECK-LABEL: store-post-indexed-quadword
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #64
+entry:
+ %gep1 = getelementptr <2 x i64>* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr <2 x i64>* %iv2, i64 -1
+ %load = load <2 x i64>* %gep2
+ call void @use-quadword(<2 x i64> %load)
+ store <2 x i64> %val, <2 x i64>* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr <2 x i64>* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-float(float* %array, i64 %count, float %val) nounwind {
+; CHECK-LABEL: store-post-indexed-float
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr float* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr float* %iv2, i64 -1
+ %load = load float* %gep2
+ call void @use-float(float %load)
+ store float %val, float* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr float* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @store-post-indexed-double(double* %array, i64 %count, double %val) nounwind {
+; CHECK-LABEL: store-post-indexed-double
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr double* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr double* %iv2, i64 -1
+ %load = load double* %gep2
+ call void @use-double(double %load)
+ store double %val, double* %iv2
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr double* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
declare void @use-word(i32)
declare void @use-doubleword(i64)
declare void @use-quadword(<2 x i64>)
declare void @use-float(float)
declare void @use-double(double)
+
+; Check the following transform:
+;
+; (ldr|str) X, [x20]
+; ...
+; sub x20, x20, #16
+; ->
+; (ldr|str) X, [x20], #-16
+;
+; with X being either w0, x0, s0, d0 or q0.
+
+define void @post-indexed-sub-word(i32* %a, i32* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #-8
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #-8
+ br label %for.body
+for.body:
+ %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr i32* %phi1, i64 -1
+ %load1 = load i32* %gep1
+ %gep2 = getelementptr i32* %phi2, i64 -1
+ store i32 %load1, i32* %gep2
+ %load2 = load i32* %phi1
+ store i32 %load2, i32* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr i32* %phi2, i64 -2
+ %gep4 = getelementptr i32* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-doubleword(i64* %a, i64* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #-16
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #-16
+ br label %for.body
+for.body:
+ %phi1 = phi i64* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi i64* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr i64* %phi1, i64 -1
+ %load1 = load i64* %gep1
+ %gep2 = getelementptr i64* %phi2, i64 -1
+ store i64 %load1, i64* %gep2
+ %load2 = load i64* %phi1
+ store i64 %load2, i64* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr i64* %phi2, i64 -2
+ %gep4 = getelementptr i64* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-quadword(<2 x i64>* %a, <2 x i64>* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #-32
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #-32
+ br label %for.body
+for.body:
+ %phi1 = phi <2 x i64>* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi <2 x i64>* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr <2 x i64>* %phi1, i64 -1
+ %load1 = load <2 x i64>* %gep1
+ %gep2 = getelementptr <2 x i64>* %phi2, i64 -1
+ store <2 x i64> %load1, <2 x i64>* %gep2
+ %load2 = load <2 x i64>* %phi1
+ store <2 x i64> %load2, <2 x i64>* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr <2 x i64>* %phi2, i64 -2
+ %gep4 = getelementptr <2 x i64>* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-float(float* %a, float* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #-8
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #-8
+ br label %for.body
+for.body:
+ %phi1 = phi float* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi float* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr float* %phi1, i64 -1
+ %load1 = load float* %gep1
+ %gep2 = getelementptr float* %phi2, i64 -1
+ store float %load1, float* %gep2
+ %load2 = load float* %phi1
+ store float %load2, float* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr float* %phi2, i64 -2
+ %gep4 = getelementptr float* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
+
+define void @post-indexed-sub-double(double* %a, double* %b, i64 %count) nounwind {
+; CHECK-LABEL: post-indexed-sub-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #-16
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #-16
+ br label %for.body
+for.body:
+ %phi1 = phi double* [ %gep4, %for.body ], [ %b, %0 ]
+ %phi2 = phi double* [ %gep3, %for.body ], [ %a, %0 ]
+ %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
+ %gep1 = getelementptr double* %phi1, i64 -1
+ %load1 = load double* %gep1
+ %gep2 = getelementptr double* %phi2, i64 -1
+ store double %load1, double* %gep2
+ %load2 = load double* %phi1
+ store double %load2, double* %phi2
+ %dec.i = add nsw i64 %i, -1
+ %gep3 = getelementptr double* %phi2, i64 -2
+ %gep4 = getelementptr double* %phi1, i64 -2
+ %cond = icmp sgt i64 %dec.i, 0
+ br i1 %cond, label %for.body, label %end
+end:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index 77493d8..125995c 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -2,8 +2,7 @@ import re
config.suffixes = ['.ll']
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if 'AArch64' not in config.root.targets:
config.unsupported = True
# For now we don't test arm64-win32.
diff --git a/test/CodeGen/AArch64/memcpy-f128.ll b/test/CodeGen/AArch64/memcpy-f128.ll
new file mode 100644
index 0000000..76db297
--- /dev/null
+++ b/test/CodeGen/AArch64/memcpy-f128.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+%structA = type { i128 }
+@stubA = internal unnamed_addr constant %structA zeroinitializer, align 8
+
+; Make sure we don't hit llvm_unreachable.
+
+define void @test1() {
+; CHECK-LABEL: @test1
+; CHECK: adrp
+; CHECK: ldr q0
+; CHECK: str q0
+; CHECK: ret
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* bitcast (%structA* @stubA to i8*), i64 48, i32 8, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/CodeGen/AArch64/mul_pow2.ll b/test/CodeGen/AArch64/mul_pow2.ll
new file mode 100644
index 0000000..efc0ec8
--- /dev/null
+++ b/test/CodeGen/AArch64/mul_pow2.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=aarch64 | FileCheck %s
+
+; Convert mul x, pow2 to shift.
+; Convert mul x, pow2 +/- 1 to shift + add/sub.
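+; For example, mul x, 5 becomes add w0, w0, w0, lsl #2, and mul x, 7 becomes
+; an lsl #3 followed by a sub (see test5 and test7 below).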
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: test2
+; CHECK: lsl w0, w0, #1
+
+ %mul = shl nsw i32 %x, 1
+ ret i32 %mul
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: test3
+; CHECK: add w0, w0, w0, lsl #1
+
+ %mul = mul nsw i32 %x, 3
+ ret i32 %mul
+}
+
+define i32 @test4(i32 %x) {
+; CHECK-LABEL: test4
+; CHECK: lsl w0, w0, #2
+
+ %mul = shl nsw i32 %x, 2
+ ret i32 %mul
+}
+
+define i32 @test5(i32 %x) {
+; CHECK-LABEL: test5
+; CHECK: add w0, w0, w0, lsl #2
+
+ %mul = mul nsw i32 %x, 5
+ ret i32 %mul
+}
+
+define i32 @test7(i32 %x) {
+; CHECK-LABEL: test7
+; CHECK: lsl {{w[0-9]+}}, w0, #3
+; CHECK: sub w0, {{w[0-9]+}}, w0
+
+ %mul = mul nsw i32 %x, 7
+ ret i32 %mul
+}
+
+define i32 @test8(i32 %x) {
+; CHECK-LABEL: test8
+; CHECK: lsl w0, w0, #3
+
+ %mul = shl nsw i32 %x, 3
+ ret i32 %mul
+}
+
+define i32 @test9(i32 %x) {
+; CHECK-LABEL: test9
+; CHECK: add w0, w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, 9
+ ret i32 %mul
+}
+
+; Convert mul x, -pow2 to shift.
+; Convert mul x, -(pow2 +/- 1) to shift + add/sub.
+
+define i32 @ntest2(i32 %x) {
+; CHECK-LABEL: ntest2
+; CHECK: neg w0, w0, lsl #1
+
+ %mul = mul nsw i32 %x, -2
+ ret i32 %mul
+}
+
+define i32 @ntest3(i32 %x) {
+; CHECK-LABEL: ntest3
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #1
+; CHECK: neg w0, {{w[0-9]+}}
+
+ %mul = mul nsw i32 %x, -3
+ ret i32 %mul
+}
+
+define i32 @ntest4(i32 %x) {
+; CHECK-LABEL: ntest4
+; CHECK: neg w0, w0, lsl #2
+
+ %mul = mul nsw i32 %x, -4
+ ret i32 %mul
+}
+
+define i32 @ntest5(i32 %x) {
+; CHECK-LABEL: ntest5
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #2
+; CHECK: neg w0, {{w[0-9]+}}
+ %mul = mul nsw i32 %x, -5
+ ret i32 %mul
+}
+
+define i32 @ntest7(i32 %x) {
+; CHECK-LABEL: ntest7
+; CHECK: sub w0, w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, -7
+ ret i32 %mul
+}
+
+define i32 @ntest8(i32 %x) {
+; CHECK-LABEL: ntest8
+; CHECK: neg w0, w0, lsl #3
+
+ %mul = mul nsw i32 %x, -8
+ ret i32 %mul
+}
+
+define i32 @ntest9(i32 %x) {
+; CHECK-LABEL: ntest9
+; CHECK: add {{w[0-9]+}}, w0, w0, lsl #3
+; CHECK: neg w0, {{w[0-9]+}}
+
+ %mul = mul nsw i32 %x, -9
+ ret i32 %mul
+}
diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll
index e32ac84..03c3f33 100644
--- a/test/CodeGen/AArch64/regress-tail-livereg.ll
+++ b/test/CodeGen/AArch64/regress-tail-livereg.ll
@@ -17,3 +17,17 @@ define void @foo() {
; CHECK: br {{x([0-79]|1[0-8])}}
ret void
}
+
+; No matter how tempting it is, LLVM should not use x30 since that'll be
+; restored to its incoming value before the "br".
+define void @test_x30_tail() {
+; CHECK-LABEL: test_x30_tail:
+; CHECK: mov [[DEST:x[0-9]+]], x30
+; CHECK: br [[DEST]]
+ %addr = call i8* @llvm.returnaddress(i32 0)
+ %faddr = bitcast i8* %addr to void()*
+ tail call void %faddr()
+ ret void
+}
+
+declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll
new file mode 100644
index 0000000..159b8e0
--- /dev/null
+++ b/test/CodeGen/AArch64/trunc-v1i64.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+; A DAG Combiner optimization that folds
+; (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)
+; generates nodes such as:
+; v1i32 trunc v1i64, v1i16 trunc v1i64, v1i8 trunc v1i64.
+; By default such nodes are scalarized during type legalization, but that
+; scalarization hits an assertion failure because v1i64 is a legal type on
+; AArch64. We therefore change the default behaviour from scalarizing to
+; widening.
+
+; FIXME: Currently an XTN is generated for v1i32, but it could be optimized
+; away: as with v1i16 and v1i8, no XTN should be generated.
+
+define <2 x i32> @test_v1i32_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_0:
+; CHECK: xtn v0.2s, v0.2d
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+ %2 = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %2
+}
+
+define <2 x i32> @test_v1i32_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i32_1:
+; CHECK: xtn v0.2s, v0.2d
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+ %2 = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %2
+}
+
+define <4 x i16> @test_v1i16_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_0:
+; CHECK-NOT: xtn
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <4 x i64> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <4 x i16> @test_v1i16_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i16_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.4h, v0.h[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
+ %2 = trunc <4 x i64> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i8> @test_v1i8_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_0:
+; CHECK-NOT: xtn
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <8 x i64> %1 to <8 x i8>
+ ret <8 x i8> %2
+}
+
+define <8 x i8> @test_v1i8_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i8_1:
+; CHECK-NOT: xtn
+; CHECK: dup v0.8b, v0.b[0]
+ %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = trunc <8 x i64> %1 to <8 x i8>
+ ret <8 x i8> %2
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll
index 8a2fe26..5dc7b5d 100644
--- a/test/CodeGen/AArch64/tst-br.ll
+++ b/test/CodeGen/AArch64/tst-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
; We've got the usual issues with LLVM reordering blocks here. The
; tests are correct for the current order, but who knows when that
diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
index ca5ae8b..2597b41 100644
--- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
+++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "armv7-eabi"
diff --git a/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll b/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll
index 4fb2be0..38eb0ea 100644
--- a/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll
+++ b/test/CodeGen/ARM/2009-11-07-SubRegAsmPrinting.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp < %s | FileCheck %s
+; RUN: llc -mcpu=cortex-a8 -mattr=-neonfp -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
; PR5423
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
index 35995b7..b040b2d 100644
--- a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
+++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
@@ -4,22 +4,26 @@
%struct.foo = type { i64, i64 }
-define zeroext i8 @t(%struct.foo* %this) noreturn optsize {
+define zeroext i8 @t(%struct.foo* %this, i1 %tst) noreturn optsize {
entry:
; ARM-LABEL: t:
-; ARM: str r2, [r1], r0
+; ARM-DAG: mov r[[ADDR:[0-9]+]], #8
+; ARM-DAG: mov [[VAL:r[0-9]+]], #0
+; ARM: str [[VAL]], [r[[ADDR]]], r0
; THUMB-LABEL: t:
-; THUMB-NOT: str r0, [r1], r0
-; THUMB: str r1, [r0]
+; THUMB-DAG: movs r[[ADDR:[0-9]+]], #8
+; THUMB-DAG: movs [[VAL:r[0-9]+]], #0
+; THUMB-NOT: str {{[a-z0-9]+}}, [{{[a-z0-9]+}}], {{[a-z0-9]+}}
+; THUMB: str [[VAL]], [r[[ADDR]]]
%0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; <i64*> [#uses=1]
store i32 0, i32* inttoptr (i32 8 to i32*), align 8
- br i1 undef, label %bb.nph96, label %bb3
+ br i1 %tst, label %bb.nph96, label %bb3
bb3: ; preds = %entry
%1 = load i64* %0, align 4 ; <i64> [#uses=0]
- unreachable
+ ret i8 42
bb.nph96: ; preds = %entry
- unreachable
+ ret i8 3
}
diff --git a/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll b/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll
index 32d350e9..e7e0580 100644
--- a/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll
+++ b/test/CodeGen/ARM/2010-10-25-ifcvt-ldm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s | FileCheck %s
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s
; Radar 8589805: Counting the number of microcoded operations, such as for an
; LDM instruction, was causing an assertion failure because the microop count
; was being treated as an instruction count.
@@ -11,7 +11,7 @@
define i32 @test(i32 %x) {
entry:
%0 = tail call signext i16 undef(i32* undef)
- switch i32 undef, label %bb3 [
+ switch i32 %x, label %bb3 [
i32 0, label %bb4
i32 1, label %bb1
i32 2, label %bb2
diff --git a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
index 85a1137..3950c9e 100644
--- a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -O3 -mtriple=armv6-apple-darwin -relocation-model=pic -mcpu=arm1136jf-s -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://8959122 illegal register operands for UMULL instruction
; in cfrac nightly test.
; Armv6 generates a umull that must write to two distinct destination regs.
@@ -7,7 +7,7 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:64-n32"
target triple = "armv6-apple-darwin10"
-define void @ptoa() nounwind {
+define void @ptoa(i1 %tst, i8* %p8, i8 %val8) nounwind {
entry:
br i1 false, label %bb3, label %bb
@@ -16,7 +16,7 @@ bb: ; preds = %entry
bb3: ; preds = %bb, %entry
%0 = call noalias i8* @malloc() nounwind
- br i1 undef, label %bb46, label %bb8
+ br i1 %tst, label %bb46, label %bb8
bb8: ; preds = %bb3
%1 = getelementptr inbounds i8* %0, i32 0
@@ -35,7 +35,7 @@ bb8: ; preds = %bb3
%7 = or i8 %6, 48
%8 = add i8 %6, 87
%iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8
- store i8 %iftmp.5.0.1, i8* undef, align 1
+ store i8 %iftmp.5.0.1, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -49,7 +49,7 @@ bb8: ; preds = %bb3
%13 = or i8 %12, 48
%14 = add i8 %12, 87
%iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14
- store i8 %iftmp.5.0.2, i8* undef, align 1
+ store i8 %iftmp.5.0.2, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -73,8 +73,8 @@ bb8: ; preds = %bb3
%21 = udiv i32 %2, 100000
%22 = urem i32 %21, 10
%23 = icmp ult i32 %22, 10
- %iftmp.5.0.5 = select i1 %23, i8 0, i8 undef
- store i8 %iftmp.5.0.5, i8* undef, align 1
+ %iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8
+ store i8 %iftmp.5.0.5, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -88,7 +88,7 @@ bb8: ; preds = %bb3
%28 = or i8 %27, 48
%29 = add i8 %27, 87
%iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29
- store i8 %iftmp.5.0.6, i8* undef, align 1
+ store i8 %iftmp.5.0.6, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -102,7 +102,7 @@ bb8: ; preds = %bb3
%34 = or i8 %33, 48
%35 = add i8 %33, 87
%iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
- store i8 %iftmp.5.0.7, i8* undef, align 1
+ store i8 %iftmp.5.0.7, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -117,7 +117,7 @@ bb8: ; preds = %bb3
%41 = add i8 %39, 87
%iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41
store i8 %iftmp.5.0.8, i8* null, align 1
- unreachable
+ br label %bb46
bb46: ; preds = %bb3
ret void
diff --git a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
index bc72e12..837feb6 100644
--- a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
+++ b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
@@ -8,7 +8,7 @@
@oStruct = external global %struct.Outer, align 4
-define void @main() nounwind {
+define void @main(i8 %val8) nounwind {
; CHECK-LABEL: main:
; CHECK-NOT: ldrd
; CHECK: mul
@@ -28,7 +28,7 @@ for.body: ; preds = %_Z14printIsNotZeroi
br i1 %tobool.i14, label %_Z14printIsNotZeroi.exit17, label %if.then.i16
if.then.i16: ; preds = %_Z14printIsNotZeroi.exit
- unreachable
+ ret void
_Z14printIsNotZeroi.exit17: ; preds = %_Z14printIsNotZeroi.exit
br label %_Z14printIsNotZeroi.exit17.for.body_crit_edge
@@ -36,7 +36,7 @@ _Z14printIsNotZeroi.exit17: ; preds = %_Z14printIsNotZeroi
_Z14printIsNotZeroi.exit17.for.body_crit_edge: ; preds = %_Z14printIsNotZeroi.exit17
%b.phi.trans.insert = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %inc, i32 3
%tmp3.pre = load i8* %b.phi.trans.insert, align 1
- %phitmp27 = icmp eq i8 undef, 0
+ %phitmp27 = icmp eq i8 %val8, 0
br label %for.body
for.end: ; preds = %_Z14printIsNotZeroi.exit17
diff --git a/test/CodeGen/ARM/2012-11-14-subs_carry.ll b/test/CodeGen/ARM/2012-11-14-subs_carry.ll
index 8df295a..3308330 100644
--- a/test/CodeGen/ARM/2012-11-14-subs_carry.ll
+++ b/test/CodeGen/ARM/2012-11-14-subs_carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
;CHECK-LABEL: foo:
;CHECK: adds
diff --git a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
index 480d087..162f863 100644
--- a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
+++ b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
@@ -42,34 +42,34 @@ UnifiedReturnBlock:
ret i32 %tmp13
}
-define hidden fastcc void @t3(i8** %retaddr) {
+define hidden fastcc void @t3(i8** %retaddr, i1 %tst, i8* %p8) {
; CHECK-LABEL: t3:
; CHECK: Block address taken
; CHECK-NOT: Address of block that was removed by CodeGen
bb:
store i8* blockaddress(@t3, %KBBlockZero_return_1), i8** %retaddr
- br i1 undef, label %bb77, label %bb7.i
+ br i1 %tst, label %bb77, label %bb7.i
bb7.i: ; preds = %bb35
br label %bb2.i
KBBlockZero_return_1: ; preds = %KBBlockZero.exit
- unreachable
+ ret void
KBBlockZero_return_0: ; preds = %KBBlockZero.exit
- unreachable
+ ret void
bb77: ; preds = %bb26, %bb12, %bb
ret void
bb2.i: ; preds = %bb6.i350, %bb7.i
- br i1 undef, label %bb6.i350, label %KBBlockZero.exit
+ br i1 %tst, label %bb6.i350, label %KBBlockZero.exit
bb6.i350: ; preds = %bb2.i
br label %bb2.i
KBBlockZero.exit: ; preds = %bb2.i
- indirectbr i8* undef, [label %KBBlockZero_return_1, label %KBBlockZero_return_0]
+ indirectbr i8* %p8, [label %KBBlockZero_return_1, label %KBBlockZero_return_0]
}
@foo = global i32 ()* null
diff --git a/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll b/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
index a438c1f..05a4ef0 100644
--- a/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
+++ b/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; ModuleID = 'bugpoint-reduced-simplified.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7--linux-gnueabi"
diff --git a/test/CodeGen/ARM/Windows/dllimport.ll b/test/CodeGen/ARM/Windows/dllimport.ll
new file mode 100644
index 0000000..bc737bd
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/dllimport.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple thumbv7-windows -filetype asm -o - %s | FileCheck %s
+
+; ModuleID = 'dllimport.c'
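+
+; Accesses to dllimport symbols must be indirected through the corresponding
+; __imp_ pointer, while plain external symbols such as @ext are addressed
+; directly.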
+
+@var = external dllimport global i32
+@ext = external global i32
+declare dllimport arm_aapcs_vfpcc i32 @external()
+declare arm_aapcs_vfpcc i32 @internal()
+
+define arm_aapcs_vfpcc i32 @get_var() {
+ %1 = load i32* @var, align 4
+ ret i32 %1
+}
+
+; CHECK-LABEL: get_var
+; CHECK: movw r0, :lower16:__imp_var
+; CHECK: movt r0, :upper16:__imp_var
+; CHECK: ldr r0, [r0]
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+
+define arm_aapcs_vfpcc i32 @get_ext() {
+ %1 = load i32* @ext, align 4
+ ret i32 %1
+}
+
+; CHECK-LABEL: get_ext
+; CHECK: movw r0, :lower16:ext
+; CHECK: movt r0, :upper16:ext
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+
+define arm_aapcs_vfpcc i32* @get_var_pointer() {
+ ret i32* @var
+}
+
+; CHECK-LABEL: get_var_pointer
+; CHECK: movw r0, :lower16:__imp_var
+; CHECK: movt r0, :upper16:__imp_var
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+
+define arm_aapcs_vfpcc i32 @call_external() {
+ %call = tail call arm_aapcs_vfpcc i32 @external()
+ ret i32 %call
+}
+
+; CHECK-LABEL: call_external
+; CHECK: movw r0, :lower16:__imp_external
+; CHECK: movt r0, :upper16:__imp_external
+; CHECK: ldr r0, [r0]
+; CHECK: bx r0
+
+define arm_aapcs_vfpcc i32 @call_internal() {
+ %call = tail call arm_aapcs_vfpcc i32 @internal()
+ ret i32 %call
+}
+
+; CHECK-LABEL: call_internal
+; CHECK: b internal
+
diff --git a/test/CodeGen/ARM/Windows/global-minsize.ll b/test/CodeGen/ARM/Windows/global-minsize.ll
new file mode 100644
index 0000000..c0be36c
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/global-minsize.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=thumbv7-windows -filetype asm -o - %s | FileCheck %s
+
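+; Materializing the address of a global on Windows should use a movw/movt
+; pair even under minsize, where a literal-pool load might otherwise be
+; preferred.
+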
+@i = internal global i32 0, align 4
+
+; Function Attrs: minsize
+define arm_aapcs_vfpcc i32* @function() #0 {
+entry:
+ ret i32* @i
+}
+
+attributes #0 = { minsize }
+
+; CHECK: function:
+; CHECK: movw r0, :lower16:i
+; CHECK: movt r0, :upper16:i
+; CHECK: bx lr
diff --git a/test/CodeGen/ARM/Windows/long-calls.ll b/test/CodeGen/ARM/Windows/long-calls.ll
new file mode 100644
index 0000000..e35f414
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/long-calls.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -arm-long-calls -o - %s \
+; RUN: | FileCheck %s
+
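+; With -arm-long-calls, even a direct call to a known callee must go through
+; a register whose value is loaded from a constant-pool entry.
+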
+declare arm_aapcs_vfpcc void @callee()
+
+define arm_aapcs_vfpcc void @caller() nounwind {
+entry:
+ tail call void @callee()
+ ret void
+}
+
+; CHECK-LABEL: caller
+; CHECK: ldr [[REG:r[0-9]+]], [[CPI:.LCPI[_0-9]+]]
+; CHECK: bx [[REG]]
+; CHECK: .align 2
+; CHECK: [[CPI]]:
+; CHECK: .long callee
+
diff --git a/test/CodeGen/ARM/Windows/structors.ll b/test/CodeGen/ARM/Windows/structors.ll
new file mode 100644
index 0000000..a1a9026
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/structors.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple thumbv7-windows-itanium -o - %s | FileCheck %s
+
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @function, i8* null }]
+
+define arm_aapcs_vfpcc void @function() {
+entry:
+ ret void
+}
+
+; CHECK: .section .CRT$XCU,"rd"
+; CHECK: .long function
+
diff --git a/test/CodeGen/ARM/Windows/vla.ll b/test/CodeGen/ARM/Windows/vla.ll
new file mode 100644
index 0000000..56901de
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/vla.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-SMALL-CODE
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -code-model=large -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-LARGE-CODE
+; RUN: llc -mtriple=thumbv7-windows-msvc -mcpu=cortex-a9 -o - %s \
+; RUN: | FileCheck %s -check-prefix CHECK-MSVC
+
+define arm_aapcs_vfpcc i8 @function(i32 %sz, i32 %idx) {
+entry:
+ %vla = alloca i8, i32 %sz, align 1
+ %arrayidx = getelementptr inbounds i8* %vla, i32 %idx
+ %0 = load volatile i8* %arrayidx, align 1
+ ret i8 %0
+}
+
+; CHECK-SMALL-CODE: adds [[R4:r[0-9]+]], #7
+; CHECK-SMALL-CODE: bic [[R4]], [[R4]], #7
+; CHECK-SMALL-CODE: lsrs r4, [[R4]], #2
+; CHECK-SMALL-CODE: bl __chkstk
+; CHECK-SMALL-CODE: sub.w sp, sp, r4
+
+; CHECK-LARGE-CODE: adds [[R4:r[0-9]+]], #7
+; CHECK-LARGE-CODE: bic [[R4]], [[R4]], #7
+; CHECK-LARGE-CODE: lsrs r4, [[R4]], #2
+; CHECK-LARGE-CODE: movw [[IP:r[0-9]+]], :lower16:__chkstk
+; CHECK-LARGE-CODE: movt [[IP]], :upper16:__chkstk
+; CHECK-LARGE-CODE: blx [[IP]]
+; CHECK-LARGE-CODE: sub.w sp, sp, r4
+
+; CHECK-MSVC-NOT: __chkstk
+
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index 4de305b..f55ae10 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -29,7 +29,7 @@ define i32 @foo_f() {
@bar_i = alias internal i32* @bar
-@A = alias i64, i32* @bar
+@A = alias bitcast (i32* @bar to i64*)
define i32 @test() {
entry:
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index bf827d6..14eef83 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck -check-prefix=ARM %s
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck -check-prefix=THUMB %s
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
+; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=ARM %s
+; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=THUMB %s
+; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
; RUN: | FileCheck -check-prefix=T2 %s
-; RUN: llc -mtriple=thumbv8-eabi %s -o - | FileCheck -check-prefix=V8 %s
+; RUN: llc -mtriple=thumbv8-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=V8 %s
; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 9913f30..462c185 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -171,9 +171,10 @@ define i64 @test6(i64* %ptr, i64 %val) {
define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-LABEL: test7:
-; CHECK: dmb {{ish$}}
+; CHECK-DAG: mov [[VAL1LO:r[0-9]+]], r1
+; CHECK-DAG: dmb {{ish$}}
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], r1
+; CHECK-LE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG1]], [[VAL1LO]]
; CHECK-LE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG2]], r2
; CHECK-BE-DAG: eor [[MISMATCH_LO:r[0-9]+]], [[REG2]], r2
; CHECK-BE-DAG: eor [[MISMATCH_HI:r[0-9]+]], [[REG1]], r1
@@ -189,16 +190,17 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
; CHECK-THUMB-LE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
-; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]]
-; CHECK-THUMB: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
+; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2
+; CHECK-THUMB-BE-DAG: eor.w [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3
+; CHECK-THUMB-LE: orrs [[MISMATCH_HI]], [[MISMATCH_LO]]
; CHECK-THUMB: bne
; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
; CHECK-THUMB: cmp
; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
- %r = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
+ %pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
+ %r = extractvalue { i64, i1 } %pair, 0
ret i64 %r
}
diff --git a/test/CodeGen/ARM/atomic-cmp.ll b/test/CodeGen/ARM/atomic-cmp.ll
index a473807..629b16d 100644
--- a/test/CodeGen/ARM/atomic-cmp.ll
+++ b/test/CodeGen/ARM/atomic-cmp.ll
@@ -11,5 +11,6 @@ define i8 @t(i8* %a, i8 %b, i8 %c) nounwind {
; T2: ldrexb
; T2: strexb
%tmp0 = cmpxchg i8* %a, i8 %b, i8 %c monotonic monotonic
- ret i8 %tmp0
+ %tmp1 = extractvalue { i8, i1 } %tmp0, 0
+ ret i8 %tmp1
}
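+; Note: cmpxchg now returns a { value, success } pair rather than a bare
+; value, so tests extract whichever field they need. A minimal sketch of
+; both fields (hypothetical, not exercised by the CHECK lines above):
+;   %pair = cmpxchg i32* %p, i32 %old, i32 %new seq_cst seq_cst
+;   %loaded = extractvalue { i32, i1 } %pair, 0
+;   %ok = extractvalue { i32, i1 } %pair, 1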
diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 45a263d..49342d2 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll
@@ -5,13 +5,13 @@
; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
define void @test1(i32* %ptr, i32 %val1) {
-; ARM: test1
+; ARM-LABEL: test1
; ARM: dmb {{ish$}}
; ARM-NEXT: str
; ARM-NEXT: dmb {{ish$}}
-; THUMBONE: test1
+; THUMBONE-LABEL: test1
; THUMBONE: __sync_lock_test_and_set_4
-; THUMBTWO: test1
+; THUMBTWO-LABEL: test1
; THUMBTWO: dmb {{ish$}}
; THUMBTWO-NEXT: str
; THUMBTWO-NEXT: dmb {{ish$}}
@@ -20,12 +20,12 @@ define void @test1(i32* %ptr, i32 %val1) {
}
define i32 @test2(i32* %ptr) {
-; ARM: test2
+; ARM-LABEL: test2
; ARM: ldr
; ARM-NEXT: dmb {{ish$}}
-; THUMBONE: test2
+; THUMBONE-LABEL: test2
; THUMBONE: __sync_val_compare_and_swap_4
-; THUMBTWO: test2
+; THUMBTWO-LABEL: test2
; THUMBTWO: ldr
; THUMBTWO-NEXT: dmb {{ish$}}
%val = load atomic i32* %ptr seq_cst, align 4
@@ -33,22 +33,35 @@ define i32 @test2(i32* %ptr) {
}
define void @test3(i8* %ptr1, i8* %ptr2) {
-; ARM: test3
+; ARM-LABEL: test3
+; ARM-NOT: dmb
; ARM: ldrb
+; ARM-NOT: dmb
; ARM: strb
-; THUMBTWO: test3
+; ARM-NOT: dmb
+; ARM: bx lr
+
+; THUMBTWO-LABEL: test3
+; THUMBTWO-NOT: dmb
; THUMBTWO: ldrb
+; THUMBTWO-NOT: dmb
; THUMBTWO: strb
-; THUMBONE: test3
+; THUMBTWO-NOT: dmb
+; THUMBTWO: bx lr
+
+; THUMBONE-LABEL: test3
+; THUMBONE-NOT: dmb
; THUMBONE: ldrb
+; THUMBONE-NOT: dmb
; THUMBONE: strb
+; THUMBONE-NOT: dmb
%val = load atomic i8* %ptr1 unordered, align 1
store atomic i8 %val, i8* %ptr2 unordered, align 1
ret void
}
define void @test4(i8* %ptr1, i8* %ptr2) {
-; THUMBONE: test4
+; THUMBONE-LABEL: test4
; THUMBONE: ___sync_val_compare_and_swap_1
; THUMBONE: ___sync_lock_test_and_set_1
%val = load atomic i8* %ptr1 seq_cst, align 1
@@ -57,14 +70,14 @@ define void @test4(i8* %ptr1, i8* %ptr2) {
}
define i64 @test_old_load_64bit(i64* %p) {
-; ARMV4: test_old_load_64bit
+; ARMV4-LABEL: test_old_load_64bit
; ARMV4: ___sync_val_compare_and_swap_8
%1 = load atomic i64* %p seq_cst, align 8
ret i64 %1
}
define void @test_old_store_64bit(i64* %p, i64 %v) {
-; ARMV4: test_old_store_64bit
+; ARMV4-LABEL: test_old_store_64bit
; ARMV4: ___sync_lock_test_and_set_8
store atomic i64 %v, i64* %p seq_cst, align 8
ret void
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index ac8e949..b988242 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -198,7 +198,8 @@ entry:
define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) {
; CHECK-LABEL: test_cmpxchg_fail_order:
- %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ %oldval = extractvalue { i32, i1 } %pair, 0
; CHECK: dmb ish
; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]:
; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
@@ -216,7 +217,8 @@ define i32 @test_cmpxchg_fail_order(i32 *%addr, i32 %desired, i32 %new) {
define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) {
; CHECK-LABEL: test_cmpxchg_fail_order1:
- %oldval = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire
+ %pair = cmpxchg i32* %addr, i32 %desired, i32 %new acquire acquire
+ %oldval = extractvalue { i32, i1 } %pair, 0
; CHECK-NOT: dmb ish
; CHECK: [[LOOP_BB:\.?LBB[0-9]+_1]]:
; CHECK: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index a39565e..7072aaa 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1051,7 +1051,8 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind {
define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i8:
- %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %old = extractvalue { i8, i1 } %pair, 0
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
@@ -1077,7 +1078,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind
define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i16:
- %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
+ %old = extractvalue { i16, i1 } %pair, 0
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
@@ -1103,7 +1105,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounw
define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i32:
- %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
+ %old = extractvalue { i32, i1 } %pair, 0
store i32 %old, i32* @var32
; CHECK-NOT: dmb
; CHECK-NOT: mcr
@@ -1130,7 +1133,8 @@ define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
- %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %pair = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
+ %old = extractvalue { i64, i1 } %pair, 0
; CHECK-NOT: dmb
; CHECK-NOT: mcr
; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
diff --git a/test/CodeGen/ARM/big-endian-neon-extend.ll b/test/CodeGen/ARM/big-endian-neon-extend.ll
new file mode 100644
index 0000000..931c6c3
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-neon-extend.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple armeb-eabi -mattr v7,neon -o - | FileCheck %s
+
+define void @vector_ext_2i8_to_2i64( <2 x i8>* %loadaddr, <2 x i64>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_2i8_to_2i64:
+; CHECK: vld1.16 {[[REG:d[0-9]+]]
+; CHECK: vmov.i64 {{q[0-9]+}}, #0xff
+; CHECK: vrev16.8 [[REG]], [[REG]]
+; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]]
+ %1 = load <2 x i8>* %loadaddr
+ %2 = zext <2 x i8> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %storeaddr
+ ret void
+}
+
+define void @vector_ext_2i16_to_2i64( <2 x i16>* %loadaddr, <2 x i64>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_2i16_to_2i64:
+; CHECK: vld1.32 {[[REG:d[0-9]+]]
+; CHECK: vmov.i64 {{q[0-9]+}}, #0xffff
+; CHECK: vrev32.16 [[REG]], [[REG]]
+; CHECK: vmovl.u16 {{q[0-9]+}}, [[REG]]
+ %1 = load <2 x i16>* %loadaddr
+ %2 = zext <2 x i16> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %storeaddr
+ ret void
+}
+
+
+define void @vector_ext_2i8_to_2i32( <2 x i8>* %loadaddr, <2 x i32>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_2i8_to_2i32:
+; CHECK: vld1.16 {[[REG:d[0-9]+]]
+; CHECK: vrev16.8 [[REG]], [[REG]]
+ %1 = load <2 x i8>* %loadaddr
+ %2 = zext <2 x i8> %1 to <2 x i32>
+ store <2 x i32> %2, <2 x i32>* %storeaddr
+ ret void
+}
+
+define void @vector_ext_2i16_to_2i32( <2 x i16>* %loadaddr, <2 x i32>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_2i16_to_2i32:
+; CHECK: vld1.32 {[[REG:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG]]
+; CHECK: vmovl.u16 {{q[0-9]+}}, [[REG]]
+ %1 = load <2 x i16>* %loadaddr
+ %2 = zext <2 x i16> %1 to <2 x i32>
+ store <2 x i32> %2, <2 x i32>* %storeaddr
+ ret void
+}
+
+define void @vector_ext_2i8_to_2i16( <2 x i8>* %loadaddr, <2 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_2i8_to_2i16:
+; CHECK: vld1.16 {[[REG:d[0-9]+]]
+; CHECK: vrev16.8 [[REG]], [[REG]]
+; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]]
+ %1 = load <2 x i8>* %loadaddr
+ %2 = zext <2 x i8> %1 to <2 x i16>
+ store <2 x i16> %2, <2 x i16>* %storeaddr
+ ret void
+}
+
+define void @vector_ext_4i8_to_4i32( <4 x i8>* %loadaddr, <4 x i32>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_4i8_to_4i32:
+; CHECK: vld1.32 {[[REG:d[0-9]+]]
+; CHECK: vrev32.8 [[REG]], [[REG]]
+; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]]
+ %1 = load <4 x i8>* %loadaddr
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ store <4 x i32> %2, <4 x i32>* %storeaddr
+ ret void
+}
+
+define void @vector_ext_4i8_to_4i16( <4 x i8>* %loadaddr, <4 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_ext_4i8_to_4i16:
+; CHECK: vld1.32 {[[REG:d[0-9]+]]
+; CHECK: vrev32.8 [[REG]], [[REG]]
+; CHECK: vmovl.u8 {{q[0-9]+}}, [[REG]]
+ %1 = load <4 x i8>* %loadaddr
+ %2 = zext <4 x i8> %1 to <4 x i16>
+ store <4 x i16> %2, <4 x i16>* %storeaddr
+ ret void
+}
+
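+; In all cases above, big-endian lane numbering means the vld1 result must be
+; byte- or halfword-reversed (vrev16/vrev32) into the in-register order the
+; widening vmovl expects, which is what each CHECK sequence verifies.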
diff --git a/test/CodeGen/ARM/big-endian-neon-trunc-store.ll b/test/CodeGen/ARM/big-endian-neon-trunc-store.ll
new file mode 100644
index 0000000..65147ad
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-neon-trunc-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple armeb-eabi -mattr v7,neon -o - | FileCheck %s
+
+define void @vector_trunc_store_2i64_to_2i16( <2 x i64>* %loadaddr, <2 x i16>* %storeaddr ) {
+; CHECK-LABEL: vector_trunc_store_2i64_to_2i16:
+; CHECK: vmovn.i64 [[REG:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG]]
+; CHECK: vuzp.16 [[REG]], [[REG2:d[0-9]+]]
+; CHECK: vrev32.16 [[REG]], [[REG2]]
+ %1 = load <2 x i64>* %loadaddr
+ %2 = trunc <2 x i64> %1 to <2 x i16>
+ store <2 x i16> %2, <2 x i16>* %storeaddr
+ ret void
+}
+
+define void @vector_trunc_store_4i32_to_4i8( <4 x i32>* %loadaddr, <4 x i8>* %storeaddr ) {
+; CHECK-LABEL: vector_trunc_store_4i32_to_4i8:
+; CHECK: vmovn.i32 [[REG:d[0-9]+]]
+; CHECK: vrev16.8 [[REG]], [[REG]]
+; CHECK: vuzp.8 [[REG]], [[REG2:d[0-9]+]]
+; CHECK: vrev32.8 [[REG]], [[REG2]]
+ %1 = load <4 x i32>* %loadaddr
+ %2 = trunc <4 x i32> %1 to <4 x i8>
+ store <4 x i8> %2, <4 x i8>* %storeaddr
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/big-endian-ret-f64.ll b/test/CodeGen/ARM/big-endian-ret-f64.ll
new file mode 100644
index 0000000..614bfc0
--- /dev/null
+++ b/test/CodeGen/ARM/big-endian-ret-f64.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=armebv7a-eabi %s -O0 -o - | FileCheck %s
+; RUN: llc -mtriple=armebv8a-eabi %s -O0 -o - | FileCheck %s
+
+define double @fn() {
+; CHECK-LABEL: fn
+; CHECK: ldr r0, [sp]
+; CHECK: ldr r1, [sp, #4]
+ %r = alloca double, align 8
+ %1 = load double* %r, align 8
+ ret double %1
+}
+
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index 40694bf..a35fd74 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 | FileCheck %s -check-prefix=CHECKV6
-; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 | FileCheck %s -check-prefix=CHECKT2D
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 \
+; RUN: llc < %s -mtriple=armv6-apple-ios5.0 -mattr=+vfp2 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKV6
+; RUN: llc < %s -mtriple=thumbv7-apple-ios5.0 -arm-atomic-cfg-tidy=0 | FileCheck %s -check-prefix=CHECKT2D
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-atomic-cfg-tidy=0 \
; RUN: | FileCheck %s -check-prefix=CHECKELF
; Enable tailcall optimization for iOS 5.0
diff --git a/test/CodeGen/ARM/cmpxchg-idioms.ll b/test/CodeGen/ARM/cmpxchg-idioms.ll
new file mode 100644
index 0000000..fb88575
--- /dev/null
+++ b/test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -o - %s | FileCheck %s
+
+define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_return:
+
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
+; CHECK: cmp [[LOADED]], r1
+; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: strex [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
+; CHECK: cmp [[STATUS]], #0
+; CHECK: bne [[LOOP]]
+
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: movs r0, #1
+; CHECK: dmb ish
+; CHECK: bx lr
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: movs r0, #0
+; CHECK: dmb ish
+; CHECK: bx lr
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ %conv = zext i1 %success to i32
+ ret i32 %conv
+}
+
+define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
+; CHECK-LABEL: test_return_bool:
+
+; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
+; CHECK: cmp [[LOADED]], [[OLDBYTE]]
+; CHECK: bne [[FAIL:LBB[0-9]+_[0-9]+]]
+
+; CHECK: strexb [[STATUS:r[0-9]+]], {{r[0-9]+}}, [r0]
+; CHECK: cmp [[STATUS]], #0
+; CHECK: bne [[LOOP]]
+
+ ; FIXME: this eor is redundant. Need to teach DAG combine that.
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: movs [[TMP:r[0-9]+]], #1
+; CHECK: eor r0, [[TMP]], #1
+; CHECK: bx lr
+
+; CHECK: [[FAIL]]:
+; CHECK: movs [[TMP:r[0-9]+]], #0
+; CHECK: eor r0, [[TMP]], #1
+; CHECK: bx lr
+
+
+ %pair = cmpxchg i8* %value, i8 %oldValue, i8 %newValue acq_rel monotonic
+ %success = extractvalue { i8, i1 } %pair, 1
+ %failure = xor i1 %success, 1
+ ret i1 %failure
+}
+
+define void @test_conditional(i32* %p, i32 %oldval, i32 %newval) {
+; CHECK-LABEL: test_conditional:
+
+; CHECK: dmb ishst
+
+; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
+; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
+; CHECK: cmp [[LOADED]], r1
+; CHECK: bne [[FAILED:LBB[0-9]+_[0-9]+]]
+
+; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
+; CHECK: cmp [[STATUS]], #0
+; CHECK: bne [[LOOP]]
+
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: dmb ish
+; CHECK: b.w _bar
+
+; CHECK: [[FAILED]]:
+; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
+; CHECK: dmb ish
+; CHECK: b.w _baz
+
+ %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ br i1 %success, label %true, label %false
+
+true:
+ tail call void @bar() #2
+ br label %end
+
+false:
+ tail call void @baz() #2
+ br label %end
+
+end:
+ ret void
+}
+
+declare void @bar()
+declare void @baz()
diff --git a/test/CodeGen/ARM/cmpxchg-weak.ll b/test/CodeGen/ARM/cmpxchg-weak.ll
new file mode 100644
index 0000000..126e330
--- /dev/null
+++ b/test/CodeGen/ARM/cmpxchg-weak.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
+
+define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_cmpxchg_weak:
+
+ %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ %oldval = extractvalue { i32, i1 } %pair, 0
+; CHECK: dmb ish
+; CHECK: ldrex [[LOADED:r[0-9]+]], [r0]
+; CHECK: cmp [[LOADED]], r1
+; CHECK: strexeq [[SUCCESS:r[0-9]+]], r2, [r0]
+; CHECK: cmpeq [[SUCCESS]], #0
+; CHECK: bne [[DONE:LBB[0-9]+_[0-9]+]]
+; CHECK: dmb ish
+; CHECK: [[DONE]]:
+; CHECK: str r3, [r0]
+; CHECK: bx lr
+
+ store i32 %oldval, i32* %addr
+ ret void
+}
+
+
+define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_cmpxchg_weak_to_bool:
+
+ %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ %success = extractvalue { i32, i1 } %pair, 1
+
+; CHECK: dmb ish
+; CHECK: mov r0, #0
+; CHECK: ldrex [[LOADED:r[0-9]+]], [r1]
+; CHECK: cmp [[LOADED]], r2
+; CHECK: strexeq [[STATUS:r[0-9]+]], r3, [r1]
+; CHECK: cmpeq [[STATUS]], #0
+; CHECK: bne [[DONE:LBB[0-9]+_[0-9]+]]
+; CHECK: dmb ish
+; CHECK: mov r0, #1
+; CHECK: [[DONE]]:
+; CHECK: bx lr
+
+ ret i1 %success
+}
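+; A weak cmpxchg is allowed to fail spuriously, so outside of tests it is
+; normally used inside a retry loop. A minimal sketch (hypothetical function,
+; not exercised by the CHECK lines above):
+;
+; define i32 @test_cmpxchg_weak_loop(i32* %addr, i32 %desired, i32 %new) {
+; entry:
+;   br label %retry
+; retry:
+;   %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+;   %success = extractvalue { i32, i1 } %pair, 1
+;   br i1 %success, label %done, label %retry
+; done:
+;   %oldval = extractvalue { i32, i1 } %pair, 0
+;   ret i32 %oldval
+; }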
diff --git a/test/CodeGen/ARM/data-in-code-annotations.ll b/test/CodeGen/ARM/data-in-code-annotations.ll
index da70178..5eb81b2 100644
--- a/test/CodeGen/ARM/data-in-code-annotations.ll
+++ b/test/CodeGen/ARM/data-in-code-annotations.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-darwin -arm-atomic-cfg-tidy=0 | FileCheck %s
define double @f1() nounwind {
; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index e8bf3ba..31d0324 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -59,7 +59,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!24 = metadata !{i32 11, i32 81, metadata !1, null}
!25 = metadata !{i32 11, i32 101, metadata !1, null}
!26 = metadata !{i32 12, i32 3, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !1, i32 11, i32 107, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{i32 786443, metadata !2, metadata !1, i32 11, i32 107, i32 0} ; [ DW_TAG_lexical_block ]
!28 = metadata !{i32 13, i32 5, metadata !27, null}
!29 = metadata !{i32 14, i32 1, metadata !27, null}
!30 = metadata !{metadata !1}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 6cbe4b4..5ad5e59 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -231,10 +231,10 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!133 = metadata !{i32 609, i32 175, metadata !23, null}
!134 = metadata !{i32 786689, metadata !23, metadata !"data", metadata !24, i32 67109473, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ]
!135 = metadata !{i32 609, i32 190, metadata !23, null}
-!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24} ; [ DW_TAG_auto_variable ]
+!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, metadata !163} ; [ DW_TAG_auto_variable ]
!137 = metadata !{i32 604, i32 49, metadata !23, null}
-!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, i64 1, i64 24} ; [ DW_TAG_auto_variable ]
-!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, i64 1, i64 28} ; [ DW_TAG_auto_variable ]
+!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, metadata !164} ; [ DW_TAG_auto_variable ]
+!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, metadata !165} ; [ DW_TAG_auto_variable ]
!140 = metadata !{i32 607, i32 30, metadata !23, null}
!141 = metadata !{i32 610, i32 17, metadata !142, null}
!142 = metadata !{i32 786443, metadata !152, metadata !23, i32 609, i32 200, i32 94} ; [ DW_TAG_lexical_block ]
@@ -258,3 +258,6 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
!161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!163 = metadata !{i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24}
+!164 = metadata !{i64 1, i64 24}
+!165 = metadata !{i64 1, i64 28}
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index 695a20b..eb0120f 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -12,11 +12,11 @@ declare void @bar(i8*)
define void @check_simple() minsize {
; CHECK-LABEL: check_simple:
-; CHECK: push {r3, r4, r5, r6, r7, lr}
+; CHECK: push.w {r7, r8, r9, r10, r11, lr}
; CHECK-NOT: sub sp, sp,
; ...
; CHECK-NOT: add sp, sp,
-; CHECK: pop {r0, r1, r2, r3, r7, pc}
+; CHECK: pop.w {r0, r1, r2, r3, r11, pc}
; CHECK-T1-LABEL: check_simple:
; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
@@ -44,11 +44,11 @@ define void @check_simple() minsize {
define void @check_simple_too_big() minsize {
; CHECK-LABEL: check_simple_too_big:
-; CHECK: push {r7, lr}
+; CHECK: push.w {r11, lr}
; CHECK: sub sp,
; ...
; CHECK: add sp,
-; CHECK: pop {r7, pc}
+; CHECK: pop.w {r11, pc}
%var = alloca i8, i32 64
call void @bar(i8* %var)
ret void
@@ -93,11 +93,11 @@ define void @check_vfp_fold() minsize {
; folded in except that doing so would clobber the value being returned.
define i64 @check_no_return_clobber() minsize {
; CHECK-LABEL: check_no_return_clobber:
-; CHECK: push {r1, r2, r3, r4, r5, r6, r7, lr}
+; CHECK: push.w {r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NOT: sub sp,
; ...
; CHECK: add sp, #24
-; CHECK: pop {r7, pc}
+; CHECK: pop.w {r11, pc}
; Just to keep iOS FileCheck within previous function:
; CHECK-IOS-LABEL: check_no_return_clobber:
@@ -176,9 +176,9 @@ define void @test_varsize(...) minsize {
; CHECK-LABEL: test_varsize:
; CHECK: sub sp, #16
-; CHECK: push {r5, r6, r7, lr}
+; CHECK: push.w {r9, r10, r11, lr}
; ...
-; CHECK: pop.w {r2, r3, r7, lr}
+; CHECK: pop.w {r2, r3, r11, lr}
; CHECK: add sp, #16
; CHECK: bx lr
diff --git a/test/CodeGen/ARM/fptoint.ll b/test/CodeGen/ARM/fptoint.ll
index c721756..f50d0b9 100644
--- a/test/CodeGen/ARM/fptoint.ll
+++ b/test/CodeGen/ARM/fptoint.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 -mattr=+v6,+vfp2 %s -o - | FileCheck %s
@i = weak global i32 0 ; <i32*> [#uses=2]
@u = weak global i32 0 ; <i32*> [#uses=2]
diff --git a/test/Transforms/GlobalMerge/ARM/arm.ll b/test/CodeGen/ARM/global-merge-1.ll
index 8c77de6..341597e 100644
--- a/test/Transforms/GlobalMerge/ARM/arm.ll
+++ b/test/CodeGen/ARM/global-merge-1.ll
@@ -1,9 +1,9 @@
; RUN: llc %s -O0 -o - | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O0 -o - -global-merge=true | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O0 -o - -enable-global-merge=true | FileCheck -check-prefix=NO-MERGE %s
; RUN: llc %s -O1 -o - | FileCheck -check-prefix=MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=false | FileCheck -check-prefix=NO-MERGE %s
-; RUN: llc %s -O1 -o - -global-merge=true | FileCheck -check-prefix=MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=false | FileCheck -check-prefix=NO-MERGE %s
+; RUN: llc %s -O1 -o - -enable-global-merge=true | FileCheck -check-prefix=MERGE %s
; MERGE-NOT: .zerofill __DATA,__bss,_bar,20,2
; MERGE-NOT: .zerofill __DATA,__bss,_baz,20,2
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index cd8a561..a994d3d 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -print-machineinstrs=if-converter -arm-atomic-cfg-tidy=0 -o /dev/null 2>&1 | FileCheck %s
%struct.S = type { i8* (i8*)*, [1 x i8] }
define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly {
diff --git a/test/CodeGen/ARM/ifcvt10.ll b/test/CodeGen/ARM/ifcvt10.ll
index 26c7272..509c182 100644
--- a/test/CodeGen/ARM/ifcvt10.ll
+++ b/test/CodeGen/ARM/ifcvt10.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -arm-atomic-cfg-tidy=0 -mcpu=cortex-a9 | FileCheck %s
; rdar://8402126
; Make sure if-converter is not predicating vldmia and ldmia. These are
; micro-coded and would have long issue latency even if predicated on
diff --git a/test/CodeGen/ARM/indirectbr-3.ll b/test/CodeGen/ARM/indirectbr-3.ll
index 5a9c459..291fedb 100644
--- a/test/CodeGen/ARM/indirectbr-3.ll
+++ b/test/CodeGen/ARM/indirectbr-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
; If ARMBaseInstrInfo::AnalyzeBlocks returns the wrong value, which was possible
; for blocks with indirect branches, the IfConverter could end up deleting
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index c5be667..cb67dd9 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -35,15 +35,15 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
; appropriate sentinel so no special return needed).
; CHECK-M-LABEL: irq_fn:
-; CHECK-M: push {r4, r6, r7, lr}
-; CHECK-M: add r7, sp, #8
+; CHECK-M: push.w {r4, r10, r11, lr}
+; CHECK-M: add.w r11, sp, #8
; CHECK-M: mov r4, sp
; CHECK-M: bic r4, r4, #7
; CHECK-M: mov sp, r4
; CHECK-M: blx _bar
-; CHECK-M: sub.w r4, r7, #8
+; CHECK-M: sub.w r4, r11, #8
; CHECK-M: mov sp, r4
-; CHECK-M: pop {r4, r6, r7, pc}
+; CHECK-M: pop.w {r4, r10, r11, pc}
call arm_aapcscc void @bar()
ret void
diff --git a/test/CodeGen/ARM/intrinsics-memory-barrier.ll b/test/CodeGen/ARM/intrinsics-memory-barrier.ll
new file mode 100644
index 0000000..5ee0b3e
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-memory-barrier.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mtriple=armv7 -mattr=+db | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -mattr=+db | FileCheck %s
+
+; CHECK-LABEL: test
+define void @test() {
+ call void @llvm.arm.dmb(i32 3) ; CHECK: dmb osh
+ call void @llvm.arm.dsb(i32 7) ; CHECK: dsb nsh
+ call void @llvm.arm.isb(i32 15) ; CHECK: isb sy
+ ret void
+}
+
+; The important point is that the compiler should not reorder memory access
+; instructions around the DMB.
+; If it did, the two STRs would collapse into one STRD.
+; CHECK-LABEL: test_dmb_reordering
+define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}]
+
+ call void @llvm.arm.dmb(i32 15) ; CHECK: dmb sy
+
+ %d1 = getelementptr i32* %d, i32 1
+ store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4]
+
+ ret void
+}
+
+; Similarly for DSB.
+; CHECK-LABEL: test_dsb_reordering
+define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}]
+
+ call void @llvm.arm.dsb(i32 15) ; CHECK: dsb sy
+
+ %d1 = getelementptr i32* %d, i32 1
+ store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4]
+
+ ret void
+}
+
+; And ISB.
+; CHECK-LABEL: test_isb_reordering
+define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) {
+ store i32 %a, i32* %d ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}]
+
+ call void @llvm.arm.isb(i32 15) ; CHECK: isb sy
+
+ %d1 = getelementptr i32* %d, i32 1
+ store i32 %b, i32* %d1 ; CHECK: str {{r[0-9]+}}, [{{r[0-9]+}}, #4]
+
+ ret void
+}
+
+declare void @llvm.arm.dmb(i32)
+declare void @llvm.arm.dsb(i32)
+declare void @llvm.arm.isb(i32)
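+
+; The i32 operand selects the barrier domain: the CHECK lines above confirm
+; 15 -> sy, 7 -> nsh, and 3 -> osh. A minimal sketch of an inner-shareable
+; barrier, assuming the standard ish encoding of 11 (hypothetical function,
+; not exercised above):
+;
+; define void @test_dmb_ish() {
+;   call void @llvm.arm.dmb(i32 11) ; dmb ish
+;   ret void
+; }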
diff --git a/test/CodeGen/ARM/jump_tables.ll b/test/CodeGen/ARM/jump_tables.ll
new file mode 100644
index 0000000..907a86c
--- /dev/null
+++ b/test/CodeGen/ARM/jump_tables.ll
@@ -0,0 +1,32 @@
+; RUN: llc <%s -mtriple=arm-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=ARM %s
+; RUN: llc <%s -mtriple=thumb-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=THUMB %s
+
+define void @indirect_fun() unnamed_addr jumptable {
+ ret void
+}
+define void ()* @get_fun() {
+ ret void ()* @indirect_fun
+
+; ARM: ldr r0, [[LABEL:.*]]
+; ARM: mov pc, lr
+; ARM: [[LABEL]]:
+; ARM: .long __llvm_jump_instr_table_0_1
+
+; THUMB: ldr r0, [[LABEL:.*]]
+; THUMB: bx lr
+; THUMB: [[LABEL]]:
+; THUMB: .long __llvm_jump_instr_table_0_1
+}
+
+; ARM: .globl __llvm_jump_instr_table_0_1
+; ARM: .align 3
+; ARM: .type __llvm_jump_instr_table_0_1,%function
+; ARM: __llvm_jump_instr_table_0_1:
+; ARM: b indirect_fun(PLT)
+
+; THUMB: .globl __llvm_jump_instr_table_0_1
+; THUMB: .align 3
+; THUMB: .thumb_func
+; THUMB: .type __llvm_jump_instr_table_0_1,%function
+; THUMB: __llvm_jump_instr_table_0_1:
+; THUMB: b indirect_fun(PLT)
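+
+; With -jump-table-type=single, every address-taken jumptable function is
+; routed through a single table of branch instructions, so taking the address
+; of @indirect_fun yields the table slot __llvm_jump_instr_table_0_1 rather
+; than the function itself, as the CHECK lines above verify.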
diff --git a/test/CodeGen/ARM/ldstrex-m.ll b/test/CodeGen/ARM/ldstrex-m.ll
new file mode 100644
index 0000000..b50699f
--- /dev/null
+++ b/test/CodeGen/ARM/ldstrex-m.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 | FileCheck %s
+
+; CHECK-LABEL: f0:
+; CHECK-NOT: ldrexd
+define i64 @f0(i64* %p) nounwind readonly {
+entry:
+ %0 = load atomic i64* %p seq_cst, align 8
+ ret i64 %0
+}
+
+; CHECK-LABEL: f1:
+; CHECK-NOT: strexd
+define void @f1(i64* %p) nounwind readonly {
+entry:
+ store atomic i64 0, i64* %p seq_cst, align 8
+ ret void
+}
+
+; CHECK-LABEL: f2:
+; CHECK-NOT: ldrexd
+; CHECK-NOT: strexd
+define i64 @f2(i64* %p) nounwind readonly {
+entry:
+ %0 = atomicrmw add i64* %p, i64 1 seq_cst
+ ret i64 %0
+}
+
+; CHECK-LABEL: f3:
+; CHECK: ldr
+define i32 @f3(i32* %p) nounwind readonly {
+entry:
+ %0 = load atomic i32* %p seq_cst, align 4
+ ret i32 %0
+}
+
+; CHECK-LABEL: f4:
+; CHECK: ldrb
+define i8 @f4(i8* %p) nounwind readonly {
+entry:
+ %0 = load atomic i8* %p seq_cst, align 4
+ ret i8 %0
+}
+
+; CHECK-LABEL: f5:
+; CHECK: str
+define void @f5(i32* %p) nounwind readonly {
+entry:
+ store atomic i32 0, i32* %p seq_cst, align 4
+ ret void
+}
+
+; CHECK-LABEL: f6:
+; CHECK: ldrex
+; CHECK: strex
+define i32 @f6(i32* %p) nounwind readonly {
+entry:
+ %0 = atomicrmw add i32* %p, i32 1 seq_cst
+ ret i32 %0
+}
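+
+; Summary of the expectations above: v7-M has no ldrexd/strexd, so the i64
+; atomics (f0-f2) must not use them and are presumably expanded to library
+; calls, while the i32/i8 atomic loads and stores (f3-f5) lower to plain
+; ldr/ldrb/str accesses and only the i32 atomicrmw (f6) needs an
+; ldrex/strex loop.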
diff --git a/test/CodeGen/ARM/lit.local.cfg b/test/CodeGen/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/CodeGen/ARM/lit.local.cfg
+++ b/test/CodeGen/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
index 1dafa00..3ad60d4 100644
--- a/test/CodeGen/ARM/lsr-unfolded-offset.ll
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=greedy < %s | FileCheck %s
+; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
; LSR shouldn't introduce more induction variables than needed, increasing
; register pressure and therefore spilling. There is more room for improvement
diff --git a/test/CodeGen/ARM/metadata-default.ll b/test/CodeGen/ARM/metadata-default.ll
new file mode 100644
index 0000000..f6a3fe2
--- /dev/null
+++ b/test/CodeGen/ARM/metadata-default.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7--none-eabi"
+
+define i32 @f(i64 %z) {
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = metadata !{i32 1, metadata !"wchar_size", i32 4}
+!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+
+; CHECK: .eabi_attribute 18, 4 @ Tag_ABI_PCS_wchar_t
+; CHECK: .eabi_attribute 26, 2 @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/metadata-short-enums.ll b/test/CodeGen/ARM/metadata-short-enums.ll
new file mode 100644
index 0000000..bccd332
--- /dev/null
+++ b/test/CodeGen/ARM/metadata-short-enums.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7--none-eabi"
+
+define i32 @f(i64 %z) {
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = metadata !{i32 1, metadata !"wchar_size", i32 4}
+!1 = metadata !{i32 1, metadata !"min_enum_size", i32 1}
+
+; CHECK: .eabi_attribute 18, 4 @ Tag_ABI_PCS_wchar_t
+; CHECK: .eabi_attribute 26, 1 @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/metadata-short-wchar.ll b/test/CodeGen/ARM/metadata-short-wchar.ll
new file mode 100644
index 0000000..6de9bf1
--- /dev/null
+++ b/test/CodeGen/ARM/metadata-short-wchar.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7--none-eabi"
+
+define i32 @f(i64 %z) {
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = metadata !{i32 1, metadata !"wchar_size", i32 2}
+!1 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+
+; CHECK: .eabi_attribute 18, 2 @ Tag_ABI_PCS_wchar_t
+; CHECK: .eabi_attribute 26, 2 @ Tag_ABI_enum_size
diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll
index 26adf0c..bb2d42c 100644
--- a/test/CodeGen/ARM/misched-copy-arm.ll
+++ b/test/CodeGen/ARM/misched-copy-arm.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched %s -o - 2>&1 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s
;
; Loop counter copies should be eliminated.
; There is also a MUL here, but we don't care where it is scheduled.
diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll
index 2795b8c..60c2171 100644
--- a/test/CodeGen/ARM/none-macho.ll
+++ b/test/CodeGen/ARM/none-macho.ll
@@ -48,8 +48,8 @@ define i32 @test_frame_ptr() {
; CHECK-LABEL: test_frame_ptr:
call void @test_trap()
- ; Frame pointer is r7 as for Darwin
-; CHECK: mov r7, sp
+ ; Frame pointer is r11.
+; CHECK: mov r11, sp
ret i32 42
}
@@ -63,11 +63,9 @@ define void @test_two_areas(%big_arr* %addr) {
; This goes with the choice of r7 as FP (largely). FP and LR have to be stored
; consecutively on the stack for the frame record to be valid, which means we
; need the 2 register-save areas employed by iOS.
-; CHECK-NON-FAST: push {r4, r5, r6, r7, lr}
-; CHECK-NON-FAST: push.w {r8, r9, r10, r11}
+; CHECK-NON-FAST: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; ...
-; CHECK-NON-FAST: pop.w {r8, r9, r10, r11}
-; CHECK-NON-FAST: pop {r4, r5, r6, r7, pc}
+; CHECK-NON-FAST: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
ret void
}
diff --git a/test/CodeGen/ARM/null-streamer.ll b/test/CodeGen/ARM/null-streamer.ll
new file mode 100644
index 0000000..350c45e
--- /dev/null
+++ b/test/CodeGen/ARM/null-streamer.ll
@@ -0,0 +1,7 @@
+; Test the null streamer with a target streamer.
+; RUN: llc -O0 -filetype=null -mtriple=arm-linux < %s
+
+define i32 @main() {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index b245674..feed5ad 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -regalloc=basic | FileCheck %s
; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
%struct.int16x8_t = type { <8 x i16> }
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index b924663..4fa97ea 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4789
%bar = type { float, float, float }
diff --git a/test/CodeGen/ARM/struct-byval-frame-index.ll b/test/CodeGen/ARM/struct-byval-frame-index.ll
index 465ee12..0fd55ec 100644
--- a/test/CodeGen/ARM/struct-byval-frame-index.ll
+++ b/test/CodeGen/ARM/struct-byval-frame-index.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a15 -verify-machineinstrs -arm-atomic-cfg-tidy=0 | FileCheck %s
; Check that a spill right after a function call with a large byval struct is
; correctly generated.
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 8da875f..01df3b4 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -1,5 +1,5 @@
; Tests for the two-address instruction pass.
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -arm-atomic-cfg-tidy=0 %s -o - | FileCheck %s
define void @PR13378() nounwind {
; This was originally a crasher trying to schedule the instructions.
diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll
index f18b498..d901a74 100644
--- a/test/CodeGen/ARM/va_arg.ll
+++ b/test/CodeGen/ARM/va_arg.ll
@@ -24,13 +24,13 @@ entry:
; CHECK-NOT: bfc
; CHECK: bx lr
-define double @test2(i32 %a, i32 %b, ...) nounwind optsize {
+define double @test2(i32 %a, i32* %b, ...) nounwind optsize {
entry:
%ap = alloca i8*, align 4 ; <i8**> [#uses=3]
%ap1 = bitcast i8** %ap to i8* ; <i8*> [#uses=2]
call void @llvm.va_start(i8* %ap1)
%0 = va_arg i8** %ap, i32 ; <i32> [#uses=0]
- store i32 %0, i32* undef
+ store i32 %0, i32* %b
%1 = va_arg i8** %ap, double ; <double> [#uses=1]
call void @llvm.va_end(i8* %ap1)
ret double %1
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
index d0a9ac6..f2e5eb9 100644
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
diff --git a/test/CodeGen/ARM/widen-vmovs.ll b/test/CodeGen/ARM/widen-vmovs.ll
index 1efbc73..316cfab 100644
--- a/test/CodeGen/ARM/widen-vmovs.ll
+++ b/test/CodeGen/ARM/widen-vmovs.ll
@@ -17,7 +17,7 @@ target triple = "thumbv7-apple-ios"
; - Register liveness is verified.
; - The execution domain switch to vorr works across basic blocks.
-define void @Mm() nounwind {
+define void @Mm(i32 %in, float* %addr) nounwind {
entry:
br label %for.body4
@@ -27,10 +27,10 @@ for.body4:
for.body.i:
%tmp3.i = phi float [ 1.000000e+10, %for.body4 ], [ %add.i, %for.body.i ]
%add.i = fadd float %tmp3.i, 1.000000e+10
- %exitcond.i = icmp eq i32 undef, 41
+ %exitcond.i = icmp eq i32 %in, 41
br i1 %exitcond.i, label %rInnerproduct.exit, label %for.body.i
rInnerproduct.exit:
- store float %add.i, float* undef, align 4
+ store float %add.i, float* %addr, align 4
br label %for.body4
}
diff --git a/test/CodeGen/CPP/atomic.ll b/test/CodeGen/CPP/atomic.ll
new file mode 100644
index 0000000..e79c45d
--- /dev/null
+++ b/test/CodeGen/CPP/atomic.ll
@@ -0,0 +1,89 @@
+; RUN: llc -march=cpp -o - %s | FileCheck %s
+
+define void @test_atomicrmw(i32* %addr, i32 %inc) {
+ %inst0 = atomicrmw xchg i32* %addr, i32 %inc seq_cst
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Xchg, {{.*}}, SequentiallyConsistent, CrossThread
+ ; CHECK: [[INST]]->setName("inst0");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst1 = atomicrmw add i32* %addr, i32 %inc seq_cst
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Add, {{.*}}, SequentiallyConsistent, CrossThread
+ ; CHECK: [[INST]]->setName("inst1");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst2 = atomicrmw volatile sub i32* %addr, i32 %inc singlethread monotonic
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Sub, {{.*}}, Monotonic, SingleThread
+ ; CHECK: [[INST]]->setName("inst2");
+ ; CHECK: [[INST]]->setVolatile(true);
+
+ %inst3 = atomicrmw and i32* %addr, i32 %inc acq_rel
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::And, {{.*}}, AcquireRelease, CrossThread
+ ; CHECK: [[INST]]->setName("inst3");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst4 = atomicrmw nand i32* %addr, i32 %inc release
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Nand, {{.*}}, Release, CrossThread
+ ; CHECK: [[INST]]->setName("inst4");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst5 = atomicrmw volatile or i32* %addr, i32 %inc singlethread seq_cst
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Or, {{.*}}, SequentiallyConsistent, SingleThread
+ ; CHECK: [[INST]]->setName("inst5");
+ ; CHECK: [[INST]]->setVolatile(true);
+
+ %inst6 = atomicrmw xor i32* %addr, i32 %inc release
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Xor, {{.*}}, Release, CrossThread
+ ; CHECK: [[INST]]->setName("inst6");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst7 = atomicrmw volatile max i32* %addr, i32 %inc singlethread monotonic
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Max, {{.*}}, Monotonic, SingleThread
+ ; CHECK: [[INST]]->setName("inst7");
+ ; CHECK: [[INST]]->setVolatile(true);
+
+ %inst8 = atomicrmw min i32* %addr, i32 %inc acquire
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::Min, {{.*}}, Acquire, CrossThread
+ ; CHECK: [[INST]]->setName("inst8");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+ %inst9 = atomicrmw volatile umax i32* %addr, i32 %inc monotonic
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMax, {{.*}}, Monotonic, CrossThread
+ ; CHECK: [[INST]]->setName("inst9");
+ ; CHECK: [[INST]]->setVolatile(true);
+
+ %inst10 = atomicrmw umin i32* %addr, i32 %inc singlethread release
+ ; CHECK: AtomicRMWInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicRMWInst(AtomicRMWInst::UMin, {{.*}}, Release, SingleThread
+ ; CHECK: [[INST]]->setName("inst10");
+ ; CHECK: [[INST]]->setVolatile(false);
+
+
+ ret void
+}
+
+define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+ %inst0 = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread
+ ; CHECK: [[INST]]->setName("inst0");
+ ; CHECK: [[INST]]->setVolatile(false);
+ ; CHECK: [[INST]]->setWeak(false);
+
+ %inst1 = cmpxchg volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire
+ ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread
+ ; CHECK: [[INST]]->setName("inst1");
+ ; CHECK: [[INST]]->setVolatile(true);
+ ; CHECK: [[INST]]->setWeak(false);
+
+ %inst2 = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, SequentiallyConsistent, Monotonic, CrossThread
+ ; CHECK: [[INST]]->setName("inst2");
+ ; CHECK: [[INST]]->setVolatile(false);
+ ; CHECK: [[INST]]->setWeak(true);
+
+ %inst3 = cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread acq_rel acquire
+ ; CHECK: AtomicCmpXchgInst* [[INST:[a-zA-Z0-9_]+]] = new AtomicCmpXchgInst({{.*}}, AcquireRelease, Acquire, SingleThread
+ ; CHECK: [[INST]]->setName("inst3");
+ ; CHECK: [[INST]]->setVolatile(true);
+ ; CHECK: [[INST]]->setWeak(true);
+
+ ret void
+}
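+
+; The CHECK lines above rely on the fixed mapping from IR orderings to the
+; C++ API enumerators: monotonic -> Monotonic, acquire -> Acquire,
+; release -> Release, acq_rel -> AcquireRelease, seq_cst ->
+; SequentiallyConsistent, and singlethread -> SingleThread (otherwise
+; CrossThread).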
diff --git a/test/CodeGen/CPP/lit.local.cfg b/test/CodeGen/CPP/lit.local.cfg
index 4063dd1..3ff5c6b 100644
--- a/test/CodeGen/CPP/lit.local.cfg
+++ b/test/CodeGen/CPP/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'CppBackend' in targets:
+if not 'CppBackend' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 802ee2c..0e98280 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -1,5 +1,8 @@
; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+; ARM & AArch64 run an extra SimplifyCFG which disrupts this test.
+; XFAIL: arm,aarch64
+
; Make sure we have the correct weight attached to each successor.
define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
; CHECK: Machine code for function test2:
diff --git a/test/CodeGen/Generic/select.ll b/test/CodeGen/Generic/select.ll
index 77636eb..c4841b7 100644
--- a/test/CodeGen/Generic/select.ll
+++ b/test/CodeGen/Generic/select.ll
@@ -192,4 +192,3 @@ define <1 x i32> @checkScalariseVSELECT(<1 x i32> %a, <1 x i32> %b) {
%s = select <1 x i1> %cond, <1 x i32> %a, <1 x i32> %b
ret <1 x i32> %s
}
-
diff --git a/test/CodeGen/Generic/stop-after.ll b/test/CodeGen/Generic/stop-after.ll
index 557e097..5e0e350 100644
--- a/test/CodeGen/Generic/stop-after.ll
+++ b/test/CodeGen/Generic/stop-after.ll
@@ -5,6 +5,6 @@
; STOP: Loop Strength Reduction
; STOP-NEXT: Machine Function Analysis
-; START: -machine-branch-prob -gc-lowering
+; START: -machine-branch-prob -jump-instr-tables -gc-lowering
; START: FunctionPass Manager
; START-NEXT: Lower Garbage Collection Instructions
diff --git a/test/CodeGen/Hexagon/lit.local.cfg b/test/CodeGen/Hexagon/lit.local.cfg
index e96bab8..ba72ff6 100644
--- a/test/CodeGen/Hexagon/lit.local.cfg
+++ b/test/CodeGen/Hexagon/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Hexagon' in targets:
+if not 'Hexagon' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/MSP430/lit.local.cfg b/test/CodeGen/MSP430/lit.local.cfg
index a18fe6f..b1cf1fb 100644
--- a/test/CodeGen/MSP430/lit.local.cfg
+++ b/test/CodeGen/MSP430/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'MSP430' in targets:
+if not 'MSP430' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Mips/2008-08-01-AsmInline.ll b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
index e274bc0..3c1bb39 100644
--- a/test/CodeGen/Mips/2008-08-01-AsmInline.ll
+++ b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s
; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
%struct.DWstruct = type { i32, i32 }
diff --git a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
index f8390d9..6a210a0 100644
--- a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
+++ b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=mips -mattr=-fp64 < %s | FileCheck -check-prefix=CHECK-FP32 %s
-; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s
; This test case is a simplified version of an llvm-stress generated test with
; seed=3718491962.
diff --git a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
new file mode 100644
index 0000000..f113a0e
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'loadstore2.c'
+target datalayout = "E-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@c2 = common global i8 0, align 1
+@c1 = common global i8 0, align 1
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s
+
+@s2 = common global i16 0, align 2
+@s1 = common global i16 0, align 2
+@i2 = common global i32 0, align 4
+@i1 = common global i32 0, align 4
+@f2 = common global float 0.000000e+00, align 4
+@f1 = common global float 0.000000e+00, align 4
+@d2 = common global double 0.000000e+00, align 8
+@d1 = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @cfoo() #0 {
+entry:
+ %0 = load i8* @c2, align 1
+ store i8 %0, i8* @c1, align 1
+; CHECK-LABEL: cfoo:
+; CHECK: lbu $[[REGc:[0-9]+]], 0(${{[0-9]+}})
+; CHECK: sb $[[REGc]], 0(${{[0-9]+}})
+
+
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @sfoo() #0 {
+entry:
+ %0 = load i16* @s2, align 2
+ store i16 %0, i16* @s1, align 2
+; CHECK-LABEL: sfoo:
+; CHECK: lhu $[[REGs:[0-9]+]], 0(${{[0-9]+}})
+; CHECK: sh $[[REGs]], 0(${{[0-9]+}})
+
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @ifoo() #0 {
+entry:
+ %0 = load i32* @i2, align 4
+ store i32 %0, i32* @i1, align 4
+; CHECK-LABEL: ifoo:
+; CHECK: lw $[[REGi:[0-9]+]], 0(${{[0-9]+}})
+; CHECK: sw $[[REGi]], 0(${{[0-9]+}})
+
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @ffoo() #0 {
+entry:
+ %0 = load float* @f2, align 4
+ store float %0, float* @f1, align 4
+; CHECK-LABEL: ffoo:
+; CHECK: lwc1 $f[[REGf:[0-9]+]], 0(${{[0-9]+}})
+; CHECK: swc1 $f[[REGf]], 0(${{[0-9]+}})
+
+
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @dfoo() #0 {
+entry:
+ %0 = load double* @d2, align 8
+ store double %0, double* @d1, align 8
+; CHECK-LABEL: dfoo:
+; CHECK: ldc1 $f[[REGd:[0-9]+]], 0(${{[0-9]+}})
+; CHECK: sdc1 $f[[REGd]], 0(${{[0-9]+}})
+; CHECK: .end dfoo
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
new file mode 100644
index 0000000..6759c01
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s
+
+@f = common global float 0.000000e+00, align 4
+@de = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @f1() #0 {
+entry:
+ store float 0x3FFA76C8C0000000, float* @f, align 4
+ ret void
+; CHECK: .ent f1
+; CHECK: lui $[[REG1:[0-9]+]], 16339
+; CHECK: ori $[[REG2:[0-9]+]], $[[REG1]], 46662
+; CHECK: mtc1 $[[REG2]], $f[[REG3:[0-9]+]]
+; CHECK: lw $[[REG4:[0-9]+]], %got(f)(${{[0-9]+}})
+; CHECK: swc1 $f[[REG3]], 0($[[REG4]])
+; CHECK: .end f1
+
+}
+
+; Function Attrs: nounwind
+define void @d1() #0 {
+entry:
+ store double 1.234567e+00, double* @de, align 8
+; CHECK: .ent d1
+; CHECK: lui $[[REG1a:[0-9]+]], 16371
+; CHECK: ori $[[REG2a:[0-9]+]], $[[REG1a]], 49353
+; CHECK: lui $[[REG1b:[0-9]+]], 21403
+; CHECK: ori $[[REG2b:[0-9]+]], $[[REG1b]], 34951
+; CHECK: mtc1 $[[REG2b]], $f[[REG3:[0-9]+]]
+; CHECK: mthc1 $[[REG2a]], $f[[REG3]]
+; CHECK: sdc1 $f[[REG3]], 0(${{[0-9]+}})
+; CHECK: .end d1
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/abiflags-xx.ll b/test/CodeGen/Mips/abiflags-xx.ll
new file mode 100644
index 0000000..b8aa071
--- /dev/null
+++ b/test/CodeGen/Mips/abiflags-xx.ll
@@ -0,0 +1,6 @@
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fpxx %s -o - | FileCheck %s
+; XFAIL: *
+
+; CHECK: .nan legacy
+; CHECK: .module fp=xx
+
diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll
new file mode 100644
index 0000000..093964f
--- /dev/null
+++ b/test/CodeGen/Mips/abiflags32.ll
@@ -0,0 +1,12 @@
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fp64 %s -o - | FileCheck -check-prefix=CHECK-64 %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o - | FileCheck -check-prefix=CHECK-64n %s
+
+; CHECK: .nan legacy
+; CHECK: .module fp=32
+
+; CHECK-64: .nan legacy
+; CHECK-64: .module fp=64
+
+; CHECK-64n: .nan legacy
+; CHECK-64n: .module fp=64
diff --git a/test/CodeGen/Mips/analyzebranch.ll b/test/CodeGen/Mips/analyzebranch.ll
index 8ec5d93..4b5d097 100644
--- a/test/CodeGen/Mips/analyzebranch.ll
+++ b/test/CodeGen/Mips/analyzebranch.ll
@@ -1,9 +1,25 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR
+; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
+; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=FCC
+; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR
define double @foo(double %a, double %b) nounwind readnone {
entry:
-; CHECK: bc1f $BB
-; CHECK: nop
+; ALL-LABEL: foo:
+
+; FCC: bc1f $BB
+; FCC: nop
+
+; 32-GPR: mtc1 $zero, $[[Z:f[0-9]]]
+; 32-GPR: mthc1 $zero, $[[Z]]
+; 64-GPR: dmtc1 $zero, $[[Z:f[0-9]]]
+; GPR: cmp.lt.d $[[FGRCC:f[0-9]+]], $[[Z]], $f12
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: bnez $[[GPRCC]], $BB
%cmp = fcmp ogt double %a, 0.000000e+00
br i1 %cmp, label %if.end6, label %if.else
@@ -25,8 +41,17 @@ return: ; preds = %if.else, %if.end6
define void @f1(float %f) nounwind {
entry:
-; CHECK: bc1f $BB
-; CHECK: nop
+; ALL-LABEL: f1:
+
+; FCC: bc1f $BB
+; FCC: nop
+
+; GPR: mtc1 $zero, $[[Z:f[0-9]]]
+; GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $[[Z]]
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: beqz $[[GPRCC]], $BB
+
%cmp = fcmp une float %f, 0.000000e+00
br i1 %cmp, label %if.then, label %if.end
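The FCC/GPR split above exists because MIPS32r6 and MIPS64r6 removed the FP condition-code register and the bc1t/bc1f branches: on R6 an fcmp is instead expected to produce an all-ones/all-zeros mask in an FPU register (cmp.eq.s, cmp.lt.d, and friends), which is moved to a GPR with mfc1 and branched on with beqz/bnez. A minimal sketch of the IR shape these prefixes exercise, with a hypothetical function name that is not part of the patch:

define i32 @fp_branch(float %a, float %b) nounwind {
entry:
  ; Pre-R6 (FCC): e.g. c.olt.s sets an FP condition code and bc1f/bc1t
  ; branches on it.
  ; R6 (GPR): cmp.lt.s writes a mask into an FPU register, mfc1 copies it
  ; to a GPR, and an ordinary beqz/bnez performs the branch.
  %cmp = fcmp olt float %a, %b
  br i1 %cmp, label %less, label %notless

less:
  ret i32 1

notless:
  ret i32 0
}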
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 77d7bf3..066d42c 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,5 +1,14 @@
-; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s -check-prefix=CHECK-EL
-; RUN: llc -march=mips --disable-machine-licm < %s | FileCheck %s -check-prefix=CHECK-EB
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mipsel --disable-machine-licm -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
+; RUN: llc -march=mips64el --disable-machine-licm -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64-ANY -check-prefix=HAS-SEB-SEH -check-prefix=CHECK-EL
+
+; Keep one big-endian check so that we don't reduce testing, but don't add more
+; since endianness doesn't affect the body of the atomic operations.
+; RUN: llc -march=mips --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=CHECK-EB
@x = common global i32 0, align 4
@@ -8,21 +17,16 @@ entry:
%0 = atomicrmw add i32* @x, i32 %incr monotonic
ret i32 %0
-; CHECK-EL-LABEL: AtomicLoadAdd32:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R1:[0-9]+]], 0($[[R0]])
-; CHECK-EL: addu $[[R2:[0-9]+]], $[[R1]], $4
-; CHECK-EL: sc $[[R2]], 0($[[R0]])
-; CHECK-EL: beqz $[[R2]], $[[BB0]]
-
-; CHECK-EB-LABEL: AtomicLoadAdd32:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R1:[0-9]+]], 0($[[R0]])
-; CHECK-EB: addu $[[R2:[0-9]+]], $[[R1]], $4
-; CHECK-EB: sc $[[R2]], 0($[[R0]])
-; CHECK-EB: beqz $[[R2]], $[[BB0]]
+; ALL-LABEL: AtomicLoadAdd32:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R1:[0-9]+]], 0($[[R0]])
+; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4
+; ALL: sc $[[R2]], 0($[[R0]])
+; ALL: beqz $[[R2]], $[[BB0]]
}
define i32 @AtomicLoadNand32(i32 %incr) nounwind {
@@ -30,23 +34,17 @@ entry:
%0 = atomicrmw nand i32* @x, i32 %incr monotonic
ret i32 %0
-; CHECK-EL-LABEL: AtomicLoadNand32:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R1:[0-9]+]], 0($[[R0]])
-; CHECK-EL: and $[[R3:[0-9]+]], $[[R1]], $4
-; CHECK-EL: nor $[[R2:[0-9]+]], $zero, $[[R3]]
-; CHECK-EL: sc $[[R2]], 0($[[R0]])
-; CHECK-EL: beqz $[[R2]], $[[BB0]]
-
-; CHECK-EB-LABEL: AtomicLoadNand32:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R1:[0-9]+]], 0($[[R0]])
-; CHECK-EB: and $[[R3:[0-9]+]], $[[R1]], $4
-; CHECK-EB: nor $[[R2:[0-9]+]], $zero, $[[R3]]
-; CHECK-EB: sc $[[R2]], 0($[[R0]])
-; CHECK-EB: beqz $[[R2]], $[[BB0]]
+; ALL-LABEL: AtomicLoadNand32:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R1:[0-9]+]], 0($[[R0]])
+; ALL: and $[[R3:[0-9]+]], $[[R1]], $4
+; ALL: nor $[[R2:[0-9]+]], $zero, $[[R3]]
+; ALL: sc $[[R2]], 0($[[R0]])
+; ALL: beqz $[[R2]], $[[BB0]]
}
define i32 @AtomicSwap32(i32 %newval) nounwind {
@@ -57,19 +55,15 @@ entry:
%0 = atomicrmw xchg i32* @x, i32 %tmp monotonic
ret i32 %0
-; CHECK-EL-LABEL: AtomicSwap32:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll ${{[0-9]+}}, 0($[[R0]])
-; CHECK-EL: sc $[[R2:[0-9]+]], 0($[[R0]])
-; CHECK-EL: beqz $[[R2]], $[[BB0]]
-
-; CHECK-EB-LABEL: AtomicSwap32:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll ${{[0-9]+}}, 0($[[R0]])
-; CHECK-EB: sc $[[R2:[0-9]+]], 0($[[R0]])
-; CHECK-EB: beqz $[[R2]], $[[BB0]]
+; ALL-LABEL: AtomicSwap32:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll ${{[0-9]+}}, 0($[[R0]])
+; ALL: sc $[[R2:[0-9]+]], 0($[[R0]])
+; ALL: beqz $[[R2]], $[[BB0]]
}
define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind {
@@ -78,25 +72,20 @@ entry:
store i32 %newval, i32* %newval.addr, align 4
%tmp = load i32* %newval.addr, align 4
%0 = cmpxchg i32* @x, i32 %oldval, i32 %tmp monotonic monotonic
- ret i32 %0
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+
+; ALL-LABEL: AtomicCmpSwap32:
-; CHECK-EL-LABEL: AtomicCmpSwap32:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $2, 0($[[R0]])
-; CHECK-EL: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
-; CHECK-EL: sc $[[R2:[0-9]+]], 0($[[R0]])
-; CHECK-EL: beqz $[[R2]], $[[BB0]]
-; CHECK-EL: $[[BB1]]:
-
-; CHECK-EB-LABEL: AtomicCmpSwap32:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(x)
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $2, 0($[[R0]])
-; CHECK-EB: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
-; CHECK-EB: sc $[[R2:[0-9]+]], 0($[[R0]])
-; CHECK-EB: beqz $[[R2]], $[[BB0]]
-; CHECK-EB: $[[BB1]]:
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $2, 0($[[R0]])
+; ALL: bne $2, $4, $[[BB1:[A-Z_0-9]+]]
+; ALL: sc $[[R2:[0-9]+]], 0($[[R0]])
+; ALL: beqz $[[R2]], $[[BB0]]
+; ALL: $[[BB1]]:
}
@@ -108,56 +97,38 @@ entry:
%0 = atomicrmw add i8* @y, i8 %incr monotonic
ret i8 %0
-; CHECK-EL-LABEL: AtomicLoadAdd8:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255
-; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]]
-; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]]
-; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]]
-
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]]
-; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EL: sc $[[R14]], 0($[[R2]])
-; CHECK-EL: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]]
-; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]]
-; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EL: sra $2, $[[R17]], 24
-
-; CHECK-EB-LABEL: AtomicLoadAdd8:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
-; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255
-; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]]
-; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]]
-
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EB: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EB: sc $[[R14]], 0($[[R2]])
-; CHECK-EB: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
-; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EB: sra $2, $[[R17]], 24
+; ALL-LABEL: AtomicLoadAdd8:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
+; ALL: ori $[[R6:[0-9]+]], $zero, 255
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
+; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
+; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; ALL: sc $[[R14]], 0($[[R2]])
+; ALL: beqz $[[R14]], $[[BB0]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
+; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
+; NO-SEB-SEH: sra $2, $[[R17]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R16]]
}
define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind {
@@ -165,56 +136,38 @@ entry:
%0 = atomicrmw sub i8* @y, i8 %incr monotonic
ret i8 %0
-; CHECK-EL-LABEL: AtomicLoadSub8:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255
-; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]]
-; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]]
-; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]]
-
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EL: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]]
-; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EL: sc $[[R14]], 0($[[R2]])
-; CHECK-EL: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]]
-; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]]
-; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EL: sra $2, $[[R17]], 24
-
-; CHECK-EB-LABEL: AtomicLoadSub8:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3
+; ALL-LABEL: AtomicLoadSub8:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
-; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255
-; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]]
-; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]]
-
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EB: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EB: sc $[[R14]], 0($[[R2]])
-; CHECK-EB: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
-; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EB: sra $2, $[[R17]], 24
+; ALL: ori $[[R6:[0-9]+]], $zero, 255
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
+; ALL: subu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
+; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; ALL: sc $[[R14]], 0($[[R2]])
+; ALL: beqz $[[R14]], $[[BB0]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
+; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
+; NO-SEB-SEH: sra $2, $[[R17]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R16]]
}
define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind {
@@ -222,58 +175,39 @@ entry:
%0 = atomicrmw nand i8* @y, i8 %incr monotonic
ret i8 %0
-; CHECK-EL-LABEL: AtomicLoadNand8:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255
-; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]]
-; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]]
-; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]]
-
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EL: and $[[R18:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EL: nor $[[R11:[0-9]+]], $zero, $[[R18]]
-; CHECK-EL: and $[[R12:[0-9]+]], $[[R11]], $[[R6]]
-; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EL: sc $[[R14]], 0($[[R2]])
-; CHECK-EL: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]]
-; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]]
-; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EL: sra $2, $[[R17]], 24
-
-; CHECK-EB-LABEL: AtomicLoadNand8:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
-; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255
-; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]]
-; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]]
-
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EB: and $[[R18:[0-9]+]], $[[R10]], $[[R9]]
-; CHECK-EB: nor $[[R11:[0-9]+]], $zero, $[[R18]]
-; CHECK-EB: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
-; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
-; CHECK-EB: sc $[[R14]], 0($[[R2]])
-; CHECK-EB: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
-; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EB: sra $2, $[[R17]], 24
+; ALL-LABEL: AtomicLoadNand8:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
+; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
+; ALL: ori $[[R6:[0-9]+]], $zero, 255
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
+; ALL: and $[[R18:[0-9]+]], $[[R10]], $[[R9]]
+; ALL: nor $[[R11:[0-9]+]], $zero, $[[R18]]
+; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
+; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; ALL: sc $[[R14]], 0($[[R2]])
+; ALL: beqz $[[R14]], $[[BB0]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
+; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
+; NO-SEB-SEH: sra $2, $[[R17]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R16]]
}
define signext i8 @AtomicSwap8(i8 signext %newval) nounwind {
@@ -281,121 +215,126 @@ entry:
%0 = atomicrmw xchg i8* @y, i8 %newval monotonic
ret i8 %0
-; CHECK-EL-LABEL: AtomicSwap8:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255
-; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]]
-; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]]
-; CHECK-EL: sllv $[[R9:[0-9]+]], $4, $[[R4]]
-
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EL: and $[[R18:[0-9]+]], $[[R9]], $[[R6]]
-; CHECK-EL: and $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]]
-; CHECK-EL: sc $[[R14]], 0($[[R2]])
-; CHECK-EL: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EL: and $[[R15:[0-9]+]], $[[R10]], $[[R6]]
-; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R4]]
-; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EL: sra $2, $[[R17]], 24
-
-; CHECK-EB-LABEL: AtomicSwap8:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
-; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255
-; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]]
-; CHECK-EB: sllv $[[R9:[0-9]+]], $4, $[[R5]]
-
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R10:[0-9]+]], 0($[[R2]])
-; CHECK-EB: and $[[R18:[0-9]+]], $[[R9]], $[[R7]]
-; CHECK-EB: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
-; CHECK-EB: or $[[R14:[0-9]+]], $[[R13]], $[[R18]]
-; CHECK-EB: sc $[[R14]], 0($[[R2]])
-; CHECK-EB: beqz $[[R14]], $[[BB0]]
-
-; CHECK-EB: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK-EB: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
-; CHECK-EB: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EB: sra $2, $[[R17]], 24
+; ALL-LABEL: AtomicSwap8:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
+; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
+; ALL: ori $[[R6:[0-9]+]], $zero, 255
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
+; ALL: and $[[R18:[0-9]+]], $[[R9]], $[[R7]]
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
+; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R18]]
+; ALL: sc $[[R14]], 0($[[R2]])
+; ALL: beqz $[[R14]], $[[BB0]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
+; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 24
+; NO-SEB-SEH: sra $2, $[[R17]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R16]]
}
define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
entry:
- %0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic
+ %pair0 = cmpxchg i8* @y, i8 %oldval, i8 %newval monotonic monotonic
+ %0 = extractvalue { i8, i1 } %pair0, 0
ret i8 %0
-; CHECK-EL-LABEL: AtomicCmpSwap8:
-; CHECK-EL: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EL: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EL: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EL: sll $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EL: ori $[[R5:[0-9]+]], $zero, 255
-; CHECK-EL: sllv $[[R6:[0-9]+]], $[[R5]], $[[R4]]
-; CHECK-EL: nor $[[R7:[0-9]+]], $zero, $[[R6]]
-; CHECK-EL: andi $[[R8:[0-9]+]], $4, 255
-; CHECK-EL: sllv $[[R9:[0-9]+]], $[[R8]], $[[R4]]
-; CHECK-EL: andi $[[R10:[0-9]+]], $5, 255
-; CHECK-EL: sllv $[[R11:[0-9]+]], $[[R10]], $[[R4]]
-
-; CHECK-EL: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EL: ll $[[R12:[0-9]+]], 0($[[R2]])
-; CHECK-EL: and $[[R13:[0-9]+]], $[[R12]], $[[R6]]
-; CHECK-EL: bne $[[R13]], $[[R9]], $[[BB1:[A-Z_0-9]+]]
-
-; CHECK-EL: and $[[R14:[0-9]+]], $[[R12]], $[[R7]]
-; CHECK-EL: or $[[R15:[0-9]+]], $[[R14]], $[[R11]]
-; CHECK-EL: sc $[[R15]], 0($[[R2]])
-; CHECK-EL: beqz $[[R15]], $[[BB0]]
-
-; CHECK-EL: $[[BB1]]:
-; CHECK-EL: srlv $[[R16:[0-9]+]], $[[R13]], $[[R4]]
-; CHECK-EL: sll $[[R17:[0-9]+]], $[[R16]], 24
-; CHECK-EL: sra $2, $[[R17]], 24
-
-; CHECK-EB-LABEL: AtomicCmpSwap8:
-; CHECK-EB: lw $[[R0:[0-9]+]], %got(y)
-; CHECK-EB: addiu $[[R1:[0-9]+]], $zero, -4
-; CHECK-EB: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
-; CHECK-EB: andi $[[R3:[0-9]+]], $[[R0]], 3
-; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
-; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
-; CHECK-EB: ori $[[R6:[0-9]+]], $zero, 255
-; CHECK-EB: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; CHECK-EB: nor $[[R8:[0-9]+]], $zero, $[[R7]]
-; CHECK-EB: andi $[[R9:[0-9]+]], $4, 255
-; CHECK-EB: sllv $[[R10:[0-9]+]], $[[R9]], $[[R5]]
-; CHECK-EB: andi $[[R11:[0-9]+]], $5, 255
-; CHECK-EB: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]]
-
-; CHECK-EB: $[[BB0:[A-Z_0-9]+]]:
-; CHECK-EB: ll $[[R13:[0-9]+]], 0($[[R2]])
-; CHECK-EB: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
-; CHECK-EB: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
-
-; CHECK-EB: and $[[R15:[0-9]+]], $[[R13]], $[[R8]]
-; CHECK-EB: or $[[R16:[0-9]+]], $[[R15]], $[[R12]]
-; CHECK-EB: sc $[[R16]], 0($[[R2]])
-; CHECK-EB: beqz $[[R16]], $[[BB0]]
-
-; CHECK-EB: $[[BB1]]:
-; CHECK-EB: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]]
-; CHECK-EB: sll $[[R18:[0-9]+]], $[[R17]], 24
-; CHECK-EB: sra $2, $[[R18]], 24
+; ALL-LABEL: AtomicCmpSwap8:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(y)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(y)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
+; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
+; ALL: ori $[[R6:[0-9]+]], $zero, 255
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: andi $[[R9:[0-9]+]], $4, 255
+; ALL: sllv $[[R10:[0-9]+]], $[[R9]], $[[R5]]
+; ALL: andi $[[R11:[0-9]+]], $5, 255
+; ALL: sllv $[[R12:[0-9]+]], $[[R11]], $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R13:[0-9]+]], 0($[[R2]])
+; ALL: and $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL: bne $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R13]], $[[R8]]
+; ALL: or $[[R16:[0-9]+]], $[[R15]], $[[R12]]
+; ALL: sc $[[R16]], 0($[[R2]])
+; ALL: beqz $[[R16]], $[[BB0]]
+
+; ALL: $[[BB1]]:
+; ALL: srlv $[[R17:[0-9]+]], $[[R14]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R18:[0-9]+]], $[[R17]], 24
+; NO-SEB-SEH: sra $2, $[[R18]], 24
+
+; HAS-SEB-SEH: seb $2, $[[R17]]
+}
+
+; Check one i16 case so that we cover the seh sign extension
+@z = common global i16 0, align 1
+
+define signext i16 @AtomicLoadAdd16(i16 signext %incr) nounwind {
+entry:
+ %0 = atomicrmw add i16* @z, i16 %incr monotonic
+ ret i16 %0
+
+; ALL-LABEL: AtomicLoadAdd16:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(z)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(z)(
+
+; ALL: addiu $[[R1:[0-9]+]], $zero, -4
+; ALL: and $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; ALL: andi $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK-EB: xori $[[R4:[0-9]+]], $[[R3]], 2
+; CHECK-EB: sll $[[R5:[0-9]+]], $[[R4]], 3
+; CHECK-EL: sll $[[R5:[0-9]+]], $[[R3]], 3
+; ALL: ori $[[R6:[0-9]+]], $zero, 65535
+; ALL: sllv $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL: nor $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL: sllv $[[R9:[0-9]+]], $4, $[[R5]]
+
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R10:[0-9]+]], 0($[[R2]])
+; ALL: addu $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; ALL: and $[[R12:[0-9]+]], $[[R11]], $[[R7]]
+; ALL: and $[[R13:[0-9]+]], $[[R10]], $[[R8]]
+; ALL: or $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; ALL: sc $[[R14]], 0($[[R2]])
+; ALL: beqz $[[R14]], $[[BB0]]
+
+; ALL: and $[[R15:[0-9]+]], $[[R10]], $[[R7]]
+; ALL: srlv $[[R16:[0-9]+]], $[[R15]], $[[R5]]
+
+; NO-SEB-SEH: sll $[[R17:[0-9]+]], $[[R16]], 16
+; NO-SEB-SEH: sra $2, $[[R17]], 16
+
+; HAS-SEB-SEH: seh $2, $[[R16]]
}
+
@countsint = common global i32 0, align 4
define i32 @CheckSync(i32 %v) nounwind noinline {
@@ -403,19 +342,13 @@ entry:
%0 = atomicrmw add i32* @countsint, i32 %v seq_cst
ret i32 %0
-; CHECK-EL-LABEL: CheckSync:
-; CHECK-EL: sync 0
-; CHECK-EL: ll
-; CHECK-EL: sc
-; CHECK-EL: beq
-; CHECK-EL: sync 0
-
-; CHECK-EB-LABEL: CheckSync:
-; CHECK-EB: sync 0
-; CHECK-EB: ll
-; CHECK-EB: sc
-; CHECK-EB: beq
-; CHECK-EB: sync 0
+; ALL-LABEL: CheckSync:
+
+; ALL: sync
+; ALL: ll
+; ALL: sc
+; ALL: beq
+; ALL: sync
}
; make sure that this assertion in
@@ -429,8 +362,29 @@ entry:
define i32 @zeroreg() nounwind {
entry:
- %0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst seq_cst
+ %pair0 = cmpxchg i32* @a, i32 1, i32 0 seq_cst seq_cst
+ %0 = extractvalue { i32, i1 } %pair0, 0
%1 = icmp eq i32 %0, 1
%conv = zext i1 %1 to i32
ret i32 %conv
}
+
+; Check that MIPS32R6 has the correct offset range.
+; FIXME: At the moment, we don't seem to do addr+offset for any atomic load/store.
+define i32 @AtomicLoadAdd32_OffGt9Bit(i32 %incr) nounwind {
+entry:
+ %0 = atomicrmw add i32* getelementptr(i32* @x, i32 256), i32 %incr monotonic
+ ret i32 %0
+
+; ALL-LABEL: AtomicLoadAdd32_OffGt9Bit:
+
+; MIPS32-ANY: lw $[[R0:[0-9]+]], %got(x)
+; MIPS64-ANY: ld $[[R0:[0-9]+]], %got_disp(x)(
+
+; ALL: addiu $[[PTR:[0-9]+]], $[[R0]], 1024
+; ALL: $[[BB0:[A-Z_0-9]+]]:
+; ALL: ll $[[R1:[0-9]+]], 0($[[PTR]])
+; ALL: addu $[[R2:[0-9]+]], $[[R1]], $4
+; ALL: sc $[[R2]], 0($[[PTR]])
+; ALL: beqz $[[R2]], $[[BB0]]
+}
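Several of the updates above are mechanical fallout from the IR change that made cmpxchg return a { value, success } struct instead of a bare value, so each old result is now recovered with extractvalue. A minimal sketch of the new form, using hypothetical names that are not part of the patch:

@v = common global i32 0, align 4

define i32 @cas(i32 %old, i32 %new) nounwind {
entry:
  ; cmpxchg now yields both the loaded value and an i1 success flag.
  %pair = cmpxchg i32* @v, i32 %old, i32 %new seq_cst seq_cst
  %loaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %done, label %failed

failed:                                 ; the swap did not happen
  ret i32 -1

done:                                   ; %loaded is known to equal %old here
  ret i32 %loaded
}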
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
index dc07c63..c264152 100644
--- a/test/CodeGen/Mips/atomicops.ll
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -20,7 +20,8 @@ entry:
%add.i = add nsw i32 %0, 2
%1 = load volatile i32* %x, align 4
%call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
- %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst
+ %pair = cmpxchg i32* %x, i32 1, i32 2 seq_cst seq_cst
+ %2 = extractvalue { i32, i1 } %pair, 0
%3 = load volatile i32* %x, align 4
%call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
%4 = atomicrmw xchg i32* %x, i32 1 seq_cst
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index b9bf2b6..88d1d07 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
-; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
@a = external global i32
diff --git a/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll
new file mode 100644
index 0000000..4b28b99
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/callee-saved-fpxx.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
+
+; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
+; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX %s
+; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
+; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=ALL --check-prefix=O32-FPXX-INV %s
+
+define void @fpu_clobber() nounwind {
+entry:
+ call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f13},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+ ret void
+}
+
+; O32-FPXX-LABEL: fpu_clobber:
+; O32-FPXX-INV-NOT: sdc1 $f0,
+; O32-FPXX-INV-NOT: sdc1 $f1,
+; O32-FPXX-INV-NOT: sdc1 $f2,
+; O32-FPXX-INV-NOT: sdc1 $f3,
+; O32-FPXX-INV-NOT: sdc1 $f4,
+; O32-FPXX-INV-NOT: sdc1 $f5,
+; O32-FPXX-INV-NOT: sdc1 $f6,
+; O32-FPXX-INV-NOT: sdc1 $f7,
+; O32-FPXX-INV-NOT: sdc1 $f8,
+; O32-FPXX-INV-NOT: sdc1 $f9,
+; O32-FPXX-INV-NOT: sdc1 $f10,
+; O32-FPXX-INV-NOT: sdc1 $f11,
+; O32-FPXX-INV-NOT: sdc1 $f12,
+; O32-FPXX-INV-NOT: sdc1 $f13,
+; O32-FPXX-INV-NOT: sdc1 $f14,
+; O32-FPXX-INV-NOT: sdc1 $f15,
+; O32-FPXX-INV-NOT: sdc1 $f16,
+; O32-FPXX-INV-NOT: sdc1 $f17,
+; O32-FPXX-INV-NOT: sdc1 $f18,
+; O32-FPXX-INV-NOT: sdc1 $f19,
+; O32-FPXX-INV-NOT: sdc1 $f21,
+; O32-FPXX-INV-NOT: sdc1 $f23,
+; O32-FPXX-INV-NOT: sdc1 $f25,
+; O32-FPXX-INV-NOT: sdc1 $f27,
+; O32-FPXX-INV-NOT: sdc1 $f29,
+; O32-FPXX-INV-NOT: sdc1 $f31,
+
+; O32-FPXX: addiu $sp, $sp, -48
+; O32-FPXX-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp)
+; O32-FPXX-DAG: sdc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp)
+; O32-FPXX-DAG: sdc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp)
+; O32-FPXX-DAG: sdc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp)
+; O32-FPXX-DAG: sdc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp)
+; O32-FPXX-DAG: sdc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp)
+; O32-FPXX-DAG: ldc1 [[F20]], [[OFF20]]($sp)
+; O32-FPXX-DAG: ldc1 [[F22]], [[OFF22]]($sp)
+; O32-FPXX-DAG: ldc1 [[F24]], [[OFF24]]($sp)
+; O32-FPXX-DAG: ldc1 [[F26]], [[OFF26]]($sp)
+; O32-FPXX-DAG: ldc1 [[F28]], [[OFF28]]($sp)
+; O32-FPXX-DAG: ldc1 [[F30]], [[OFF30]]($sp)
+; O32-FPXX: addiu $sp, $sp, 48
diff --git a/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll b/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll
new file mode 100644
index 0000000..489879e
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/callee-saved-fpxx1.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=O32-FP64-INV %s
+; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=O32-FP64-INV %s
+
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s
+
+; RUN-TODO: llc -march=mips64 -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s
+; RUN-TODO: llc -march=mips64el -mattr=+o32,+fpxx < %s | FileCheck --check-prefix=O32-FPXX %s
+
+define void @fpu_clobber() nounwind {
+entry:
+ call void asm "# Clobber", "~{$f21}"()
+ ret void
+}
+
+; O32-FPXX-LABEL: fpu_clobber:
+
+; O32-FPXX: addiu $sp, $sp, -8
+
+; O32-FP64-INV-NOT: sdc1 $f20,
+; O32-FPXX-DAG: sdc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp)
+; O32-FPXX-DAG: ldc1 [[F20]], [[OFF20]]($sp)
+
+; O32-FPXX: addiu $sp, $sp, 8
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index b9732eb..999bdb4 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -1,17 +1,43 @@
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips -regalloc=basic < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=N64
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
+; RUN: llc -march=mips -mcpu=mips32 -regalloc=basic < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-CMP
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-CMP
@i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
@i3 = common global i32* null, align 4
-; O32-DAG: lw $[[R0:[0-9]+]], %got(i3)
-; O32-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1)
-; O32: movn $[[R0]], $[[R1]], ${{[0-9]+}}
-; N64-DAG: ldr $[[R0:[0-9]+]]
-; N64-DAG: ld $[[R1:[0-9]+]], %got_disp(i1)
-; N64: movn $[[R0]], $[[R1]], ${{[0-9]+}}
+; ALL-LABEL: cmov1:
+
+; 32-CMOV-DAG: lw $[[R0:[0-9]+]], %got(i3)
+; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1)
+; 32-CMOV-DAG: movn $[[R0]], $[[R1]], $4
+; 32-CMOV-DAG: lw $2, 0($[[R0]])
+
+; 32-CMP-DAG: lw $[[R0:[0-9]+]], %got(i3)
+; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(i1)
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R1]], $4
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R0]], $4
+; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG: lw $2, 0($[[T2]])
+
+; 64-CMOV-DAG: ldr $[[R0:[0-9]+]]
+; 64-CMOV-DAG: ld $[[R1:[0-9]+]], %got_disp(i1)
+; 64-CMOV-DAG: movn $[[R0]], $[[R1]], $4
+
+; 64-CMP-DAG: ld $[[R0:[0-9]+]], %got_disp(i3)(
+; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(i1)
+; FIXME: This sll works around an implementation detail in the code generator
+; (setcc's result is i32 so bits 32-63 are undefined). It's not really
+; needed.
+; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R1]], $[[CC]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R0]], $[[CC]]
+; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG: ld $2, 0($[[T2]])
+
define i32* @cmov1(i32 %s) nounwind readonly {
entry:
%tobool = icmp ne i32 %s, 0
@@ -23,14 +49,35 @@ entry:
@c = global i32 1, align 4
@d = global i32 0, align 4
-; O32-LABEL: cmov2:
-; O32: addiu $[[R1:[0-9]+]], ${{[a-z0-9]+}}, %got(d)
-; O32: addiu $[[R0:[0-9]+]], ${{[a-z0-9]+}}, %got(c)
-; O32: movn $[[R1]], $[[R0]], ${{[0-9]+}}
-; N64-LABEL: cmov2:
-; N64: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d)
-; N64: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c)
-; N64: movn $[[R1]], $[[R0]], ${{[0-9]+}}
+; ALL-LABEL: cmov2:
+
+; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(d)
+; 32-CMOV-DAG: addiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got(c)
+; 32-CMOV-DAG: movn $[[R1]], $[[R0]], $4
+; 32-CMOV-DAG: lw $2, 0($[[R1]])
+
+; 32-CMP-DAG: addiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got(d)
+; 32-CMP-DAG: addiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got(c)
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R0]], $4
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R1]], $4
+; 32-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+; 32-CMP-DAG: lw $2, 0($[[T2]])
+
+; 64-CMOV: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d)
+; 64-CMOV: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c)
+; 64-CMOV: movn $[[R1]], $[[R0]], $4
+
+; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, %got_disp(d)
+; 64-CMP-DAG: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %got_disp(c)
+; FIXME: This sll works around an implementation detail in the code generator
+; (setcc's result is i32 so bits 32-63 are undefined). It's not really
+; needed.
+; 64-CMP-DAG: sll $[[CC:[0-9]+]], $4, 0
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[R0]], $[[CC]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[R1]], $[[CC]]
+; 64-CMP-DAG: or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
+; 64-CMP-DAG: lw $2, 0($[[T2]])
+
define i32 @cmov2(i32 %s) nounwind readonly {
entry:
%tobool = icmp ne i32 %s, 0
@@ -40,9 +87,28 @@ entry:
ret i32 %cond
}
-; O32-LABEL: cmov3:
-; O32: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234
-; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: cmov3:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be one of two registers so we can at least
+; check that.
+
+; 32-CMOV: xori $[[R0:[0-9]+]], $4, 234
+; 32-CMOV: movz ${{[26]}}, $5, $[[R0]]
+
+; 32-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[CC]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[CC]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234
+; 64-CMOV: movz ${{[26]}}, $5, $[[R0]]
+
+; 64-CMP-DAG: xori $[[CC:[0-9]+]], $4, 234
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[CC]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[CC]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
%cmp = icmp eq i32 %a, 234
@@ -50,9 +116,36 @@ entry:
ret i32 %cond
}
-; N64-LABEL: cmov4:
-; N64: xori $[[R0:[0-9]+]], ${{[0-9]+}}, 234
-; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: cmov4:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be one of two registers so we can at least check
+; that.
+
+; 32-CMOV-DAG: xori $[[R0:[0-9]+]], $4, 234
+; 32-CMOV-DAG: lw $[[R1:2]], 16($sp)
+; 32-CMOV-DAG: lw $[[R2:3]], 20($sp)
+; 32-CMOV-DAG: movz $[[R1]], $6, $[[R0]]
+; 32-CMOV-DAG: movz $[[R2]], $7, $[[R0]]
+
+; 32-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234
+; 32-CMP-DAG: lw $[[R1:[0-9]+]], 16($sp)
+; 32-CMP-DAG: lw $[[R2:[0-9]+]], 20($sp)
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $6, $[[R0]]
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $7, $[[R0]]
+; 32-CMP-DAG: selnez $[[T2:[0-9]+]], $[[R1]], $[[R0]]
+; 32-CMP-DAG: selnez $[[T3:[0-9]+]], $[[R2]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T2]]
+; 32-CMP-DAG: or $3, $[[T1]], $[[T3]]
+
+; 64-CMOV: xori $[[R0:[0-9]+]], $4, 234
+; 64-CMOV: movz ${{[26]}}, $5, $[[R0]]
+
+; 64-CMP-DAG: xori $[[R0:[0-9]+]], $4, 234
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $5, $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $6, $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone {
entry:
%cmp = icmp eq i32 %a, 234
@@ -68,9 +161,33 @@ entry:
; (movz t, (setlt a, N + 1), f)
; if N + 1 fits in 16-bit.
-; O32-LABEL: slti0:
-; O32: slti $[[R0:[0-9]+]], ${{[0-9]+}}, 32767
-; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: slti0:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @slti0(i32 %a) {
entry:
@@ -79,19 +196,72 @@ entry:
ret i32 %cond
}
-; O32-LABEL: slti1:
-; O32: slt ${{[0-9]+}}
+; ALL-LABEL: slti1:
+
+; 32-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 32-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767
+; 32-CMP-DAG: slt $[[R0:[0-9]+]], $[[I32767]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]]
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMP-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @slti1(i32 %a) {
entry:
%cmp = icmp sgt i32 %a, 32767
- %cond = select i1 %cmp, i32 3, i32 5
+ %cond = select i1 %cmp, i32 7, i32 5
ret i32 %cond
}
-; O32-LABEL: slti2:
-; O32: slti $[[R0:[0-9]+]], ${{[0-9]+}}, -32768
-; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: slti2:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @slti2(i32 %a) {
entry:
@@ -100,8 +270,41 @@ entry:
ret i32 %cond
}
-; O32-LABEL: slti3:
-; O32: slt ${{[0-9]+}}
+; ALL-LABEL: slti3:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: lui $[[R1:[0-9]+]], 65535
+; 32-CMOV-DAG: ori $[[R1]], $[[R1]], 32766
+; 32-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: lui $[[IMM:[0-9]+]], 65535
+; 32-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766
+; 32-CMP-DAG: slt $[[R0:[0-9]+]], $[[IMM]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: lui $[[R1:[0-9]+]], 65535
+; 64-CMOV-DAG: ori $[[R1]], $[[R1]], 32766
+; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMP-DAG: lui $[[IMM:[0-9]+]], 65535
+; 64-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766
+; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[IMM]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @slti3(i32 %a) {
entry:
@@ -112,30 +315,117 @@ entry:
; 64-bit patterns.
-; N64-LABEL: slti64_0:
-; N64: slti $[[R0:[0-9]+]], ${{[0-9]+}}, 32767
-; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: slti64_0:
+
+; 32-CMOV-DAG: slt $[[CC:[0-9]+]], $zero, $4
+; 32-CMOV-DAG: addiu $[[I32766:[0-9]+]], $zero, 32766
+; 32-CMOV-DAG: sltu $[[R1:[0-9]+]], $[[I32766]], $5
+; 32-CMOV-DAG: movz $[[CC:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMOV-DAG: addiu $[[I4:3]], $zero, 4
+; 32-CMOV-DAG: movn $[[I4]], $[[I5]], $[[CC]]
+; 32-CMOV-DAG: addiu $2, $zero, 0
+
+; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4
+; 32-CMP-DAG: addiu $[[I32766:[0-9]+]], $zero, 32766
+; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32766]], $5
+; 32-CMP-DAG: selnez $[[CC2:[0-9]+]], $[[CC0]], $4
+; 32-CMP-DAG: seleqz $[[CC3:[0-9]+]], $[[CC1]], $4
+; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]]
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I4]], $[[CC]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[CC]]
+; 32-CMP-DAG: or $3, $[[T1]], $[[T0]]
+; 32-CMP-DAG: addiu $2, $zero, 0
+
+; 64-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMOV-DAG: addiu $[[I4:2]], $zero, 4
+; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; 64-CMOV-DAG: movz $[[I4]], $[[I5]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4
+; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, 32767
+; FIXME: We can do better than this by adding/subtracting the result of slti
+; to/from one of the constants.
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I4]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i64 @slti64_0(i64 %a) {
entry:
%cmp = icmp sgt i64 %a, 32766
- %conv = select i1 %cmp, i64 3, i64 4
+ %conv = select i1 %cmp, i64 5, i64 4
ret i64 %conv
}
-; N64-LABEL: slti64_1:
-; N64: slt ${{[0-9]+}}
+; ALL-LABEL: slti64_1:
+
+; 32-CMOV-DAG: slt $[[CC:[0-9]+]], $zero, $4
+; 32-CMOV-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767
+; 32-CMOV-DAG: sltu $[[R1:[0-9]+]], $[[I32767]], $5
+; 32-CMOV-DAG: movz $[[CC:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMOV-DAG: addiu $[[I4:3]], $zero, 4
+; 32-CMOV-DAG: movn $[[I4]], $[[I5]], $[[CC]]
+; 32-CMOV-DAG: addiu $2, $zero, 0
+
+; 32-CMP-DAG: slt $[[CC0:[0-9]+]], $zero, $4
+; 32-CMP-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767
+; 32-CMP-DAG: sltu $[[CC1:[0-9]+]], $[[I32767]], $5
+; 32-CMP-DAG: selnez $[[CC2:[0-9]+]], $[[CC0]], $4
+; 32-CMP-DAG: seleqz $[[CC3:[0-9]+]], $[[CC1]], $4
+; 32-CMP: or $[[CC:[0-9]+]], $[[CC3]], $[[CC2]]
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I4]], $[[CC]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[CC]]
+; 32-CMP-DAG: or $3, $[[T1]], $[[T0]]
+; 32-CMP-DAG: addiu $2, $zero, 0
+
+; 64-CMOV-DAG: daddiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMOV-DAG: daddiu $[[I4:2]], $zero, 4
+; 64-CMOV-DAG: daddiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I4]], $[[I5]], $[[R0]]
+
+; 64-CMP-DAG: daddiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: daddiu $[[I4:2]], $zero, 4
+; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i64 @slti64_1(i64 %a) {
entry:
%cmp = icmp sgt i64 %a, 32767
- %conv = select i1 %cmp, i64 3, i64 4
+ %conv = select i1 %cmp, i64 5, i64 4
ret i64 %conv
}
-; N64-LABEL: slti64_2:
-; N64: slti $[[R0:[0-9]+]], ${{[0-9]+}}, -32768
-; N64: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: slti64_2:
+
+; FIXME: The 32-bit versions of this test are too complicated to reasonably
+; match at the moment. They do show some missing optimizations though
+; such as:
+; (movz $a, $b, (neg $c)) -> (movn $a, $b, $c)
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I4:2]], $zero, 4
+; 64-CMOV-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; 64-CMOV-DAG: movz $[[I4]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I4:[0-9]+]], $zero, 4
+; 64-CMP-DAG: slti $[[R0:[0-9]+]], $4, -32768
+; FIXME: We can do better than this by adding/subtracting the result of slti
+; to/from one of the constants.
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I4]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i64 @slti64_2(i64 %a) {
entry:
@@ -144,21 +434,64 @@ entry:
ret i64 %conv
}
-; N64-LABEL: slti64_3:
-; N64: slt ${{[0-9]+}}
+; ALL-LABEL: slti64_3:
+
+; FIXME: The 32-bit versions of this test are too complicated to reasonably
+; match at the moment. They do show some missing optimizations though
+; such as:
+; (movz $a, $b, (neg $c)) -> (movn $a, $b, $c)
+
+; 64-CMOV-DAG: daddiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMOV-DAG: daddiu $[[I4:2]], $zero, 4
+; 64-CMOV-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, 32766
+; 64-CMOV-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I4]], $[[I5]], $[[R0]]
+
+; 64-CMP-DAG: daddiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: daddiu $[[I4:2]], $zero, 4
+; 64-CMP-DAG: daddiu $[[R1:[0-9]+]], ${{[0-9]+}}, 32766
+; 64-CMP-DAG: slt $[[R0:[0-9]+]], $[[R1]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I4]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i64 @slti64_3(i64 %a) {
entry:
%cmp = icmp sgt i64 %a, -32770
- %conv = select i1 %cmp, i64 3, i64 4
+ %conv = select i1 %cmp, i64 5, i64 4
ret i64 %conv
}
; sltiu instructions.
-; O32-LABEL: sltiu0:
-; O32: sltiu $[[R0:[0-9]+]], ${{[0-9]+}}, 32767
-; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: sltiu0:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, 32767
+; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, 32767
+; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, 32767
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @sltiu0(i32 %a) {
entry:
@@ -167,19 +500,72 @@ entry:
ret i32 %cond
}
-; O32-LABEL: sltiu1:
-; O32: sltu ${{[0-9]+}}
+; ALL-LABEL: sltiu1:
+
+; 32-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 32-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: addiu $[[I32767:[0-9]+]], $zero, 32767
+; 32-CMP-DAG: sltu $[[R0:[0-9]+]], $[[I32767]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]]
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I5]], $[[I7]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I7:[0-9]+]], $zero, 7
+; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMP-DAG: addiu $[[R1:[0-9]+]], $zero, 32767
+; 64-CMP-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I7]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @sltiu1(i32 %a) {
entry:
%cmp = icmp ugt i32 %a, 32767
- %cond = select i1 %cmp, i32 3, i32 5
+ %cond = select i1 %cmp, i32 7, i32 5
ret i32 %cond
}
-; O32-LABEL: sltiu2:
-; O32: sltiu $[[R0:[0-9]+]], ${{[0-9]+}}, -32768
-; O32: movz ${{[0-9]+}}, ${{[0-9]+}}, $[[R0]]
+; ALL-LABEL: sltiu2:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, -32768
+; 32-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 32-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: sltiu $[[R0:[0-9]+]], $4, -32768
+; 64-CMOV-DAG: movz $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 64-CMP-DAG: sltiu $[[R0:[0-9]+]], $4, -32768
+; FIXME: We can do better than this by using selccz to choose between +0 and +2
+; 64-CMP-DAG: seleqz $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @sltiu2(i32 %a) {
entry:
@@ -188,8 +574,41 @@ entry:
ret i32 %cond
}
-; O32-LABEL: sltiu3:
-; O32: sltu ${{[0-9]+}}
+; ALL-LABEL: sltiu3:
+
+; 32-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 32-CMOV-DAG: lui $[[R1:[0-9]+]], 65535
+; 32-CMOV-DAG: ori $[[R1]], $[[R1]], 32766
+; 32-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4
+; 32-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]]
+
+; 32-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 32-CMP-DAG: addiu $[[I5:[0-9]+]], $zero, 5
+; 32-CMP-DAG: lui $[[IMM:[0-9]+]], 65535
+; 32-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766
+; 32-CMP-DAG: sltu $[[R0:[0-9]+]], $[[IMM]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 32-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 32-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 32-CMP-DAG: or $2, $[[T0]], $[[T1]]
+
+; 64-CMOV-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMOV-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMOV-DAG: lui $[[R1:[0-9]+]], 65535
+; 64-CMOV-DAG: ori $[[R1]], $[[R1]], 32766
+; 64-CMOV-DAG: sltu $[[R0:[0-9]+]], $[[R1]], $4
+; 64-CMOV-DAG: movn $[[I5]], $[[I3]], $[[R0]]
+
+; 64-CMP-DAG: addiu $[[I3:[0-9]+]], $zero, 3
+; 64-CMP-DAG: addiu $[[I5:2]], $zero, 5
+; 64-CMP-DAG: lui $[[IMM:[0-9]+]], 65535
+; 64-CMP-DAG: ori $[[IMM]], $[[IMM]], 32766
+; 64-CMP-DAG: sltu $[[R0:[0-9]+]], $[[IMM]], $4
+; FIXME: We can do better than this by using selccz to choose between -0 and -2
+; 64-CMP-DAG: selnez $[[T0:[0-9]+]], $[[I3]], $[[R0]]
+; 64-CMP-DAG: seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
+; 64-CMP-DAG: or $2, $[[T0]], $[[T1]]
define i32 @sltiu3(i32 %a) {
entry:
@@ -210,11 +629,25 @@ define i32 @slti4(i32 %a) nounwind readnone {
ret i32 %2
}
-; O32-LABEL: slti4:
-; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; O32-NOT: movn
-; O32:.size slti4
+; ALL-LABEL: slti4:
+
+; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMOV-DAG: addiu $2, [[R1]], 3
+; 32-CMOV-NOT: movn
+
+; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMP-DAG: addiu $2, [[R1]], 3
+; 32-CMP-NOT: seleqz
+; 32-CMP-NOT: selnez
+
+; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMOV-DAG: addiu $2, [[R1]], 3
+; 64-CMOV-NOT: movn
+
+; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMP-DAG: addiu $2, [[R1]], 3
+; 64-CMP-NOT: seleqz
+; 64-CMP-NOT: selnez
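+
+; When the two select values are consecutive constants (here 4 and 3), the
+; select folds into arithmetic on the comparison result and needs no
+; conditional move on any revision. Roughly:
+;
+;   slti  $1, $4, 7    # $1 = (%a < 7) ? 1 : 0
+;   addiu $2, $1, 3    # $2 = (%a < 7) ? 4 : 3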
define i32 @slti5(i32 %a) nounwind readnone {
%1 = icmp slt i32 %a, 7
@@ -222,11 +655,25 @@ define i32 @slti5(i32 %a) nounwind readnone {
ret i32 %2
}
-; O32-LABEL: slti5:
-; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; O32-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
-; O32-NOT: movn
-; O32:.size slti5
+; ALL-LABEL: slti5:
+
+; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMOV-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
+; 32-CMOV-NOT: movn
+
+; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMP-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
+; 32-CMP-NOT: seleqz
+; 32-CMP-NOT: selnez
+
+; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMOV-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
+; 64-CMOV-NOT: movn
+
+; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMP-DAG: addiu [[R3:\$[0-9]+]], [[R2:\$[a-z0-9]+]], -4
+; 64-CMP-NOT: seleqz
+; 64-CMP-NOT: selnez
define i32 @slti6(i32 %a) nounwind readnone {
%1 = icmp slt i32 %a, 7
@@ -234,9 +681,26 @@ define i32 @slti6(i32 %a) nounwind readnone {
ret i32 %2
}
-; O32-LABEL: slti6:
-; O32-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; O32-DAG: xori [[R1]], [[R1]], 1
-; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; O32-NOT: movn
-; O32:.size slti6
+; ALL-LABEL: slti6:
+
+; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMOV-DAG: xori [[R1]], [[R1]], 1
+; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; 32-CMOV-NOT: movn
+
+; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 32-CMP-DAG: xori [[R1]], [[R1]], 1
+; 32-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; 32-CMP-NOT: seleqz
+; 32-CMP-NOT: selnez
+
+; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMOV-DAG: xori [[R1]], [[R1]], 1
+; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; 64-CMOV-NOT: movn
+
+; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
+; 64-CMP-DAG: xori [[R1]], [[R1]], 1
+; 64-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; 64-CMP-NOT: seleqz
+; 64-CMP-NOT: selnez
diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll
new file mode 100644
index 0000000..6e63cff
--- /dev/null
+++ b/test/CodeGen/Mips/countleading.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R1-R2 -check-prefix=MIPS32-GT-R1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS32-R6 -check-prefix=MIPS32-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS64-GT-R1 %s
+
+; Prefixes:
+; ALL - All targets
+; MIPS4 - MIPS IV (has no clz/clo/dclz/dclo instructions)
+; MIPS32-R1-R2 - MIPS32r1 and MIPS32r2 only
+; MIPS32-R6 - MIPS32r6 only
+; MIPS32-GT-R1 - MIPS32r1 and above (does not include MIPS64's)
+; MIPS64-GT-R1 - MIPS64r1 and above
+
+define i32 @ctlz_i32(i32 %X) nounwind readnone {
+entry:
+; ALL-LABEL: ctlz_i32:
+
+; MIPS4-NOT: clz
+
+; MIPS32-GT-R1: clz $2, $4
+
+; MIPS64-GT-R1: clz $2, $4
+
+ %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %X, i1 true)
+ ret i32 %tmp1
+}
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+
+define i32 @ctlo_i32(i32 %X) nounwind readnone {
+entry:
+; ALL-LABEL: ctlo_i32:
+
+; MIPS4-NOT: clo
+
+; MIPS32-GT-R1: clo $2, $4
+
+; MIPS64-GT-R1: clo $2, $4
+
+ %neg = xor i32 %X, -1
+ %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %neg, i1 true)
+ ret i32 %tmp1
+}
+
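+; MIPS32 has no dclz, so an i64 ctlz is expanded from two 32-bit clz
+; operations: the result is clz(hi) when the high word is non-zero, and
+; 32 + clz(lo) otherwise. A sketch of the pre-R6 expansion the checks below
+; look for (illustrative registers; on mipsel/o32 the i64 argument arrives
+; as $4 = low word, $5 = high word):
+;
+;   clz   $1, $4         # clz of the low word
+;   clz   $2, $5         # clz of the high word
+;   addiu $1, $1, 32     # 32 + clz(lo), the answer if the high word is zero
+;   movn  $1, $2, $5     # replace with clz(hi) when the high word != 0
+;   addiu $3, $zero, 0   # the upper half of the i64 result is always zero
+;
+; (R6 replaces the movn with the seleqz/selnez/or idiom.)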
+define i64 @ctlz_i64(i64 %X) nounwind readnone {
+entry:
+; ALL-LABEL: ctlz_i64:
+
+; MIPS4-NOT: dclz
+
+; MIPS32-GT-R1-DAG: clz $[[R0:[0-9]+]], $4
+; MIPS32-GT-R1-DAG: clz $[[R1:[0-9]+]], $5
+; MIPS32-GT-R1-DAG: addiu $[[R2:2]], $[[R0]], 32
+; MIPS32-R1-R2-DAG: movn $[[R2]], $[[R1]], $5
+; MIPS32-R6-DAG: seleqz $[[R5:[0-9]+]], $[[R2]], $5
+; MIPS32-R6-DAG: selnez $[[R6:[0-9]+]], $[[R1]], $5
+; MIPS32-R6-DAG: or $2, $[[R6]], $[[R5]]
+; MIPS32-GT-R1-DAG: addiu $3, $zero, 0
+
+; MIPS64-GT-R1: dclz $2, $4
+
+ %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
+ ret i64 %tmp1
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+
+define i64 @ctlo_i64(i64 %X) nounwind readnone {
+entry:
+; ALL-LABEL: ctlo_i64:
+
+; MIPS4-NOT: dclo
+
+; MIPS32-GT-R1-DAG: clo $[[R0:[0-9]+]], $4
+; MIPS32-GT-R1-DAG: clo $[[R1:[0-9]+]], $5
+; MIPS32-GT-R1-DAG: addiu $[[R2:2]], $[[R0]], 32
+; MIPS32-GT-R1-DAG: addiu $[[R3:[0-9]+]], $zero, -1
+; MIPS32-GT-R1-DAG: xor $[[R4:[0-9]+]], $5, $[[R3]]
+; MIPS32-R1-R2-DAG: movn $[[R2]], $[[R1]], $[[R4]]
+; MIPS32-R6-DAG: selnez $[[R5:[0-9]+]], $[[R1]], $[[R4]]
+; MIPS32-R6-DAG: seleqz $[[R6:[0-9]+]], $[[R2]], $[[R4]]
+; MIPS32-R6-DAG: or $2, $[[R5]], $[[R6]]
+; MIPS32-GT-R1-DAG: addiu $3, $zero, 0
+
+; MIPS64-GT-R1: dclo $2, $4
+
+ %neg = xor i64 %X, -1
+ %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
+ ret i64 %tmp1
+}
diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index b631c3b..97f8360 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll
@@ -1,77 +1,223 @@
-; RUN: llc -march=mips -verify-machineinstrs < %s |\
-; RUN: FileCheck %s -check-prefix=TRAP
-; RUN: llc -march=mips -mno-check-zero-division < %s |\
-; RUN: FileCheck %s -check-prefix=NOCHECK
+; RUN: llc -march=mips -mcpu=mips32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP
+; RUN: llc -march=mips -mcpu=mips32r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=ACC32-TRAP
+; RUN: llc -march=mips -mcpu=mips32r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=GPR32-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=ACC64-TRAP
+; RUN: llc -march=mips64 -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=GPR64-TRAP
-; TRAP-LABEL: sdiv1:
-; TRAP: div $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; TRAP: teq $[[R0]], $zero, 7
-; TRAP: mflo
+; RUN: llc -march=mips -mcpu=mips32 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK
+; RUN: llc -march=mips -mcpu=mips32r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC32 -check-prefix=NOCHECK
+; RUN: llc -march=mips -mcpu=mips32r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64r2 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC64 -check-prefix=NOCHECK
+; RUN: llc -march=mips64 -mcpu=mips64r6 -mno-check-zero-division < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=NOCHECK
-; NOCHECK-LABEL: sdiv1:
-; NOCHECK-NOT: teq
-; NOCHECK: .end sdiv1
+; FileCheck Prefixes:
+; ALL - All targets
+; ACC32 - Accumulator based multiply/divide on 32-bit targets
+; ACC64 - Same as ACC32 but only for 64-bit targets
+; GPR32 - GPR based multiply/divide on 32-bit targets
+; GPR64 - Same as GPR32 but only for 64-bit targets
+; ACC32-TRAP - Same as ACC32 but with the divide-by-zero trap (teq) enabled
+; ACC64-TRAP - Same as ACC64 but with the divide-by-zero trap (teq) enabled
+; GPR32-TRAP - Same as GPR32 but with the divide-by-zero trap (teq) enabled
+; GPR64-TRAP - Same as GPR64 but with the divide-by-zero trap (teq) enabled
+; NOCHECK - Division by zero will not be detected
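+
+; As a rough sketch, a checked i32 signed division on an ACC target expands
+; to the following (operands in $4 and $5 under o32):
+;
+;   div  $zero, $4, $5   # quotient -> LO, remainder -> HI
+;   teq  $5, $zero, 7    # trap with code 7 if the divisor is zero
+;   mflo $2              # quotient into the return register
+;
+; R6 (GPR) targets instead use the three-operand div/mod/divu/modu
+; instructions and read the result directly from a GPR.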
@g0 = common global i32 0, align 4
@g1 = common global i32 0, align 4
define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone {
entry:
+; ALL-LABEL: sdiv1:
+
+; ACC32: div $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+
+; ACC64: div $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR32: div $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+
+; GPR64: div $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC32: mflo $2
+; ACC64: mflo $2
+
+; ALL: .end sdiv1
+
%div = sdiv i32 %a0, %a1
ret i32 %div
}
-; TRAP-LABEL: srem1:
-; TRAP: div $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; TRAP: teq $[[R0]], $zero, 7
-; TRAP: mfhi
-
define i32 @srem1(i32 %a0, i32 %a1) nounwind readnone {
entry:
+; ALL-LABEL: srem1:
+
+; ACC32: div $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+
+; ACC64: div $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR32: mod $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+
+; GPR64: mod $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC32: mfhi $2
+; ACC64: mfhi $2
+
+; ALL: .end srem1
+
%rem = srem i32 %a0, %a1
ret i32 %rem
}
-; TRAP-LABEL: udiv1:
-; TRAP: divu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; TRAP: teq $[[R0]], $zero, 7
-; TRAP: mflo
-
define i32 @udiv1(i32 %a0, i32 %a1) nounwind readnone {
entry:
+; ALL-LABEL: udiv1:
+
+; ACC32: divu $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+
+; ACC64: divu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR32: divu $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+
+; GPR64: divu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC32: mflo $2
+; ACC64: mflo $2
+
+; ALL: .end udiv1
%div = udiv i32 %a0, %a1
ret i32 %div
}
-; TRAP-LABEL: urem1:
-; TRAP: divu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; TRAP: teq $[[R0]], $zero, 7
-; TRAP: mfhi
-
define i32 @urem1(i32 %a0, i32 %a1) nounwind readnone {
entry:
+; ALL-LABEL: urem1:
+
+; ACC32: divu $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+
+; ACC64: divu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR32: modu $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+
+; GPR64: modu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC32: mfhi $2
+; ACC64: mfhi $2
+
+; ALL: .end urem1
+
%rem = urem i32 %a0, %a1
ret i32 %rem
}
-; TRAP: div $zero,
define i32 @sdivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
entry:
+; ALL-LABEL: sdivrem1:
+
+; ACC32: div $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC32: mflo $2
+; ACC32: mfhi $[[R0:[0-9]+]]
+; ACC32: sw $[[R0]], 0(${{[0-9]+}})
+
+; ACC64: div $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC64: mflo $2
+; ACC64: mfhi $[[R0:[0-9]+]]
+; ACC64: sw $[[R0]], 0(${{[0-9]+}})
+
+; GPR32: mod $[[R0:[0-9]+]], $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR32: sw $[[R0]], 0(${{[0-9]+}})
+; GPR32-DAG: div $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+
+; GPR64: mod $[[R0:[0-9]+]], $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR64: sw $[[R0]], 0(${{[0-9]+}})
+; GPR64-DAG: div $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+
+; ALL: .end sdivrem1
+
%rem = srem i32 %a0, %a1
store i32 %rem, i32* %r, align 4
%div = sdiv i32 %a0, %a1
ret i32 %div
}
-; TRAP: divu $zero,
define i32 @udivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
entry:
+; ALL-LABEL: udivrem1:
+
+; ACC32: divu $zero, $4, $5
+; ACC32-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC32: mflo $2
+; ACC32: mfhi $[[R0:[0-9]+]]
+; ACC32: sw $[[R0]], 0(${{[0-9]+}})
+
+; ACC64: divu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC64: mflo $2
+; ACC64: mfhi $[[R0:[0-9]+]]
+; ACC64: sw $[[R0]], 0(${{[0-9]+}})
+
+; GPR32: modu $[[R0:[0-9]+]], $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR32: sw $[[R0]], 0(${{[0-9]+}})
+; GPR32-DAG: divu $2, $4, $5
+; GPR32-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+
+; GPR64: modu $[[R0:[0-9]+]], $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR64: sw $[[R0]], 0(${{[0-9]+}})
+; GPR64-DAG: divu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+
+; ALL: .end udivrem1
+
%rem = urem i32 %a0, %a1
store i32 %rem, i32* %r, align 4
%div = udiv i32 %a0, %a1
ret i32 %div
}
+; FIXME: It's not clear what this is supposed to test.
define i32 @killFlags() {
entry:
%0 = load i32* @g0, align 4
@@ -79,3 +225,164 @@ entry:
%div = sdiv i32 %0, %1
ret i32 %div
}
+
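+; 32-bit targets have no 64-bit divide instruction, so ACC32 lowers i64
+; division and remainder to calls to the runtime helpers __divdi3, __moddi3,
+; __udivdi3 and __umoddi3, loaded through $25 as PIC requires, roughly:
+;
+;   lw   $25, %call16(__divdi3)($gp)
+;   jalr $25
+;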
+define i64 @sdiv2(i64 %a0, i64 %a1) nounwind readnone {
+entry:
+; ALL-LABEL: sdiv2:
+
+; ACC32: lw $25, %call16(__divdi3)(
+; ACC32: jalr $25
+
+; ACC64: ddiv $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR64: ddiv $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC64: mflo $2
+
+; ALL: .end sdiv2
+
+ %div = sdiv i64 %a0, %a1
+ ret i64 %div
+}
+
+define i64 @srem2(i64 %a0, i64 %a1) nounwind readnone {
+entry:
+; ALL-LABEL: srem2:
+
+; ACC32: lw $25, %call16(__moddi3)(
+; ACC32: jalr $25
+
+; ACC64: ddiv $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR64: dmod $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC64: mfhi $2
+
+; ALL: .end srem2
+
+ %rem = srem i64 %a0, %a1
+ ret i64 %rem
+}
+
+define i64 @udiv2(i64 %a0, i64 %a1) nounwind readnone {
+entry:
+; ALL-LABEL: udiv2:
+
+; ACC32: lw $25, %call16(__udivdi3)(
+; ACC32: jalr $25
+
+; ACC64: ddivu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR64: ddivu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC64: mflo $2
+
+; ALL: .end udiv2
+ %div = udiv i64 %a0, %a1
+ ret i64 %div
+}
+
+define i64 @urem2(i64 %a0, i64 %a1) nounwind readnone {
+entry:
+; ALL-LABEL: urem2:
+
+; ACC32: lw $25, %call16(__umoddi3)(
+; ACC32: jalr $25
+
+; ACC64: ddivu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+
+; GPR64: dmodu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+
+; NOCHECK-NOT: teq
+
+; ACC64: mfhi $2
+
+; ALL: .end urem2
+
+ %rem = urem i64 %a0, %a1
+ ret i64 %rem
+}
+
+define i64 @sdivrem2(i64 %a0, i64 %a1, i64* nocapture %r) nounwind {
+entry:
+; ALL-LABEL: sdivrem2:
+
+; sdivrem2 is too complex to effectively check. We can at least check for the
+; calls though.
+; ACC32: lw $25, %call16(__moddi3)(
+; ACC32: jalr $25
+; ACC32: lw $25, %call16(__divdi3)(
+; ACC32: jalr $25
+
+; ACC64: ddiv $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC64: mflo $2
+; ACC64: mfhi $[[R0:[0-9]+]]
+; ACC64: sd $[[R0]], 0(${{[0-9]+}})
+
+; GPR64: dmod $[[R0:[0-9]+]], $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR64: sd $[[R0]], 0(${{[0-9]+}})
+
+; GPR64-DAG: ddiv $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+
+; ALL: .end sdivrem2
+
+ %rem = srem i64 %a0, %a1
+ store i64 %rem, i64* %r, align 8
+ %div = sdiv i64 %a0, %a1
+ ret i64 %div
+}
+
+define i64 @udivrem2(i64 %a0, i64 %a1, i64* nocapture %r) nounwind {
+entry:
+; ALL-LABEL: udivrem2:
+
+; udivrem2 is too complex to effectively check. We can at least check for the
+; calls though.
+; ACC32: lw $25, %call16(__umoddi3)(
+; ACC32: jalr $25
+; ACC32: lw $25, %call16(__udivdi3)(
+; ACC32: jalr $25
+
+; ACC64: ddivu $zero, $4, $5
+; ACC64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; ACC64: mflo $2
+; ACC64: mfhi $[[R0:[0-9]+]]
+; ACC64: sd $[[R0]], 0(${{[0-9]+}})
+
+; GPR64: dmodu $[[R0:[0-9]+]], $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+; GPR64: sd $[[R0]], 0(${{[0-9]+}})
+
+; GPR64-DAG: ddivu $2, $4, $5
+; GPR64-TRAP: teq $5, $zero, 7
+; NOCHECK-NOT: teq
+
+; ALL: .end udivrem2
+
+ %rem = urem i64 %a0, %a1
+ store i64 %rem, i64* %r, align 8
+ %div = udiv i64 %a0, %a1
+ ret i64 %div
+}
diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll
index acdd17d..fbd9703 100644
--- a/test/CodeGen/Mips/dsp-r1.ll
+++ b/test/CodeGen/Mips/dsp-r1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+dsp < %s | FileCheck %s
define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind {
entry:
diff --git a/test/CodeGen/Mips/eh-return32.ll b/test/CodeGen/Mips/eh-return32.ll
index c3003b3..748050c 100644
--- a/test/CodeGen/Mips/eh-return32.ll
+++ b/test/CodeGen/Mips/eh-return32.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
+; RUN: llc -march=mipsel -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
+; RUN: llc -march=mipsel -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=R6
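+
+; Note: MIPS32r6 removed the JR instruction; "jr $ra" now assembles as an
+; alias of "jalr $zero, $ra". The textual assembly is identical in both
+; cases, so the tests pass -asm-show-inst and match on the underlying MCInst
+; opcode (JR vs JALR) instead.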
declare void @llvm.eh.return.i32(i32, i8*)
declare void @foo(...)
@@ -9,7 +11,7 @@ entry:
call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
unreachable
-; CHECK: f1
+; CHECK: f1:
; CHECK: addiu $sp, $sp, -[[spoffset:[0-9]+]]
; check that $a0-$a3 are saved on stack.
@@ -41,7 +43,8 @@ entry:
; CHECK: addiu $sp, $sp, [[spoffset]]
; CHECK: move $25, $2
; CHECK: move $ra, $2
-; CHECK: jr $ra
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
; CHECK: addu $sp, $sp, $3
}
@@ -50,7 +53,7 @@ entry:
call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
unreachable
-; CHECK: f2
+; CHECK: f2:
; CHECK: addiu $sp, $sp, -[[spoffset:[0-9]+]]
; check that $a0-$a3 are saved on stack.
@@ -80,6 +83,7 @@ entry:
; CHECK: addiu $sp, $sp, [[spoffset]]
; CHECK: move $25, $2
; CHECK: move $ra, $2
-; CHECK: jr $ra
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
; CHECK: addu $sp, $sp, $3
}
diff --git a/test/CodeGen/Mips/eh-return64.ll b/test/CodeGen/Mips/eh-return64.ll
index 8c5af50..74a4323 100644
--- a/test/CodeGen/Mips/eh-return64.ll
+++ b/test/CodeGen/Mips/eh-return64.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=NOT-R6
+; RUN: llc -march=mips64el -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=CHECK -check-prefix=R6
declare void @llvm.eh.return.i64(i64, i8*)
declare void @foo(...)
@@ -10,7 +12,7 @@ entry:
call void @llvm.eh.return.i64(i64 %offset, i8* %handler)
unreachable
-; CHECK: f1
+; CHECK: f1:
; CHECK: daddiu $sp, $sp, -[[spoffset:[0-9]+]]
; check that $a0-$a3 are saved on stack.
@@ -42,9 +44,9 @@ entry:
; CHECK: daddiu $sp, $sp, [[spoffset]]
; CHECK: move $25, $2
; CHECK: move $ra, $2
-; CHECK: jr $ra
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
; CHECK: daddu $sp, $sp, $3
-
}
define void @f2(i64 %offset, i8* %handler) {
@@ -52,7 +54,7 @@ entry:
call void @llvm.eh.return.i64(i64 %offset, i8* %handler)
unreachable
-; CHECK: f2
+; CHECK: f2:
; CHECK: .cfi_startproc
; CHECK: daddiu $sp, $sp, -[[spoffset:[0-9]+]]
; CHECK: .cfi_def_cfa_offset [[spoffset]]
@@ -84,7 +86,8 @@ entry:
; CHECK: daddiu $sp, $sp, [[spoffset]]
; CHECK: move $25, $2
; CHECK: move $ra, $2
-; CHECK: jr $ra
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
; CHECK: daddu $sp, $sp, $3
; CHECK: .cfi_endproc
}
diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
new file mode 100644
index 0000000..e78497a
--- /dev/null
+++ b/test/CodeGen/Mips/ehframe-indirect.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck %s
+
+define i32 @main() {
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 128, DW.ref.__gxx_personality_v0
+
+entry:
+ invoke void @foo() to label %cont unwind label %lpad
+; CHECK: foo
+; CHECK: jalr
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8*
+ bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* null
+ ret i32 0
+
+cont:
+ ret i32 0
+}
+; CHECK: .cfi_endproc
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @foo()
+
+; CHECK: .hidden DW.ref.__gxx_personality_v0
+; CHECK: .weak DW.ref.__gxx_personality_v0
+; CHECK: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat
+; CHECK: .align 2
+; CHECK: .type DW.ref.__gxx_personality_v0,@object
+; CHECK: .size DW.ref.__gxx_personality_v0, 4
+; CHECK: DW.ref.__gxx_personality_v0:
+; CHECK: .4byte __gxx_personality_v0
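+
+; Note: personality encoding 128 is DW_EH_PE_indirect | DW_EH_PE_absptr: the
+; CFI points at the DW.ref.__gxx_personality_v0 slot checked above, which
+; holds the personality routine's address, rather than at the routine itself.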
diff --git a/test/CodeGen/Mips/fcmp.ll b/test/CodeGen/Mips/fcmp.ll
new file mode 100644
index 0000000..b775983
--- /dev/null
+++ b/test/CodeGen/Mips/fcmp.ll
@@ -0,0 +1,783 @@
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32-C
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32-C
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMP
+; RUN: llc < %s -march=mips64el -mcpu=mips4 | FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64-C
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMP
+
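+; Pre-R6 targets (the *-C prefixes) compare with c.<cond>.s/c.<cond>.d, which
+; set the $fcc0 flag, and then select with a movt/movf conditional move. R6
+; targets (the *-CMP prefixes) use cmp.<cond>.s/cmp.<cond>.d, which write an
+; all-ones/all-zeroes mask to an FPR; the mask is moved to a GPR with mfc1
+; and reduced to 0/1 with andi. A sketch of the two idioms for fcmp oeq:
+;
+;   c.eq.s   $f12, $f14        # pre-R6: set $fcc0
+;   movt     $2, $1, $fcc0     # pre-R6: select on $fcc0
+;
+;   cmp.eq.s $f0, $f12, $f14   # R6: mask into an FPR
+;   mfc1     $1, $f0           # R6: move the mask to a GPR
+;   andi     $2, $1, 1         # R6: reduce to 0 or 1
+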
+define i32 @false_f32(float %a, float %b) nounwind {
+; ALL-LABEL: false_f32:
+; ALL: addiu $2, $zero, 0
+
+ %1 = fcmp false float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @oeq_f32(float %a, float %b) nounwind {
+; ALL-LABEL: oeq_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.eq.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.eq.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp oeq float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ogt_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ogt_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ule.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ule.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ogt float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @oge_f32(float %a, float %b) nounwind {
+; ALL-LABEL: oge_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ult.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ult.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.le.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.le.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp oge float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @olt_f32(float %a, float %b) nounwind {
+; ALL-LABEL: olt_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.olt.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.olt.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.lt.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp olt float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ole_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ole_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ole.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ole.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.le.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.le.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ole float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @one_f32(float %a, float %b) nounwind {
+; ALL-LABEL: one_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ueq.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ueq.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp one float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ord_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ord_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.un.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.un.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp ord float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ueq_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ueq_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ueq.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ueq.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ueq.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ueq float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ugt_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ugt_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ole.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ole.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ugt float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @uge_f32(float %a, float %b) nounwind {
+; ALL-LABEL: uge_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.olt.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.olt.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp uge float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ult_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ult_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ult.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ult.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ult.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ult float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ule_f32(float %a, float %b) nounwind {
+; ALL-LABEL: ule_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ule.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ule.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ule.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ule float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @une_f32(float %a, float %b) nounwind {
+; ALL-LABEL: une_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.eq.s $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.eq.s $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.eq.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp une float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @uno_f32(float %a, float %b) nounwind {
+; ALL-LABEL: uno_f32:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.un.s $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.un.s $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.un.s $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp uno float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @true_f32(float %a, float %b) nounwind {
+; ALL-LABEL: true_f32:
+; ALL: addiu $2, $zero, 1
+
+ %1 = fcmp true float %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @false_f64(double %a, double %b) nounwind {
+; ALL-LABEL: false_f64:
+; ALL: addiu $2, $zero, 0
+
+ %1 = fcmp false double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @oeq_f64(double %a, double %b) nounwind {
+; ALL-LABEL: oeq_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.eq.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.eq.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp oeq double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ogt_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ogt_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ule.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ule.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ogt double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @oge_f64(double %a, double %b) nounwind {
+; ALL-LABEL: oge_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ult.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ult.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.le.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.le.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp oge double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @olt_f64(double %a, double %b) nounwind {
+; ALL-LABEL: olt_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.olt.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.olt.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.lt.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp olt double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ole_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ole_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ole.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ole.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.le.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.le.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ole double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @one_f64(double %a, double %b) nounwind {
+; ALL-LABEL: one_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ueq.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ueq.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp one double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ord_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ord_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.un.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.un.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp ord double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ueq_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ueq_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ueq.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ueq.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ueq.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ueq double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ugt_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ugt_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ole.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ole.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ugt double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @uge_f64(double %a, double %b) nounwind {
+; ALL-LABEL: uge_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.olt.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.olt.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f14, $f12
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f13, $f12
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp uge double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ult_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ult_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ult.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ult.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ult.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ult double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @ule_f64(double %a, double %b) nounwind {
+; ALL-LABEL: ule_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.ule.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.ule.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.ule.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp ule double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @une_f64(double %a, double %b) nounwind {
+; ALL-LABEL: une_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.eq.d $f12, $f14
+; 32-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.eq.d $f12, $f13
+; 64-C-DAG: movf $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 32-CMP-DAG: andi $2, $[[T2]], 1
+
+; 64-CMP-DAG: cmp.eq.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: not $[[T2:[0-9]+]], $[[T1]]
+; 64-CMP-DAG: andi $2, $[[T2]], 1
+
+ %1 = fcmp une double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @uno_f64(double %a, double %b) nounwind {
+; ALL-LABEL: uno_f64:
+
+; 32-C-DAG: addiu $[[T0:2]], $zero, 0
+; 32-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 32-C-DAG: c.un.d $f12, $f14
+; 32-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 64-C-DAG: addiu $[[T0:2]], $zero, 0
+; 64-C-DAG: addiu $[[T1:[0-9]+]], $zero, 1
+; 64-C-DAG: c.un.d $f12, $f13
+; 64-C-DAG: movt $[[T0]], $1, $fcc0
+
+; 32-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f14
+; 32-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 32-CMP-DAG: andi $2, $[[T1]], 1
+
+; 64-CMP-DAG: cmp.un.d $[[T0:f[0-9]+]], $f12, $f13
+; 64-CMP-DAG: mfc1 $[[T1:[0-9]+]], $[[T0]]
+; 64-CMP-DAG: andi $2, $[[T1]], 1
+
+ %1 = fcmp uno double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
+
+define i32 @true_f64(double %a, double %b) nounwind {
+; ALL-LABEL: true_f64:
+; ALL: addiu $2, $zero, 1
+
+ %1 = fcmp true double %a, %b
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+}
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
index 44c4117..3a9d9c7 100644
--- a/test/CodeGen/Mips/fcopysign.ll
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -17,7 +17,7 @@ entry:
; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1
; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1
-; 32R2: mtc1 $[[INS]], $f1
+; 32R2: mthc1 $[[INS]], $f0
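+; Note: mthc1 writes the upper 32 bits of the named 64-bit FP register, so on
+; MIPS32r2 the sign-inserted high word of the double goes into $f0 directly;
+; unlike the old "mtc1 ..., $f1" form this is correct for both the FR=0 and
+; FR=1 FPU register models.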
; 64: daddiu $[[T0:[0-9]+]], $zero, 1
; 64: dsll $[[MSK1:[0-9]+]], $[[T0]], 63
diff --git a/test/CodeGen/Mips/fmadd1.ll b/test/CodeGen/Mips/fmadd1.ll
index a9a8e21..271631e 100644
--- a/test/CodeGen/Mips/fmadd1.ll
+++ b/test/CodeGen/Mips/fmadd1.ll
@@ -5,15 +5,54 @@
; IEEE 754 (1985) and IEEE 754 (2008). These instructions are therefore only
; available when -enable-no-nans-fp-math is given.
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=32R2 -check-prefix=CHECK
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=64R2 -check-prefix=CHECK
-; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2NAN -check-prefix=CHECK
-; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=64R2NAN -check-prefix=CHECK
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NONAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 -enable-no-nans-fp-math | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NONAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32 -check-prefix=32-NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32R2 -check-prefix=32R2-NAN
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6 -check-prefix=32R6-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64 -check-prefix=64-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R2 -check-prefix=64R2-NAN
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 -mattr=n64 | FileCheck %s -check-prefix=ALL -check-prefix=64R6 -check-prefix=64R6-NAN
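+
+; madd.s/msub.s (and the negated nmadd.s/nmsub.s) fuse a multiply with an
+; add/subtract: madd.s $fd, $fr, $fs, $ft computes $fd = $fs * $ft + $fr.
+; They do not exist on R6, so the 32R6/64R6 checks expect a separate
+; mul.s/add.s pair, and the negated forms are only selected under
+; -enable-no-nans-fp-math, as the comment above explains.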
define float @FOO0float(float %a, float %b, float %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO0float:
-; CHECK: madd.s
+; ALL-LABEL: FOO0float:
+
+; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 32R2: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2: madd.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2: add.s $f0, $[[T1]], $[[T2]]
+
+; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 64R2: madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2: add.s $f0, $[[T0]], $[[T1]]
+
+; 64R6-DAG: mul.s $[[T0:f[0-9]+]], $f12, $f13
+; 64R6-DAG: add.s $[[T1:f[0-9]+]], $[[T0]], $f14
+; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: add.s $f0, $[[T1]], $[[T2]]
+
%mul = fmul float %a, %b
%add = fadd float %mul, %c
%add1 = fadd float %add, 0.000000e+00
@@ -22,8 +61,39 @@ entry:
define float @FOO1float(float %a, float %b, float %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO1float:
-; CHECK: msub.s
+; ALL-LABEL: FOO1float:
+
+; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 32R2: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2: msub.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2: add.s $f0, $[[T1]], $[[T2]]
+
+; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: add.s $f0, $[[T1]], $[[T2]]
+
+; 64R2: msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2: add.s $f0, $[[T0]], $[[T1]]
+
+; 64R6-DAG: mul.s $[[T0:f[0-9]+]], $f12, $f13
+; 64R6-DAG: sub.s $[[T1:f[0-9]+]], $[[T0]], $f14
+; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: add.s $f0, $[[T1]], $[[T2]]
+
%mul = fmul float %a, %b
%sub = fsub float %mul, %c
%add = fadd float %sub, 0.000000e+00
@@ -32,11 +102,44 @@ entry:
define float @FOO2float(float %a, float %b, float %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO2float:
-; 32R2: nmadd.s
-; 64R2: nmadd.s
-; 32R2NAN: madd.s
-; 64R2NAN: madd.s
+; ALL-LABEL: FOO2float:
+
+; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
+; 32R2-NONAN: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2-NONAN: nmadd.s $f0, $[[T0]], $f12, $f14
+
+; 32R2-NAN: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2-NAN: madd.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2-NAN: sub.s $f0, $[[T2]], $[[T1]]
+
+; 32R6-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
+; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
+; 64R2-NONAN: nmadd.s $f0, $f14, $f12, $f13
+
+; 64R2-NAN: madd.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2-NAN: sub.s $f0, $[[T1]], $[[T0]]
+
+; 64R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: add.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
%mul = fmul float %a, %b
%add = fadd float %mul, %c
%sub = fsub float 0.000000e+00, %add
@@ -45,11 +148,36 @@ entry:
define float @FOO3float(float %a, float %b, float %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO3float:
-; 32R2: nmsub.s
-; 64R2: nmsub.s
-; 32R2NAN: msub.s
-; 64R2NAN: msub.s
+; ALL-LABEL: FOO3float:
+
+; 32-DAG: mtc1 $6, $[[T0:f[0-9]+]]
+; 32-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
+; 32R2-NONAN: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2-NONAN: nmsub.s $f0, $[[T0]], $f12, $f14
+
+; 32R2-NAN: mtc1 $6, $[[T0:f[0-9]+]]
+; 32R2-NAN: msub.s $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2-NAN: sub.s $f0, $[[T2]], $[[T1]]
+
+; 64-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
+; 64R2-NAN: msub.s $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2-NAN: sub.s $f0, $[[T1]], $[[T0]]
+
+; 64R6-DAG: mul.s $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: sub.s $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: sub.s $f0, $[[T2]], $[[T1]]
+
%mul = fmul float %a, %b
%sub = fsub float %mul, %c
%sub1 = fsub float 0.000000e+00, %sub
@@ -58,8 +186,40 @@ entry:
define double @FOO10double(double %a, double %b, double %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO10double:
-; CHECK: madd.d
+; ALL-LABEL: FOO10double:
+
+; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 32R2: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2: madd.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2: mthc1 $zero, $[[T2]]
+; 32R2: add.d $f0, $[[T1]], $[[T2]]
+
+; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 64R2: madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2: add.d $f0, $[[T0]], $[[T1]]
+
+; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: add.d $f0, $[[T1]], $[[T2]]
+
%mul = fmul double %a, %b
%add = fadd double %mul, %c
%add1 = fadd double %add, 0.000000e+00
@@ -68,8 +228,40 @@ entry:
define double @FOO11double(double %a, double %b, double %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO11double:
-; CHECK: msub.d
+; ALL-LABEL: FOO11double:
+
+; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 32R2: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2: msub.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2: mthc1 $zero, $[[T2]]
+; 32R2: add.d $f0, $[[T1]], $[[T2]]
+
+; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: add.d $f0, $[[T1]], $[[T2]]
+
+; 64R2: msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2: add.d $f0, $[[T0]], $[[T1]]
+
+; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: add.d $f0, $[[T1]], $[[T2]]
+
%mul = fmul double %a, %b
%sub = fsub double %mul, %c
%add = fadd double %sub, 0.000000e+00
@@ -78,11 +270,45 @@ entry:
define double @FOO12double(double %a, double %b, double %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO12double:
-; 32R2: nmadd.d
-; 64R2: nmadd.d
-; 32R2NAN: madd.d
-; 64R2NAN: madd.d
+; ALL-LABEL: FOO12double:
+
+; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 32R2-NONAN: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2-NONAN: nmadd.d $f0, $[[T0]], $f12, $f14
+
+; 32R2-NAN: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2-NAN: madd.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2-NAN: mthc1 $zero, $[[T2]]
+; 32R2-NAN: sub.d $f0, $[[T2]], $[[T1]]
+
+; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 64R2-NONAN: nmadd.d $f0, $f14, $f12, $f13
+
+; 64R2-NAN: madd.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2-NAN: sub.d $f0, $[[T1]], $[[T0]]
+
+; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: add.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
%mul = fmul double %a, %b
%add = fadd double %mul, %c
%sub = fsub double 0.000000e+00, %add
@@ -91,11 +317,45 @@ entry:
define double @FOO13double(double %a, double %b, double %c) nounwind readnone {
entry:
-; CHECK-LABEL: FOO13double:
-; 32R2: nmsub.d
-; 64R2: nmsub.d
-; 32R2NAN: msub.d
-; 64R2NAN: msub.d
+; ALL-LABEL: FOO13double:
+
+; 32-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 32R2-NONAN: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2-NONAN: nmsub.d $f0, $[[T0]], $f12, $f14
+
+; 32R2-NAN: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R2-NAN: msub.d $[[T1:f[0-9]+]], $[[T0]], $f12, $f14
+; 32R2-NAN: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R2-NAN: mthc1 $zero, $[[T2]]
+; 32R2-NAN: sub.d $f0, $[[T2]], $[[T1]]
+
+; 32R6-DAG: ldc1 $[[T0:f[0-9]+]], 16($sp)
+; 32R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f14
+; 32R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $[[T0]]
+; 32R6-DAG: mtc1 $zero, $[[T2:f[0-9]+]]
+; 32R6-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 64-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
+; 64R2-NONAN: nmsub.d $f0, $f14, $f12, $f13
+
+; 64R2-NAN: msub.d $[[T0:f[0-9]+]], $f14, $f12, $f13
+; 64R2-NAN: mtc1 $zero, $[[T1:f[0-9]+]]
+; 64R2-NAN: sub.d $f0, $[[T1]], $[[T0]]
+
+; 64R6-DAG: mul.d $[[T1:f[0-9]+]], $f12, $f13
+; 64R6-DAG: sub.d $[[T2:f[0-9]+]], $[[T1]], $f14
+; 64R6-DAG: dmtc1 $zero, $[[T2:f[0-9]+]]
+; 64R6-DAG: sub.d $f0, $[[T2]], $[[T1]]
+
%mul = fmul double %a, %b
%sub = fsub double %mul, %c
%sub1 = fsub double 0.000000e+00, %sub
diff --git a/test/CodeGen/Mips/fp-indexed-ls.ll b/test/CodeGen/Mips/fp-indexed-ls.ll
index d8c37e7..787e131 100644
--- a/test/CodeGen/Mips/fp-indexed-ls.ll
+++ b/test/CodeGen/Mips/fp-indexed-ls.ll
@@ -1,6 +1,13 @@
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
-; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s \
-; RUN: | FileCheck %s -check-prefix=CHECK-NACL
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R1
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R2
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32R6
+; RUN: llc -march=mips64el -mcpu=mips4 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS4
+; RUN: llc -march=mips64el -mcpu=mips64r6 -mattr=n64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS64R6
+
+; Check that [ls][dwu]xc1 are not emitted for NaCl.
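+; (presumably because NaCl's sandboxing of memory accesses cannot handle the
+; reg+reg addressing these instructions use)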
+; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=CHECK-NACL
%struct.S = type <{ [4 x float] }>
%struct.S2 = type <{ [4 x double] }>
@@ -14,8 +21,30 @@
define float @foo0(float* nocapture %b, i32 %o) nounwind readonly {
entry:
-; CHECK: lwxc1
+; ALL-LABEL: foo0:
+
+; MIPS32R1: sll $[[T1:[0-9]+]], $5, 2
+; MIPS32R1: addu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS32R1: lwc1 $f0, 0($[[T3]])
+
+; MIPS32R2: sll $[[T1:[0-9]+]], $5, 2
+; MIPS32R2: lwxc1 $f0, $[[T1]]($4)
+
+; MIPS32R6: sll $[[T1:[0-9]+]], $5, 2
+; MIPS32R6: addu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS32R6: lwc1 $f0, 0($[[T3]])
+
+; MIPS4: sll $[[T0:[0-9]+]], $5, 0
+; MIPS4: dsll $[[T1:[0-9]+]], $[[T0]], 2
+; MIPS4: lwxc1 $f0, $[[T1]]($4)
+
+; MIPS64R6: sll $[[T0:[0-9]+]], $5, 0
+; MIPS64R6: dsll $[[T1:[0-9]+]], $[[T0]], 2
+; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS64R6: lwc1 $f0, 0($[[T3]])
+
; CHECK-NACL-NOT: lwxc1
+
%arrayidx = getelementptr inbounds float* %b, i32 %o
%0 = load float* %arrayidx, align 4
ret float %0
@@ -23,8 +52,30 @@ entry:
define double @foo1(double* nocapture %b, i32 %o) nounwind readonly {
entry:
-; CHECK: ldxc1
+; ALL-LABEL: foo1:
+
+; MIPS32R1: sll $[[T1:[0-9]+]], $5, 3
+; MIPS32R1: addu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS32R1: ldc1 $f0, 0($[[T3]])
+
+; MIPS32R2: sll $[[T1:[0-9]+]], $5, 3
+; MIPS32R2: ldxc1 $f0, $[[T1]]($4)
+
+; MIPS32R6: sll $[[T1:[0-9]+]], $5, 3
+; MIPS32R6: addu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS32R6: ldc1 $f0, 0($[[T3]])
+
+; MIPS4: sll $[[T0:[0-9]+]], $5, 0
+; MIPS4: dsll $[[T1:[0-9]+]], $[[T0]], 3
+; MIPS4: ldxc1 $f0, $[[T1]]($4)
+
+; MIPS64R6: sll $[[T0:[0-9]+]], $5, 0
+; MIPS64R6: dsll $[[T1:[0-9]+]], $[[T0]], 3
+; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]]
+; MIPS64R6: ldc1 $f0, 0($[[T3]])
+
; CHECK-NACL-NOT: ldxc1
+
%arrayidx = getelementptr inbounds double* %b, i32 %o
%0 = load double* %arrayidx, align 8
ret double %0
@@ -32,7 +83,23 @@ entry:
define float @foo2(i32 %b, i32 %c) nounwind readonly {
entry:
-; CHECK-NOT: luxc1
+; ALL-LABEL: foo2:
+
+; luxc1 did not exist in MIPS32r1
+; MIPS32R1-NOT: luxc1
+
+; luxc1 is a misnomer since it aligns the given pointer downwards and performs
+; an aligned load. We mustn't use it to handle unaligned loads.
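+; A sketch of those semantics: the low bits of the effective address are
+; dropped, e.g.
+;   luxc1 $f0, $t0($t1)   # loads the aligned doubleword at ($t1 + $t0) & ~0x7
+; so reading a float from the unaligned address 0x1003 would fetch from
+; 0x1000 and yield the wrong bytes.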
+; MIPS32R2-NOT: luxc1
+
+; luxc1 was removed in MIPS32r6
+; MIPS32R6-NOT: luxc1
+
+; MIPS4-NOT: luxc1
+
+; luxc1 was removed in MIPS64r6
+; MIPS64R6-NOT: luxc1
+
%arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c
%0 = load float* %arrayidx1, align 1
ret float %0
@@ -40,8 +107,28 @@ entry:
define void @foo3(float* nocapture %b, i32 %o) nounwind {
entry:
-; CHECK: swxc1
+; ALL-LABEL: foo3:
+
+; MIPS32R1-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R1-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS32R1-DAG: swc1 $[[T0]], 0($[[T1]])
+
+; MIPS32R2: lwc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R2: swxc1 $[[T0]], ${{[0-9]+}}($4)
+
+; MIPS32R6-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R6-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS32R6-DAG: swc1 $[[T0]], 0($[[T1]])
+
+; MIPS4: lwc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS4: swxc1 $[[T0]], ${{[0-9]+}}($4)
+
+; MIPS64R6-DAG: lwc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS64R6-DAG: swc1 $[[T0]], 0($[[T1]])
+
; CHECK-NACL-NOT: swxc1
+
%0 = load float* @gf, align 4
%arrayidx = getelementptr inbounds float* %b, i32 %o
store float %0, float* %arrayidx, align 4
@@ -50,8 +137,28 @@ entry:
define void @foo4(double* nocapture %b, i32 %o) nounwind {
entry:
-; CHECK: sdxc1
+; ALL-LABEL: foo4:
+
+; MIPS32R1-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R1-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS32R1-DAG: sdc1 $[[T0]], 0($[[T1]])
+
+; MIPS32R2: ldc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R2: sdxc1 $[[T0]], ${{[0-9]+}}($4)
+
+; MIPS32R6-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS32R6-DAG: addu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS32R6-DAG: sdc1 $[[T0]], 0($[[T1]])
+
+; MIPS4: ldc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS4: sdxc1 $[[T0]], ${{[0-9]+}}($4)
+
+; MIPS64R6-DAG: ldc1 $[[T0:f0]], 0(${{[0-9]+}})
+; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}}
+; MIPS64R6-DAG: sdc1 $[[T0]], 0($[[T1]])
+
; CHECK-NACL-NOT: sdxc1
+
%0 = load double* @gd, align 8
%arrayidx = getelementptr inbounds double* %b, i32 %o
store double %0, double* %arrayidx, align 8
@@ -60,7 +167,18 @@ entry:
define void @foo5(i32 %b, i32 %c) nounwind {
entry:
-; CHECK-NOT: suxc1
+; ALL-LABEL: foo5:
+
+; MIPS32R1-NOT: suxc1
+
+; MIPS32R2-NOT: suxc1
+
+; MIPS32R6-NOT: suxc1
+
+; MIPS4-NOT: suxc1
+
+; MIPS64R6-NOT: suxc1
+
%0 = load float* @gf, align 4
%arrayidx1 = getelementptr inbounds [4 x %struct.S]* @s, i32 0, i32 %b, i32 0, i32 %c
store float %0, float* %arrayidx1, align 1
@@ -69,8 +187,18 @@ entry:
define double @foo6(i32 %b, i32 %c) nounwind readonly {
entry:
-; CHECK: foo6
-; CHECK-NOT: luxc1
+; ALL-LABEL: foo6:
+
+; MIPS32R1-NOT: luxc1
+
+; MIPS32R2-NOT: luxc1
+
+; MIPS32R6-NOT: luxc1
+
+; MIPS4-NOT: luxc1
+
+; MIPS64R6-NOT: luxc1
+
%arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c
%0 = load double* %arrayidx1, align 1
ret double %0
@@ -78,8 +206,18 @@ entry:
define void @foo7(i32 %b, i32 %c) nounwind {
entry:
-; CHECK: foo7
-; CHECK-NOT: suxc1
+; ALL-LABEL: foo7:
+
+; MIPS32R1-NOT: suxc1
+
+; MIPS32R2-NOT: suxc1
+
+; MIPS32R6-NOT: suxc1
+
+; MIPS4-NOT: suxc1
+
+; MIPS64R6-NOT: suxc1
+
%0 = load double* @gd, align 8
%arrayidx1 = getelementptr inbounds [4 x %struct.S2]* @s2, i32 0, i32 %b, i32 0, i32 %c
store double %0, double* %arrayidx1, align 1
@@ -88,16 +226,36 @@ entry:
define float @foo8() nounwind readonly {
entry:
-; CHECK: foo8
-; CHECK-NOT: luxc1
+; ALL-LABEL: foo8:
+
+; MIPS32R1-NOT: luxc1
+
+; MIPS32R2-NOT: luxc1
+
+; MIPS32R6-NOT: luxc1
+
+; MIPS4-NOT: luxc1
+
+; MIPS64R6-NOT: luxc1
+
%0 = load float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1
ret float %0
}
define void @foo9(float %f) nounwind {
entry:
-; CHECK: foo9
-; CHECK-NOT: suxc1
+; ALL-LABEL: foo9:
+
+; MIPS32R1-NOT: suxc1
+
+; MIPS32R2-NOT: suxc1
+
+; MIPS32R6-NOT: suxc1
+
+; MIPS4-NOT: suxc1
+
+; MIPS64R6-NOT: suxc1
+
store float %f, float* getelementptr inbounds (%struct.S3* @s3, i32 0, i32 1), align 1
ret void
}
diff --git a/test/CodeGen/Mips/fpbr.ll b/test/CodeGen/Mips/fpbr.ll
index a136557..311b830 100644
--- a/test/CodeGen/Mips/fpbr.ll
+++ b/test/CodeGen/Mips/fpbr.ll
@@ -1,9 +1,25 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=32-FCC
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=32-GPR
+; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=FCC -check-prefix=64-FCC
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=GPR -check-prefix=64-GPR
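+
+; Pre-R6 FP comparisons set an FPU condition flag that bc1f/bc1t consume
+; (the FCC prefixes); R6 removed those flags, so cmp.eq.s and friends write
+; an all-ones/all-zeros mask to an FPR that is moved to a GPR and branched
+; on with bnez/beqz (the GPR prefixes).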
define void @func0(float %f2, float %f3) nounwind {
entry:
-; CHECK: c.eq.s
-; CHECK: bc1f
+; ALL-LABEL: func0:
+
+; 32-FCC: c.eq.s $f12, $f14
+; 64-FCC: c.eq.s $f12, $f13
+; FCC: bc1f $BB0_2
+
+; 32-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f14
+; 64-GPR: cmp.eq.s $[[FGRCC:f[0-9]+]], $f12, $f13
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; FIXME: We ought to be able to transform not+bnez -> beqz
+; GPR: not $[[GPRCC]], $[[GPRCC]]
+; GPR: bnez $[[GPRCC]], $BB0_2
+
%cmp = fcmp oeq float %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -25,8 +41,18 @@ declare void @g1(...)
define void @func1(float %f2, float %f3) nounwind {
entry:
-; CHECK: c.olt.s
-; CHECK: bc1f
+; ALL-LABEL: func1:
+
+; 32-FCC: c.olt.s $f12, $f14
+; 64-FCC: c.olt.s $f12, $f13
+; FCC: bc1f $BB1_2
+
+; 32-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f14, $f12
+; 64-GPR: cmp.ule.s $[[FGRCC:f[0-9]+]], $f13, $f12
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: bnez $[[GPRCC]], $BB1_2
+
%cmp = fcmp olt float %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -44,8 +70,18 @@ if.end: ; preds = %if.else, %if.then
define void @func2(float %f2, float %f3) nounwind {
entry:
-; CHECK: c.ole.s
-; CHECK: bc1t
+; ALL-LABEL: func2:
+
+; 32-FCC: c.ole.s $f12, $f14
+; 64-FCC: c.ole.s $f12, $f13
+; FCC: bc1t $BB2_2
+
+; 32-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f14, $f12
+; 64-GPR: cmp.ult.s $[[FGRCC:f[0-9]+]], $f13, $f12
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: beqz $[[GPRCC]], $BB2_2
+
%cmp = fcmp ugt float %f2, %f3
br i1 %cmp, label %if.else, label %if.then
@@ -63,8 +99,19 @@ if.end: ; preds = %if.else, %if.then
define void @func3(double %f2, double %f3) nounwind {
entry:
-; CHECK: c.eq.d
-; CHECK: bc1f
+; ALL-LABEL: func3:
+
+; 32-FCC: c.eq.d $f12, $f14
+; 64-FCC: c.eq.d $f12, $f13
+; FCC: bc1f $BB3_2
+
+; 32-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f14
+; 64-GPR: cmp.eq.d $[[FGRCC:f[0-9]+]], $f12, $f13
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; FIXME: We ought to be able to transform not+bnez -> beqz
+; GPR: not $[[GPRCC]], $[[GPRCC]]
+; GPR: bnez $[[GPRCC]], $BB3_2
+
%cmp = fcmp oeq double %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -82,8 +129,18 @@ if.end: ; preds = %if.else, %if.then
define void @func4(double %f2, double %f3) nounwind {
entry:
-; CHECK: c.olt.d
-; CHECK: bc1f
+; ALL-LABEL: func4:
+
+; 32-FCC: c.olt.d $f12, $f14
+; 64-FCC: c.olt.d $f12, $f13
+; FCC: bc1f $BB4_2
+
+; 32-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f14, $f12
+; 64-GPR: cmp.ule.d $[[FGRCC:f[0-9]+]], $f13, $f12
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: bnez $[[GPRCC]], $BB4_2
+
%cmp = fcmp olt double %f2, %f3
br i1 %cmp, label %if.then, label %if.else
@@ -101,8 +158,18 @@ if.end: ; preds = %if.else, %if.then
define void @func5(double %f2, double %f3) nounwind {
entry:
-; CHECK: c.ole.d
-; CHECK: bc1t
+; ALL-LABEL: func5:
+
+; 32-FCC: c.ole.d $f12, $f14
+; 64-FCC: c.ole.d $f12, $f13
+; FCC: bc1t $BB5_2
+
+; 32-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f14, $f12
+; 64-GPR: cmp.ult.d $[[FGRCC:f[0-9]+]], $f13, $f12
+; GPR: mfc1 $[[GPRCC:[0-9]+]], $[[FGRCC:f[0-9]+]]
+; GPR-NOT: not $[[GPRCC]], $[[GPRCC]]
+; GPR: beqz $[[GPRCC]], $BB5_2
+
%cmp = fcmp ugt double %f2, %f3
br i1 %cmp, label %if.else, label %if.then
diff --git a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
index 9464918..a67ddce 100644
--- a/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
+++ b/test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll
@@ -1,6 +1,7 @@
; Positive test for inline register constraints
;
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
define i32 @main() nounwind {
entry:
diff --git a/test/CodeGen/Mips/lit.local.cfg b/test/CodeGen/Mips/lit.local.cfg
index 1fa54b4..a3183a2 100644
--- a/test/CodeGen/Mips/lit.local.cfg
+++ b/test/CodeGen/Mips/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Mips' in targets:
+if not 'Mips' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Mips/llvm-ir/call.ll b/test/CodeGen/Mips/llvm-ir/call.ll
new file mode 100644
index 0000000..4cbf43c
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/call.ll
@@ -0,0 +1,166 @@
+; Test the 'call' instruction and its tail-call variants ('tail' and 'musttail').
+
+; FIXME: We should remove the need for -enable-mips-tail-calls
+; RUN: llc -march=mips -mcpu=mips32 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
+; RUN: llc -march=mips -mcpu=mips32r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
+; RUN: llc -march=mips -mcpu=mips32r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=O32
+; RUN: llc -march=mips64 -mcpu=mips4 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips64 -mcpu=mips64 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips64 -mcpu=mips64r2 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+; RUN: llc -march=mips64 -mcpu=mips64r6 -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=ALL -check-prefix=N64
+
+declare void @extern_void_void()
+declare i32 @extern_i32_void()
+declare float @extern_float_void()
+
+define i32 @call_void_void() {
+; ALL-LABEL: call_void_void:
+
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
+
+; ALL: jalr $[[TGT]]
+
+ call void @extern_void_void()
+ ret i32 0
+}
+
+define i32 @call_i32_void() {
+; ALL-LABEL: call_i32_void:
+
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
+
+; ALL: jalr $[[TGT]]
+
+ %1 = call i32 @extern_i32_void()
+ %2 = add i32 %1, 1
+ ret i32 %2
+}
+
+define float @call_float_void() {
+; ALL-LABEL: call_float_void:
+
+; FIXME: Not sure why we don't use $gp directly on such a simple test. We should
+; look into it at some point.
+; O32: addu $[[GP:[0-9]+]], ${{[0-9]+}}, $25
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_float_void)($[[GP]])
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp)
+
+; ALL: jalr $[[TGT]]
+
+; O32: move $gp, $[[GP]]
+
+ %1 = call float @extern_float_void()
+ %2 = fadd float %1, 1.0
+ ret float %2
+}
+
+define void @musttail_call_void_void() {
+; ALL-LABEL: musttail_call_void_void:
+
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_void_void)($gp)
+
+; NOT-R6: jr $[[TGT]]
+; R6: r6.jr $[[TGT]]
+
+ musttail call void @extern_void_void()
+ ret void
+}
+
+define i32 @musttail_call_i32_void() {
+; ALL-LABEL: musttail_call_i32_void:
+
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_i32_void)($gp)
+
+; NOT-R6: jr $[[TGT]]
+; R6: r6.jr $[[TGT]]
+
+ %1 = musttail call i32 @extern_i32_void()
+ ret i32 %1
+}
+
+define float @musttail_call_float_void() {
+; ALL-LABEL: musttail_call_float_void:
+
+; O32: lw $[[TGT:[0-9]+]], %call16(extern_float_void)($gp)
+
+; N64: ld $[[TGT:[0-9]+]], %call16(extern_float_void)($gp)
+
+; NOT-R6: jr $[[TGT]]
+; R6: r6.jr $[[TGT]]
+
+ %1 = musttail call float @extern_float_void()
+ ret float %1
+}
+
+define i32 @indirect_call_void_void(void ()* %addr) {
+; ALL-LABEL: indirect_call_void_void:
+
+; ALL: move $25, $4
+; ALL: jalr $25
+
+ call void %addr()
+ ret i32 0
+}
+
+define i32 @indirect_call_i32_void(i32 ()* %addr) {
+; ALL-LABEL: indirect_call_i32_void:
+
+; ALL: move $25, $4
+; ALL: jalr $25
+
+ %1 = call i32 %addr()
+ %2 = add i32 %1, 1
+ ret i32 %2
+}
+
+define float @indirect_call_float_void(float ()* %addr) {
+; ALL-LABEL: indirect_call_float_void:
+
+; ALL: move $25, $4
+; ALL: jalr $25
+
+ %1 = call float %addr()
+ %2 = fadd float %1, 1.0
+ ret float %2
+}
+
+; We can't use 'musttail' here because the verifier is too conservative and
+; prohibits any prototype difference.
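+; For example, the callers below have type void (void ()*) (and similar)
+; while their callees take no arguments, so the prototypes differ and
+; 'musttail' would be rejected.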
+define void @tail_indirect_call_void_void(void ()* %addr) {
+; ALL-LABEL: tail_indirect_call_void_void:
+
+; ALL: move $25, $4
+; ALL: jr $25
+
+ tail call void %addr()
+ ret void
+}
+
+define i32 @tail_indirect_call_i32_void(i32 ()* %addr) {
+; ALL-LABEL: tail_indirect_call_i32_void:
+
+; ALL: move $25, $4
+; ALL: jr $25
+
+ %1 = tail call i32 %addr()
+ ret i32 %1
+}
+
+define float @tail_indirect_call_float_void(float ()* %addr) {
+; ALL-LABEL: tail_indirect_call_float_void:
+
+; ALL: move $25, $4
+; ALL: jr $25
+
+ %1 = tail call float %addr()
+ ret float %1
+}
diff --git a/test/CodeGen/Mips/llvm-ir/indirectbr.ll b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
new file mode 100644
index 0000000..d8fd787
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/indirectbr.ll
@@ -0,0 +1,34 @@
+; Test all important variants of the 'indirectbr' instruction.
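+;
+; Note: on MIPS32r6/MIPS64r6, 'jr' is an alias for 'jalr $zero', so although
+; the printed mnemonic is still 'jr', the MCInst opcode shown by
+; -asm-show-inst is JALR. The R6 checks rely on this to tell the encodings
+; apart.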
+
+; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
+; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=R6
+
+define i32 @br(i8 *%addr) {
+; ALL-LABEL: br:
+; NOT-R6: jr $4 # <MCInst #{{[0-9]+}} JR
+; R6: jr $4 # <MCInst #{{[0-9]+}} JALR
+
+; ALL: $BB0_1: # %L1
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
+; ALL: addiu $2, $zero, 0
+
+; ALL: $BB0_2: # %L2
+; NOT-R6: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6: jr $ra # <MCInst #{{[0-9]+}} JALR
+; ALL: addiu $2, $zero, 1
+
+entry:
+ indirectbr i8* %addr, [label %L1, label %L2]
+
+L1:
+ ret i32 0
+
+L2:
+ ret i32 1
+}
diff --git a/test/CodeGen/Mips/llvm-ir/ret.ll b/test/CodeGen/Mips/llvm-ir/ret.ll
new file mode 100644
index 0000000..8f5b115
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/ret.ll
@@ -0,0 +1,205 @@
+; Test all important variants of the 'ret' instruction.
+;
+; For non-void returns it is necessary to have something to return, so we
+; also test constant generation here.
+;
+; We'll test pointer returns in a separate file since the relocation model
+; affects them and it's undesirable to repeat the non-pointer returns for
+; each relocation model.
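+;
+; For example (as the checks below show), i32 257 fits in a single addiu,
+; 65536 is a bare lui, and 65537 (0x00010001) takes a lui/ori pair.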
+
+; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=NO-MTHC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips -mcpu=mips32r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR32 -check-prefix=MTHC1 -check-prefix=R6
+; RUN: llc -march=mips64 -mcpu=mips4 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r2 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=NOT-R6
+; RUN: llc -march=mips64 -mcpu=mips64r6 -asm-show-inst < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR64 -check-prefix=DMTC1 -check-prefix=R6
+
+define void @ret_void() {
+; ALL-LABEL: ret_void:
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret void
+}
+
+define i8 @ret_i8() {
+; ALL-LABEL: ret_i8:
+; ALL-DAG: addiu $2, $zero, 3
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i8 3
+}
+
+define i16 @ret_i16_3() {
+; ALL-LABEL: ret_i16_3:
+; ALL-DAG: addiu $2, $zero, 3
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i16 3
+}
+
+define i16 @ret_i16_256() {
+; ALL-LABEL: ret_i16_256:
+; ALL-DAG: addiu $2, $zero, 256
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i16 256
+}
+
+define i16 @ret_i16_257() {
+; ALL-LABEL: ret_i16_257:
+; ALL-DAG: addiu $2, $zero, 257
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i16 257
+}
+
+define i32 @ret_i32_257() {
+; ALL-LABEL: ret_i32_257:
+; ALL-DAG: addiu $2, $zero, 257
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i32 257
+}
+
+define i32 @ret_i32_65536() {
+; ALL-LABEL: ret_i32_65536:
+; ALL-DAG: lui $2, 1
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i32 65536
+}
+
+define i32 @ret_i32_65537() {
+; ALL-LABEL: ret_i32_65537:
+; ALL: lui $[[T0:[0-9]+]], 1
+; ALL-DAG: ori $2, $[[T0]], 1
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i32 65537
+}
+
+define i64 @ret_i64_65537() {
+; ALL-LABEL: ret_i64_65537:
+; ALL: lui $[[T0:[0-9]+]], 1
+
+; GPR32-DAG: ori $3, $[[T0]], 1
+; GPR32-DAG: addiu $2, $zero, 0
+
+; GPR64-DAG: daddiu $2, $[[T0]], 1
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i64 65537
+}
+
+define i64 @ret_i64_281479271677952() {
+; ALL-LABEL: ret_i64_281479271677952:
+; ALL-DAG: lui $[[T0:[0-9]+]], 1
+
+; GPR32-DAG: ori $2, $[[T0]], 1
+; GPR32-DAG: addiu $3, $zero, 0
+
+; GPR64-DAG: daddiu $[[T1:[0-9]+]], $[[T0]], 1
+; GPR64-DAG: dsll $2, $[[T1]], 32
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i64 281479271677952
+}
+
+define i64 @ret_i64_281479271809026() {
+; ALL-LABEL: ret_i64_281479271809026:
+; GPR32-DAG: lui $[[T0:[0-9]+]], 1
+; GPR32-DAG: lui $[[T1:[0-9]+]], 2
+; GPR32-DAG: ori $2, $[[T0]], 1
+; GPR32-DAG: ori $3, $[[T1]], 2
+
+; GPR64-DAG: ori $[[T0:[0-9]+]], $zero, 32769
+; GPR64-DAG: dsll $[[T1:[0-9]+]], $[[T0]], 16
+; GPR64-DAG: daddiu $[[T0:[0-9]+]], $[[T0]], -32767
+; GPR64-DAG: dsll $[[T1:[0-9]+]], $[[T0]], 17
+; GPR64-DAG: daddiu $2, $[[T1]], 2
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret i64 281479271809026
+}
+
+define float @ret_float_0x0() {
+; ALL-LABEL: ret_float_0x0:
+
+; NO-MTHC1-DAG: mtc1 $zero, $f0
+
+; MTHC1-DAG: mtc1 $zero, $f0
+
+; DMTC1-DAG: dmtc1 $zero, $f0
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret float 0x0000000000000000
+}
+
+define float @ret_float_0x3() {
+; ALL-LABEL: ret_float_0x3:
+
+; Use a constant pool
+; GPR32-DAG: lwc1 $f0, %lo($CPI
+; GPR64-DAG: lwc1 $f0, %got_ofst($CPI
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+; Hexadecimal float constants in LLVM IR are written using the double encoding.
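+; (0x36b8000000000000 below is the double encoding of 3*2^-149, a value that
+; is exactly representable as a float)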
+ ret float 0x36b8000000000000
+}
+
+define double @ret_double_0x0() {
+; ALL-LABEL: ret_double_0x0:
+
+; NO-MTHC1-DAG: mtc1 $zero, $f0
+; NO-MTHC1-DAG: mtc1 $zero, $f1
+
+; MTHC1-DAG: mtc1 $zero, $f0
+; MTHC1-DAG: mthc1 $zero, $f0
+
+; DMTC1-DAG: dmtc1 $zero, $f0
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret double 0x0000000000000000
+}
+
+define double @ret_double_0x3() {
+; ALL-LABEL: ret_double_0x3:
+
+; Use a constant pool
+; GPR32-DAG: ldc1 $f0, %lo($CPI
+; GPR64-DAG: ldc1 $f0, %got_ofst($CPI
+
+; NOT-R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JR
+; R6-DAG: jr $ra # <MCInst #{{[0-9]+}} JALR
+
+ ret double 0x0000000000000003
+}
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index c7fe6fd..a403744 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -7,6 +7,8 @@
; RUN: < %s | FileCheck %s -check-prefix=N64
; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=micromips \
; RUN: -force-mips-long-branch -O3 < %s | FileCheck %s -check-prefix=MICROMIPS
+; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 < %s \
+; RUN: | FileCheck %s -check-prefix=NACL
@x = external global i32
@@ -126,4 +128,36 @@ end:
; MICROMIPS: $[[BB2]]:
; MICROMIPS: jr $ra
; MICROMIPS: nop
+
+
+; Check the NaCl version. Check that the $sp change is not in the branch
+; delay slot of the "jr $1" instruction and that the target of the indirect
+; branch "jr $1" is bundle aligned.
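+; (NaCl code is laid out in 16-byte bundles: the ".align 4" checked below
+; presumably puts the branch target on a bundle boundary, and keeping the
+; $sp update out of the delay slot keeps it adjacent to its sandboxing mask)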
+
+; NACL: lui $[[R0:[0-9]+]], %hi(_gp_disp)
+; NACL: addiu $[[R0]], $[[R0]], %lo(_gp_disp)
+; NACL: bnez $4, $[[BB0:BB[0-9_]+]]
+; NACL: addu $[[GP:[0-9]+]], $[[R0]], $25
+
+; Check for long branch expansion:
+; NACL: addiu $sp, $sp, -8
+; NACL-NEXT: sw $ra, 0($sp)
+; NACL-NEXT: lui $1, %hi(($[[BB2:BB[0-9_]+]])-($[[BB1:BB[0-9_]+]]))
+; NACL-NEXT: bal $[[BB1]]
+; NACL-NEXT: addiu $1, $1, %lo(($[[BB2]])-($[[BB1]]))
+; NACL-NEXT: $[[BB1]]:
+; NACL-NEXT: addu $1, $ra, $1
+; NACL-NEXT: lw $ra, 0($sp)
+; NACL-NEXT: addiu $sp, $sp, 8
+; NACL-NEXT: jr $1
+; NACL-NEXT: nop
+
+; NACL: $[[BB0]]:
+; NACL: lw $[[R1:[0-9]+]], %got(x)($[[GP]])
+; NACL: addiu $[[R2:[0-9]+]], $zero, 1
+; NACL: sw $[[R2]], 0($[[R1]])
+; NACL: .align 4
+; NACL-NEXT: $[[BB2]]:
+; NACL: jr $ra
+; NACL: nop
}
diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index 0dbb2c2..8222967 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll
@@ -1,9 +1,49 @@
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=32
-; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=DSP
+; RUN: llc -march=mips -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32
+; RUN: llc -march=mips -mcpu=mips32r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R6
+; RUN: llc -march=mips -mcpu=mips32 -mattr=dsp < %s | FileCheck %s -check-prefix=DSP
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64
+; RUN: llc -march=mips64 -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64
+; RUN: llc -march=mips64 -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64R6
+
+; FIXME: The MIPS16 test should check its output
; RUN: llc -march=mips -mcpu=mips16 < %s
-; 32: madd ${{[0-9]+}}
-; DSP: madd $ac
+; ALL-LABEL: madd1:
+
+; 32-DAG: sra $[[T0:[0-9]+]], $6, 31
+; 32-DAG: mtlo $6
+; 32-DAG: [[m:m]]add ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31
+; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6
+; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31
+; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $2, $[[T5]], $[[T4]]
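+; (the sltu recovers the carry out of the low-word add: T1 = T0 + c wraps
+; iff T1 < c unsigned; that carry plus the sign word of c is then added to
+; the muh result to form the high word)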
+
+; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]]
+; 64-DAG: [[m]]flo $[[T2:[0-9]+]]
+; 64-DAG: sll $[[T3:[0-9]+]], $6, 0
+; 64-DAG: daddu $2, $[[T2]], $[[T3]]
+
+; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+; 64R6-DAG: sll $[[T3:[0-9]+]], $6, 0
+; 64R6-DAG: daddu $2, $[[T2]], $[[T3]]
+
define i64 @madd1(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
%conv = sext i32 %a to i64
@@ -14,8 +54,47 @@ entry:
ret i64 %add
}
-; 32: maddu ${{[0-9]+}}
-; DSP: maddu $ac
+; ALL-LABEL: madd2:
+
+; FIXME: We don't really need this instruction
+; 32-DAG: addiu $[[T0:[0-9]+]], $zero, 0
+; 32-DAG: mtlo $6
+; 32-DAG: [[m:m]]addu ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
+; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: maddu $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6
+; FIXME: There's a redundant move here. We should remove it
+; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $2, $[[T3]], $[[T2]]
+
+; 64-DAG: dsll $[[T0:[0-9]+]], $4, 32
+; 64-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32
+; 64-DAG: dsll $[[T2:[0-9]+]], $5, 32
+; 64-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+; 64-DAG: d[[m:m]]ult $[[T3]], $[[T1]]
+; 64-DAG: [[m]]flo $[[T4:[0-9]+]]
+; 64-DAG: dsll $[[T5:[0-9]+]], $6, 32
+; 64-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32
+; 64-DAG: daddu $2, $[[T4]], $[[T6]]
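+; (the dsll/dsrl-by-32 pairs zero-extend the i32 operands, matching the
+; zext in the IR below)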
+
+; 64R6-DAG: dsll $[[T0:[0-9]+]], $4, 32
+; 64R6-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32
+; 64R6-DAG: dsll $[[T2:[0-9]+]], $5, 32
+; 64R6-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+; 64R6-DAG: dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
+; 64R6-DAG: dsll $[[T5:[0-9]+]], $6, 32
+; 64R6-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32
+; 64R6-DAG: daddu $2, $[[T4]], $[[T6]]
+
define i64 @madd2(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
%conv = zext i32 %a to i64
@@ -26,8 +105,38 @@ entry:
ret i64 %add
}
-; 32: madd ${{[0-9]+}}
-; DSP: madd $ac
+; ALL-LABEL: madd3:
+
+; 32-DAG: mthi $6
+; 32-DAG: mtlo $7
+; 32-DAG: [[m:m]]add ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: mthi $[[AC:ac[0-3]+]], $6
+; DSP-DAG: mtlo $[[AC]], $7
+; DSP-DAG: madd $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7
+; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6
+; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: addu $2, $[[T5]], $[[T4]]
+
+; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]]
+; 64-DAG: [[m]]flo $[[T2:[0-9]+]]
+; 64-DAG: daddu $2, $[[T2]], $6
+
+; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+; 64R6-DAG: daddu $2, $[[T2]], $6
+
define i64 @madd3(i32 %a, i32 %b, i64 %c) nounwind readnone {
entry:
%conv = sext i32 %a to i64
@@ -37,8 +146,41 @@ entry:
ret i64 %add
}
-; 32: msub ${{[0-9]+}}
-; DSP: msub $ac
+; ALL-LABEL: msub1:
+
+; 32-DAG: sra $[[T0:[0-9]+]], $6, 31
+; 32-DAG: mtlo $6
+; 32-DAG: [[m:m]]sub ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: sra $[[T0:[0-9]+]], $6, 31
+; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]]
+; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]]
+; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31
+; 32R6-DAG: subu $2, $[[T5]], $[[T4]]
+; 32R6-DAG: subu $3, $6, $[[T1]]
+
+; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]]
+; 64-DAG: [[m]]flo $[[T2:[0-9]+]]
+; 64-DAG: sll $[[T3:[0-9]+]], $6, 0
+; 64-DAG: dsubu $2, $[[T3]], $[[T2]]
+
+; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+; 64R6-DAG: sll $[[T3:[0-9]+]], $6, 0
+; 64R6-DAG: dsubu $2, $[[T3]], $[[T2]]
+
define i64 @msub1(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
%conv = sext i32 %c to i64
@@ -49,8 +191,48 @@ entry:
ret i64 %sub
}
-; 32: msubu ${{[0-9]+}}
-; DSP: msubu $ac
+; ALL-LABEL: msub2:
+
+; FIXME: We don't really need this instruction
+; 32-DAG: addiu $[[T0:[0-9]+]], $zero, 0
+; 32-DAG: mtlo $6
+; 32-DAG: [[m:m]]subu ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
+; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: msubu $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
+
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $6, $[[T1]]
+; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]]
+; 32R6-DAG: negu $2, $[[T3]]
+; 32R6-DAG: subu $3, $6, $[[T1]]
+
+; 64-DAG: dsll $[[T0:[0-9]+]], $4, 32
+; 64-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32
+; 64-DAG: dsll $[[T2:[0-9]+]], $5, 32
+; 64-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+; 64-DAG: d[[m:m]]ult $[[T3]], $[[T1]]
+; 64-DAG: [[m]]flo $[[T4:[0-9]+]]
+; 64-DAG: dsll $[[T5:[0-9]+]], $6, 32
+; 64-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32
+; 64-DAG: dsubu $2, $[[T6]], $[[T4]]
+
+; 64R6-DAG: dsll $[[T0:[0-9]+]], $4, 32
+; 64R6-DAG: dsrl $[[T1:[0-9]+]], $[[T0]], 32
+; 64R6-DAG: dsll $[[T2:[0-9]+]], $5, 32
+; 64R6-DAG: dsrl $[[T3:[0-9]+]], $[[T2]], 32
+; 64R6-DAG: dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
+; 64R6-DAG: dsll $[[T5:[0-9]+]], $6, 32
+; 64R6-DAG: dsrl $[[T6:[0-9]+]], $[[T5]], 32
+; 64R6-DAG: dsubu $2, $[[T6]], $[[T4]]
+
define i64 @msub2(i32 %a, i32 %b, i32 %c) nounwind readnone {
entry:
%conv = zext i32 %c to i64
@@ -61,8 +243,39 @@ entry:
ret i64 %sub
}
-; 32: msub ${{[0-9]+}}
-; DSP: msub $ac
+; ALL-LABEL: msub3:
+
+; FIXME: We don't really need this instruction
+; 32-DAG: mthi $6
+; 32-DAG: mtlo $7
+; 32-DAG: [[m:m]]sub ${{[45]}}, ${{[45]}}
+; 32-DAG: [[m]]fhi $2
+; 32-DAG: [[m]]flo $3
+
+; DSP-DAG: addiu $[[T0:[0-9]+]], $zero, 0
+; DSP-DAG: mtlo $[[AC:ac[0-3]+]], $6
+; DSP-DAG: msub $[[AC]], ${{[45]}}, ${{[45]}}
+; DSP-DAG: mfhi $2, $[[AC]]
+; DSP-DAG: mflo $3, $[[AC]]
+
+; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}}
+; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]]
+; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]]
+; 32R6-DAG: subu $2, $6, $[[T3]]
+; 32R6-DAG: subu $3, $7, $[[T1]]
+
+; 64-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64-DAG: d[[m:m]]ult $[[T1]], $[[T0]]
+; 64-DAG: [[m]]flo $[[T2:[0-9]+]]
+; 64-DAG: dsubu $2, $6, $[[T2]]
+
+; 64R6-DAG: sll $[[T0:[0-9]+]], $4, 0
+; 64R6-DAG: sll $[[T1:[0-9]+]], $5, 0
+; 64R6-DAG: dmul $[[T2:[0-9]+]], $[[T1]], $[[T0]]
+; 64R6-DAG: dsubu $2, $6, $[[T2]]
+
define i64 @msub3(i32 %a, i32 %b, i64 %c) nounwind readnone {
entry:
%conv = sext i32 %a to i64
diff --git a/test/CodeGen/Mips/mips16ex.ll b/test/CodeGen/Mips/mips16ex.ll
index ecb30b5..a1a9919 100644
--- a/test/CodeGen/Mips/mips16ex.ll
+++ b/test/CodeGen/Mips/mips16ex.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
-;16: $eh_func_begin0=.
+;16: .cfi_personality
+;16-NEXT: [[TMP:.*]]:
+;16-NEXT: $eh_func_begin0 = ([[TMP]])
@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
@_ZTIi = external constant i8*
@.str1 = private unnamed_addr constant [15 x i8] c"exception %i \0A\00", align 1
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index 4d590b6..7f7d515 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -1,7 +1,11 @@
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips4 -soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64 -soft-float -O1 \
-; RUN: -disable-mips-delay-filler < %s | FileCheck %s
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
+; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r2 -soft-float -O1 \
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=C_CC_FMT
+; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r6 -soft-float -O1 \
+; RUN: -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=ALL -check-prefix=CMP_CC_FMT
@gld0 = external global fp128
@gld1 = external global fp128
@@ -9,8 +13,8 @@
@gf1 = external global float
@gd1 = external global double
-; CHECK-LABEL: addLD:
-; CHECK: ld $25, %call16(__addtf3)
+; ALL-LABEL: addLD:
+; ALL: ld $25, %call16(__addtf3)
define fp128 @addLD() {
entry:
@@ -20,8 +24,8 @@ entry:
ret fp128 %add
}
-; CHECK-LABEL: subLD:
-; CHECK: ld $25, %call16(__subtf3)
+; ALL-LABEL: subLD:
+; ALL: ld $25, %call16(__subtf3)
define fp128 @subLD() {
entry:
@@ -31,8 +35,8 @@ entry:
ret fp128 %sub
}
-; CHECK-LABEL: mulLD:
-; CHECK: ld $25, %call16(__multf3)
+; ALL-LABEL: mulLD:
+; ALL: ld $25, %call16(__multf3)
define fp128 @mulLD() {
entry:
@@ -42,8 +46,8 @@ entry:
ret fp128 %mul
}
-; CHECK-LABEL: divLD:
-; CHECK: ld $25, %call16(__divtf3)
+; ALL-LABEL: divLD:
+; ALL: ld $25, %call16(__divtf3)
define fp128 @divLD() {
entry:
@@ -53,8 +57,8 @@ entry:
ret fp128 %div
}
-; CHECK-LABEL: conv_LD_char:
-; CHECK: ld $25, %call16(__floatsitf)
+; ALL-LABEL: conv_LD_char:
+; ALL: ld $25, %call16(__floatsitf)
define fp128 @conv_LD_char(i8 signext %a) {
entry:
@@ -62,8 +66,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_short:
-; CHECK: ld $25, %call16(__floatsitf)
+; ALL-LABEL: conv_LD_short:
+; ALL: ld $25, %call16(__floatsitf)
define fp128 @conv_LD_short(i16 signext %a) {
entry:
@@ -71,8 +75,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_int:
-; CHECK: ld $25, %call16(__floatsitf)
+; ALL-LABEL: conv_LD_int:
+; ALL: ld $25, %call16(__floatsitf)
define fp128 @conv_LD_int(i32 %a) {
entry:
@@ -80,8 +84,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_LL:
-; CHECK: ld $25, %call16(__floatditf)
+; ALL-LABEL: conv_LD_LL:
+; ALL: ld $25, %call16(__floatditf)
define fp128 @conv_LD_LL(i64 %a) {
entry:
@@ -89,8 +93,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_UChar:
-; CHECK: ld $25, %call16(__floatunsitf)
+; ALL-LABEL: conv_LD_UChar:
+; ALL: ld $25, %call16(__floatunsitf)
define fp128 @conv_LD_UChar(i8 zeroext %a) {
entry:
@@ -98,8 +102,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_UShort:
-; CHECK: ld $25, %call16(__floatunsitf)
+; ALL-LABEL: conv_LD_UShort:
+; ALL: ld $25, %call16(__floatunsitf)
define fp128 @conv_LD_UShort(i16 zeroext %a) {
entry:
@@ -107,8 +111,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_UInt:
-; CHECK: ld $25, %call16(__floatunsitf)
+; ALL-LABEL: conv_LD_UInt:
+; ALL: ld $25, %call16(__floatunsitf)
define fp128 @conv_LD_UInt(i32 %a) {
entry:
@@ -116,8 +120,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_ULL:
-; CHECK: ld $25, %call16(__floatunditf)
+; ALL-LABEL: conv_LD_ULL:
+; ALL: ld $25, %call16(__floatunditf)
define fp128 @conv_LD_ULL(i64 %a) {
entry:
@@ -125,8 +129,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_char_LD:
-; CHECK: ld $25, %call16(__fixtfsi)
+; ALL-LABEL: conv_char_LD:
+; ALL: ld $25, %call16(__fixtfsi)
define signext i8 @conv_char_LD(fp128 %a) {
entry:
@@ -134,8 +138,8 @@ entry:
ret i8 %conv
}
-; CHECK-LABEL: conv_short_LD:
-; CHECK: ld $25, %call16(__fixtfsi)
+; ALL-LABEL: conv_short_LD:
+; ALL: ld $25, %call16(__fixtfsi)
define signext i16 @conv_short_LD(fp128 %a) {
entry:
@@ -143,8 +147,8 @@ entry:
ret i16 %conv
}
-; CHECK-LABEL: conv_int_LD:
-; CHECK: ld $25, %call16(__fixtfsi)
+; ALL-LABEL: conv_int_LD:
+; ALL: ld $25, %call16(__fixtfsi)
define i32 @conv_int_LD(fp128 %a) {
entry:
@@ -152,8 +156,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: conv_LL_LD:
-; CHECK: ld $25, %call16(__fixtfdi)
+; ALL-LABEL: conv_LL_LD:
+; ALL: ld $25, %call16(__fixtfdi)
define i64 @conv_LL_LD(fp128 %a) {
entry:
@@ -161,8 +165,8 @@ entry:
ret i64 %conv
}
-; CHECK-LABEL: conv_UChar_LD:
-; CHECK: ld $25, %call16(__fixtfsi)
+; ALL-LABEL: conv_UChar_LD:
+; ALL: ld $25, %call16(__fixtfsi)
define zeroext i8 @conv_UChar_LD(fp128 %a) {
entry:
@@ -170,8 +174,8 @@ entry:
ret i8 %conv
}
-; CHECK-LABEL: conv_UShort_LD:
-; CHECK: ld $25, %call16(__fixtfsi)
+; ALL-LABEL: conv_UShort_LD:
+; ALL: ld $25, %call16(__fixtfsi)
define zeroext i16 @conv_UShort_LD(fp128 %a) {
entry:
@@ -179,8 +183,8 @@ entry:
ret i16 %conv
}
-; CHECK-LABEL: conv_UInt_LD:
-; CHECK: ld $25, %call16(__fixunstfsi)
+; ALL-LABEL: conv_UInt_LD:
+; ALL: ld $25, %call16(__fixunstfsi)
define i32 @conv_UInt_LD(fp128 %a) {
entry:
@@ -188,8 +192,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: conv_ULL_LD:
-; CHECK: ld $25, %call16(__fixunstfdi)
+; ALL-LABEL: conv_ULL_LD:
+; ALL: ld $25, %call16(__fixunstfdi)
define i64 @conv_ULL_LD(fp128 %a) {
entry:
@@ -197,8 +201,8 @@ entry:
ret i64 %conv
}
-; CHECK-LABEL: conv_LD_float:
-; CHECK: ld $25, %call16(__extendsftf2)
+; ALL-LABEL: conv_LD_float:
+; ALL: ld $25, %call16(__extendsftf2)
define fp128 @conv_LD_float(float %a) {
entry:
@@ -206,8 +210,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_LD_double:
-; CHECK: ld $25, %call16(__extenddftf2)
+; ALL-LABEL: conv_LD_double:
+; ALL: ld $25, %call16(__extenddftf2)
define fp128 @conv_LD_double(double %a) {
entry:
@@ -215,8 +219,8 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: conv_float_LD:
-; CHECK: ld $25, %call16(__trunctfsf2)
+; ALL-LABEL: conv_float_LD:
+; ALL: ld $25, %call16(__trunctfsf2)
define float @conv_float_LD(fp128 %a) {
entry:
@@ -224,8 +228,8 @@ entry:
ret float %conv
}
-; CHECK-LABEL: conv_double_LD:
-; CHECK: ld $25, %call16(__trunctfdf2)
+; ALL-LABEL: conv_double_LD:
+; ALL: ld $25, %call16(__trunctfdf2)
define double @conv_double_LD(fp128 %a) {
entry:
@@ -233,13 +237,13 @@ entry:
ret double %conv
}
-; CHECK-LABEL: libcall1_fabsl:
-; CHECK-DAG: ld $[[R0:[0-9]+]], 8($[[R4:[0-9]+]])
-; CHECK-DAG: daddiu $[[R1:[0-9]+]], $zero, 1
-; CHECK-DAG: dsll $[[R2:[0-9]+]], $[[R1]], 63
-; CHECK-DAG: daddiu $[[R3:[0-9]+]], $[[R2]], -1
-; CHECK-DAG: and $4, $[[R0]], $[[R3]]
-; CHECK-DAG: ld $2, 0($[[R4]])
+; ALL-LABEL: libcall1_fabsl:
+; ALL-DAG: ld $[[R0:[0-9]+]], 8($[[R4:[0-9]+]])
+; ALL-DAG: daddiu $[[R1:[0-9]+]], $zero, 1
+; ALL-DAG: dsll $[[R2:[0-9]+]], $[[R1]], 63
+; ALL-DAG: daddiu $[[R3:[0-9]+]], $[[R2]], -1
+; ALL-DAG: and $4, $[[R0]], $[[R3]]
+; ALL-DAG: ld $2, 0($[[R4]])
define fp128 @libcall1_fabsl() {
entry:
@@ -250,8 +254,8 @@ entry:
declare fp128 @fabsl(fp128) #1
-; CHECK-LABEL: libcall1_ceill:
-; CHECK: ld $25, %call16(ceill)
+; ALL-LABEL: libcall1_ceill:
+; ALL: ld $25, %call16(ceill)
define fp128 @libcall1_ceill() {
entry:
@@ -262,8 +266,8 @@ entry:
declare fp128 @ceill(fp128) #1
-; CHECK-LABEL: libcall1_sinl:
-; CHECK: ld $25, %call16(sinl)
+; ALL-LABEL: libcall1_sinl:
+; ALL: ld $25, %call16(sinl)
define fp128 @libcall1_sinl() {
entry:
@@ -274,8 +278,8 @@ entry:
declare fp128 @sinl(fp128) #2
-; CHECK-LABEL: libcall1_cosl:
-; CHECK: ld $25, %call16(cosl)
+; ALL-LABEL: libcall1_cosl:
+; ALL: ld $25, %call16(cosl)
define fp128 @libcall1_cosl() {
entry:
@@ -286,8 +290,8 @@ entry:
declare fp128 @cosl(fp128) #2
-; CHECK-LABEL: libcall1_expl:
-; CHECK: ld $25, %call16(expl)
+; ALL-LABEL: libcall1_expl:
+; ALL: ld $25, %call16(expl)
define fp128 @libcall1_expl() {
entry:
@@ -298,8 +302,8 @@ entry:
declare fp128 @expl(fp128) #2
-; CHECK-LABEL: libcall1_exp2l:
-; CHECK: ld $25, %call16(exp2l)
+; ALL-LABEL: libcall1_exp2l:
+; ALL: ld $25, %call16(exp2l)
define fp128 @libcall1_exp2l() {
entry:
@@ -310,8 +314,8 @@ entry:
declare fp128 @exp2l(fp128) #2
-; CHECK-LABEL: libcall1_logl:
-; CHECK: ld $25, %call16(logl)
+; ALL-LABEL: libcall1_logl:
+; ALL: ld $25, %call16(logl)
define fp128 @libcall1_logl() {
entry:
@@ -322,8 +326,8 @@ entry:
declare fp128 @logl(fp128) #2
-; CHECK-LABEL: libcall1_log2l:
-; CHECK: ld $25, %call16(log2l)
+; ALL-LABEL: libcall1_log2l:
+; ALL: ld $25, %call16(log2l)
define fp128 @libcall1_log2l() {
entry:
@@ -334,8 +338,8 @@ entry:
declare fp128 @log2l(fp128) #2
-; CHECK-LABEL: libcall1_log10l:
-; CHECK: ld $25, %call16(log10l)
+; ALL-LABEL: libcall1_log10l:
+; ALL: ld $25, %call16(log10l)
define fp128 @libcall1_log10l() {
entry:
@@ -346,8 +350,8 @@ entry:
declare fp128 @log10l(fp128) #2
-; CHECK-LABEL: libcall1_nearbyintl:
-; CHECK: ld $25, %call16(nearbyintl)
+; ALL-LABEL: libcall1_nearbyintl:
+; ALL: ld $25, %call16(nearbyintl)
define fp128 @libcall1_nearbyintl() {
entry:
@@ -358,8 +362,8 @@ entry:
declare fp128 @nearbyintl(fp128) #1
-; CHECK-LABEL: libcall1_floorl:
-; CHECK: ld $25, %call16(floorl)
+; ALL-LABEL: libcall1_floorl:
+; ALL: ld $25, %call16(floorl)
define fp128 @libcall1_floorl() {
entry:
@@ -370,8 +374,8 @@ entry:
declare fp128 @floorl(fp128) #1
-; CHECK-LABEL: libcall1_sqrtl:
-; CHECK: ld $25, %call16(sqrtl)
+; ALL-LABEL: libcall1_sqrtl:
+; ALL: ld $25, %call16(sqrtl)
define fp128 @libcall1_sqrtl() {
entry:
@@ -382,8 +386,8 @@ entry:
declare fp128 @sqrtl(fp128) #2
-; CHECK-LABEL: libcall1_rintl:
-; CHECK: ld $25, %call16(rintl)
+; ALL-LABEL: libcall1_rintl:
+; ALL: ld $25, %call16(rintl)
define fp128 @libcall1_rintl() {
entry:
@@ -394,8 +398,8 @@ entry:
declare fp128 @rintl(fp128) #1
-; CHECK-LABEL: libcall_powil:
-; CHECK: ld $25, %call16(__powitf2)
+; ALL-LABEL: libcall_powil:
+; ALL: ld $25, %call16(__powitf2)
define fp128 @libcall_powil(fp128 %a, i32 %b) {
entry:
@@ -405,18 +409,18 @@ entry:
declare fp128 @llvm.powi.f128(fp128, i32) #3
-; CHECK-LABEL: libcall2_copysignl:
-; CHECK-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
-; CHECK-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63
-; CHECK-DAG: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; CHECK-DAG: ld $[[R1:[0-9]+]], 8($[[R0]])
-; CHECK-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]]
-; CHECK-DAG: ld $[[R5:[0-9]+]], %got_disp(gld0)
-; CHECK-DAG: ld $[[R6:[0-9]+]], 8($[[R5]])
-; CHECK-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1
-; CHECK-DAG: and $[[R8:[0-9]+]], $[[R6]], $[[R7]]
-; CHECK-DAG: or $4, $[[R8]], $[[R4]]
-; CHECK-DAG: ld $2, 0($[[R5]])
+; ALL-LABEL: libcall2_copysignl:
+; ALL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
+; ALL-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63
+; ALL-DAG: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL-DAG: ld $[[R1:[0-9]+]], 8($[[R0]])
+; ALL-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]]
+; ALL-DAG: ld $[[R5:[0-9]+]], %got_disp(gld0)
+; ALL-DAG: ld $[[R6:[0-9]+]], 8($[[R5]])
+; ALL-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1
+; ALL-DAG: and $[[R8:[0-9]+]], $[[R6]], $[[R7]]
+; ALL-DAG: or $4, $[[R8]], $[[R4]]
+; ALL-DAG: ld $2, 0($[[R5]])
define fp128 @libcall2_copysignl() {
entry:
@@ -428,8 +432,8 @@ entry:
declare fp128 @copysignl(fp128, fp128) #1
-; CHECK-LABEL: libcall2_powl:
-; CHECK: ld $25, %call16(powl)
+; ALL-LABEL: libcall2_powl:
+; ALL: ld $25, %call16(powl)
define fp128 @libcall2_powl() {
entry:
@@ -441,8 +445,8 @@ entry:
declare fp128 @powl(fp128, fp128) #2
-; CHECK-LABEL: libcall2_fmodl:
-; CHECK: ld $25, %call16(fmodl)
+; ALL-LABEL: libcall2_fmodl:
+; ALL: ld $25, %call16(fmodl)
define fp128 @libcall2_fmodl() {
entry:
@@ -454,8 +458,8 @@ entry:
declare fp128 @fmodl(fp128, fp128) #2
-; CHECK-LABEL: libcall3_fmal:
-; CHECK: ld $25, %call16(fmal)
+; ALL-LABEL: libcall3_fmal:
+; ALL: ld $25, %call16(fmal)
define fp128 @libcall3_fmal() {
entry:
@@ -468,8 +472,8 @@ entry:
declare fp128 @llvm.fma.f128(fp128, fp128, fp128) #4
-; CHECK-LABEL: cmp_lt:
-; CHECK: ld $25, %call16(__lttf2)
+; ALL-LABEL: cmp_lt:
+; ALL: ld $25, %call16(__lttf2)
define i32 @cmp_lt(fp128 %a, fp128 %b) {
entry:
@@ -478,8 +482,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: cmp_le:
-; CHECK: ld $25, %call16(__letf2)
+; ALL-LABEL: cmp_le:
+; ALL: ld $25, %call16(__letf2)
define i32 @cmp_le(fp128 %a, fp128 %b) {
entry:
@@ -488,8 +492,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: cmp_gt:
-; CHECK: ld $25, %call16(__gttf2)
+; ALL-LABEL: cmp_gt:
+; ALL: ld $25, %call16(__gttf2)
define i32 @cmp_gt(fp128 %a, fp128 %b) {
entry:
@@ -498,8 +502,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: cmp_ge:
-; CHECK: ld $25, %call16(__getf2)
+; ALL-LABEL: cmp_ge:
+; ALL: ld $25, %call16(__getf2)
define i32 @cmp_ge(fp128 %a, fp128 %b) {
entry:
@@ -508,8 +512,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: cmp_eq:
-; CHECK: ld $25, %call16(__eqtf2)
+; ALL-LABEL: cmp_eq:
+; ALL: ld $25, %call16(__eqtf2)
define i32 @cmp_eq(fp128 %a, fp128 %b) {
entry:
@@ -518,8 +522,8 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: cmp_ne:
-; CHECK: ld $25, %call16(__netf2)
+; ALL-LABEL: cmp_ne:
+; ALL: ld $25, %call16(__netf2)
define i32 @cmp_ne(fp128 %a, fp128 %b) {
entry:
@@ -528,10 +532,10 @@ entry:
ret i32 %conv
}
-; CHECK-LABEL: load_LD_LD:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; CHECK: ld $2, 0($[[R0]])
-; CHECK: ld $4, 8($[[R0]])
+; ALL-LABEL: load_LD_LD:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $2, 0($[[R0]])
+; ALL: ld $4, 8($[[R0]])
define fp128 @load_LD_LD() {
entry:
@@ -539,11 +543,11 @@ entry:
ret fp128 %0
}
-; CHECK-LABEL: load_LD_float:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gf1)
-; CHECK: lw $4, 0($[[R0]])
-; CHECK: ld $25, %call16(__extendsftf2)
-; CHECK: jalr $25
+; ALL-LABEL: load_LD_float:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gf1)
+; ALL: lw $4, 0($[[R0]])
+; ALL: ld $25, %call16(__extendsftf2)
+; ALL: jalr $25
define fp128 @load_LD_float() {
entry:
@@ -552,11 +556,11 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: load_LD_double:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gd1)
-; CHECK: ld $4, 0($[[R0]])
-; CHECK: ld $25, %call16(__extenddftf2)
-; CHECK: jalr $25
+; ALL-LABEL: load_LD_double:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gd1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $25, %call16(__extenddftf2)
+; ALL: jalr $25
define fp128 @load_LD_double() {
entry:
@@ -565,13 +569,13 @@ entry:
ret fp128 %conv
}
-; CHECK-LABEL: store_LD_LD:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; CHECK: ld $[[R1:[0-9]+]], 0($[[R0]])
-; CHECK: ld $[[R2:[0-9]+]], 8($[[R0]])
-; CHECK: ld $[[R3:[0-9]+]], %got_disp(gld0)
-; CHECK: sd $[[R2]], 8($[[R3]])
-; CHECK: sd $[[R1]], 0($[[R3]])
+; ALL-LABEL: store_LD_LD:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $[[R1:[0-9]+]], 0($[[R0]])
+; ALL: ld $[[R2:[0-9]+]], 8($[[R0]])
+; ALL: ld $[[R3:[0-9]+]], %got_disp(gld0)
+; ALL: sd $[[R2]], 8($[[R3]])
+; ALL: sd $[[R1]], 0($[[R3]])
define void @store_LD_LD() {
entry:
@@ -580,14 +584,14 @@ entry:
ret void
}
-; CHECK-LABEL: store_LD_float:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; CHECK: ld $4, 0($[[R0]])
-; CHECK: ld $5, 8($[[R0]])
-; CHECK: ld $25, %call16(__trunctfsf2)
-; CHECK: jalr $25
-; CHECK: ld $[[R1:[0-9]+]], %got_disp(gf1)
-; CHECK: sw $2, 0($[[R1]])
+; ALL-LABEL: store_LD_float:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $5, 8($[[R0]])
+; ALL: ld $25, %call16(__trunctfsf2)
+; ALL: jalr $25
+; ALL: ld $[[R1:[0-9]+]], %got_disp(gf1)
+; ALL: sw $2, 0($[[R1]])
define void @store_LD_float() {
entry:
@@ -597,14 +601,14 @@ entry:
ret void
}
-; CHECK-LABEL: store_LD_double:
-; CHECK: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; CHECK: ld $4, 0($[[R0]])
-; CHECK: ld $5, 8($[[R0]])
-; CHECK: ld $25, %call16(__trunctfdf2)
-; CHECK: jalr $25
-; CHECK: ld $[[R1:[0-9]+]], %got_disp(gd1)
-; CHECK: sd $2, 0($[[R1]])
+; ALL-LABEL: store_LD_double:
+; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL: ld $4, 0($[[R0]])
+; ALL: ld $5, 8($[[R0]])
+; ALL: ld $25, %call16(__trunctfdf2)
+; ALL: jalr $25
+; ALL: ld $[[R1:[0-9]+]], %got_disp(gd1)
+; ALL: sd $2, 0($[[R1]])
define void @store_LD_double() {
entry:
@@ -614,11 +618,22 @@ entry:
ret void
}
-; CHECK-LABEL: select_LD:
-; CHECK: movn $8, $6, $4
-; CHECK: movn $9, $7, $4
-; CHECK: move $2, $8
-; CHECK: move $4, $9
+; ALL-LABEL: select_LD:
+; C_CC_FMT: movn $8, $6, $4
+; C_CC_FMT: movn $9, $7, $4
+; C_CC_FMT: move $2, $8
+; C_CC_FMT: move $4, $9
+
+; FIXME: This sll works around an implementation detail in the code generator
+; (setcc's result is i32 so bits 32-63 are undefined). It's not really
+; needed.
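+; (seleqz/selnez test all 64 bits of the condition register, so the i32
+; condition is first made well-defined: on MIPS64, 'sll rd, rs, 0'
+; sign-extends the low 32 bits)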
+; CMP_CC_FMT-DAG: sll $[[CC:[0-9]+]], $4, 0
+; CMP_CC_FMT-DAG: seleqz $[[EQ1:[0-9]+]], $8, $[[CC]]
+; CMP_CC_FMT-DAG: selnez $[[NE1:[0-9]+]], $6, $[[CC]]
+; CMP_CC_FMT-DAG: or $2, $[[NE1]], $[[EQ1]]
+; CMP_CC_FMT-DAG: seleqz $[[EQ2:[0-9]+]], $9, $[[CC]]
+; CMP_CC_FMT-DAG: selnez $[[NE2:[0-9]+]], $7, $[[CC]]
+; CMP_CC_FMT-DAG: or $4, $[[NE2]], $[[EQ2]]
define fp128 @select_LD(i32 %a, i64, fp128 %b, fp128 %c) {
entry:
@@ -627,18 +642,27 @@ entry:
ret fp128 %cond
}
-; CHECK-LABEL: selectCC_LD:
-; CHECK: move $[[R0:[0-9]+]], $11
-; CHECK: move $[[R1:[0-9]+]], $10
-; CHECK: move $[[R2:[0-9]+]], $9
-; CHECK: move $[[R3:[0-9]+]], $8
-; CHECK: ld $25, %call16(__gttf2)($gp)
-; CHECK: jalr $25
-; CHECK: slti $1, $2, 1
-; CHECK: movz $[[R1]], $[[R3]], $1
-; CHECK: movz $[[R0]], $[[R2]], $1
-; CHECK: move $2, $[[R1]]
-; CHECK: move $4, $[[R0]]
+; ALL-LABEL: selectCC_LD:
+; ALL: move $[[R0:[0-9]+]], $11
+; ALL: move $[[R1:[0-9]+]], $10
+; ALL: move $[[R2:[0-9]+]], $9
+; ALL: move $[[R3:[0-9]+]], $8
+; ALL: ld $25, %call16(__gttf2)($gp)
+; ALL: jalr $25
+
+; C_CC_FMT: slti $[[CC:[0-9]+]], $2, 1
+; C_CC_FMT: movz $[[R1]], $[[R3]], $[[CC]]
+; C_CC_FMT: movz $[[R0]], $[[R2]], $[[CC]]
+; C_CC_FMT: move $2, $[[R1]]
+; C_CC_FMT: move $4, $[[R0]]
+
+; CMP_CC_FMT: slt $[[CC:[0-9]+]], $zero, $2
+; CMP_CC_FMT: seleqz $[[EQ1:[0-9]+]], $[[R1]], $[[CC]]
+; CMP_CC_FMT: selnez $[[NE1:[0-9]+]], $[[R3]], $[[CC]]
+; CMP_CC_FMT: or $2, $[[NE1]], $[[EQ1]]
+; CMP_CC_FMT: seleqz $[[EQ2:[0-9]+]], $[[R0]], $[[CC]]
+; CMP_CC_FMT: selnez $[[NE2:[0-9]+]], $[[R2]], $[[CC]]
+; CMP_CC_FMT: or $4, $[[NE2]], $[[EQ2]]
define fp128 @selectCC_LD(fp128 %a, fp128 %b, fp128 %c, fp128 %d) {
entry:
diff --git a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll b/test/CodeGen/Mips/mips64-fp-indexed-ls.ll
deleted file mode 100644
index bbdc05c..0000000
--- a/test/CodeGen/Mips/mips64-fp-indexed-ls.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 < %s | FileCheck %s
-
-%struct.S = type <{ [4 x float] }>
-%struct.S2 = type <{ [4 x double] }>
-%struct.S3 = type <{ i8, float }>
-
-@s = external global [4 x %struct.S]
-@gf = external global float
-@gd = external global double
-@s2 = external global [4 x %struct.S2]
-@s3 = external global %struct.S3
-
-define float @foo0(float* nocapture %b, i32 %o) nounwind readonly {
-entry:
-; CHECK: lwxc1
- %idxprom = zext i32 %o to i64
- %arrayidx = getelementptr inbounds float* %b, i64 %idxprom
- %0 = load float* %arrayidx, align 4
- ret float %0
-}
-
-define double @foo1(double* nocapture %b, i32 %o) nounwind readonly {
-entry:
-; CHECK: ldxc1
- %idxprom = zext i32 %o to i64
- %arrayidx = getelementptr inbounds double* %b, i64 %idxprom
- %0 = load double* %arrayidx, align 8
- ret double %0
-}
-
-define float @foo2(i32 %b, i32 %c) nounwind readonly {
-entry:
-; CHECK-NOT: luxc1
- %idxprom = zext i32 %c to i64
- %idxprom1 = zext i32 %b to i64
- %arrayidx2 = getelementptr inbounds [4 x %struct.S]* @s, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
- %0 = load float* %arrayidx2, align 1
- ret float %0
-}
-
-define void @foo3(float* nocapture %b, i32 %o) nounwind {
-entry:
-; CHECK: swxc1
- %0 = load float* @gf, align 4
- %idxprom = zext i32 %o to i64
- %arrayidx = getelementptr inbounds float* %b, i64 %idxprom
- store float %0, float* %arrayidx, align 4
- ret void
-}
-
-define void @foo4(double* nocapture %b, i32 %o) nounwind {
-entry:
-; CHECK: sdxc1
- %0 = load double* @gd, align 8
- %idxprom = zext i32 %o to i64
- %arrayidx = getelementptr inbounds double* %b, i64 %idxprom
- store double %0, double* %arrayidx, align 8
- ret void
-}
-
-define void @foo5(i32 %b, i32 %c) nounwind {
-entry:
-; CHECK-NOT: suxc1
- %0 = load float* @gf, align 4
- %idxprom = zext i32 %c to i64
- %idxprom1 = zext i32 %b to i64
- %arrayidx2 = getelementptr inbounds [4 x %struct.S]* @s, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
- store float %0, float* %arrayidx2, align 1
- ret void
-}
-
-define double @foo6(i32 %b, i32 %c) nounwind readonly {
-entry:
-; CHECK: foo6
-; CHECK-NOT: luxc1
- %idxprom = zext i32 %c to i64
- %idxprom1 = zext i32 %b to i64
- %arrayidx2 = getelementptr inbounds [4 x %struct.S2]* @s2, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
- %0 = load double* %arrayidx2, align 1
- ret double %0
-}
-
-define void @foo7(i32 %b, i32 %c) nounwind {
-entry:
-; CHECK: foo7
-; CHECK-NOT: suxc1
- %0 = load double* @gd, align 8
- %idxprom = zext i32 %c to i64
- %idxprom1 = zext i32 %b to i64
- %arrayidx2 = getelementptr inbounds [4 x %struct.S2]* @s2, i64 0, i64 %idxprom1, i32 0, i64 %idxprom
- store double %0, double* %arrayidx2, align 1
- ret void
-}
-
-define float @foo8() nounwind readonly {
-entry:
-; CHECK: foo8
-; CHECK-NOT: luxc1
- %0 = load float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1
- ret float %0
-}
-
-define void @foo9(float %f) nounwind {
-entry:
-; CHECK: foo9
-; CHECK-NOT: suxc1
- store float %f, float* getelementptr inbounds (%struct.S3* @s3, i64 0, i32 1), align 1
- ret void
-}
-
diff --git a/test/CodeGen/Mips/mips64countleading.ll b/test/CodeGen/Mips/mips64countleading.ll
deleted file mode 100644
index 252f323..0000000
--- a/test/CodeGen/Mips/mips64countleading.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s
-
-define i64 @t1(i64 %X) nounwind readnone {
-entry:
-; CHECK-LABEL: t1:
-; MIPS4-NOT: dclz
-; MIPS64: dclz
- %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
- ret i64 %tmp1
-}
-
-declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
-
-define i64 @t3(i64 %X) nounwind readnone {
-entry:
-; CHECK-LABEL: t3:
-; MIPS4-NOT: dclo
-; MIPS64: dclo
- %neg = xor i64 %X, -1
- %tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
- ret i64 %tmp1
-}
-
diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll
index 58f11f1..ed617be 100644
--- a/test/CodeGen/Mips/mips64instrs.ll
+++ b/test/CodeGen/Mips/mips64instrs.ll
@@ -1,99 +1,128 @@
-; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS4 %s
-; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=MIPS64 %s
+; RUN: llc -march=mips64el -mcpu=mips4 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MIPS4 -check-prefix=ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=ACCMULDIV %s
+; RUN: llc -march=mips64el -mcpu=mips64r6 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-DCLO -check-prefix=GPRMULDIV %s
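+
+; FileCheck prefixes (a summary of the RUN lines above):
+; ALL       - All targets
+; MIPS4     - MIPS-IV targets (no dclz/dclo)
+; HAS-DCLO  - Targets with the dclz/dclo instructions (MIPS64 and later)
+; ACCMULDIV - Targets with accumulator-based mul/div (i.e. pre-MIPS64r6)
+; GPRMULDIV - Targets with register-based mul/div (i.e. MIPS64r6)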
@gll0 = common global i64 0, align 8
@gll1 = common global i64 0, align 8
define i64 @f0(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: daddu
+; ALL-LABEL: f0:
+; ALL: daddu $2, ${{[45]}}, ${{[45]}}
%add = add nsw i64 %a1, %a0
ret i64 %add
}
define i64 @f1(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: dsubu
+; ALL-LABEL: f1:
+; ALL: dsubu $2, $4, $5
%sub = sub nsw i64 %a0, %a1
ret i64 %sub
}
define i64 @f4(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: and
+; ALL-LABEL: f4:
+; ALL: and $2, ${{[45]}}, ${{[45]}}
%and = and i64 %a1, %a0
ret i64 %and
}
define i64 @f5(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: or
+; ALL-LABEL: f5:
+; ALL: or $2, ${{[45]}}, ${{[45]}}
%or = or i64 %a1, %a0
ret i64 %or
}
define i64 @f6(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: xor
+; ALL-LABEL: f6:
+; ALL: xor $2, ${{[45]}}, ${{[45]}}
%xor = xor i64 %a1, %a0
ret i64 %xor
}
define i64 @f7(i64 %a0) nounwind readnone {
entry:
-; CHECK: daddiu ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f7:
+; ALL: daddiu $2, $4, 20
%add = add nsw i64 %a0, 20
ret i64 %add
}
define i64 @f8(i64 %a0) nounwind readnone {
entry:
-; CHECK: daddiu ${{[0-9]+}}, ${{[0-9]+}}, -20
+; ALL-LABEL: f8:
+; ALL: daddiu $2, $4, -20
%sub = add nsw i64 %a0, -20
ret i64 %sub
}
define i64 @f9(i64 %a0) nounwind readnone {
entry:
-; CHECK: andi ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f9:
+; ALL: andi $2, $4, 20
%and = and i64 %a0, 20
ret i64 %and
}
define i64 @f10(i64 %a0) nounwind readnone {
entry:
-; CHECK: ori ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f10:
+; ALL: ori $2, $4, 20
%or = or i64 %a0, 20
ret i64 %or
}
define i64 @f11(i64 %a0) nounwind readnone {
entry:
-; CHECK: xori ${{[0-9]+}}, ${{[0-9]+}}, 20
+; ALL-LABEL: f11:
+; ALL: xori $2, $4, 20
%xor = xor i64 %a0, 20
ret i64 %xor
}
define i64 @f12(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK: mult
+; ALL-LABEL: f12:
+
+; ACCMULDIV: mult ${{[45]}}, ${{[45]}}
+; GPRMULDIV: dmul $2, ${{[45]}}, ${{[45]}}
+
%mul = mul nsw i64 %b, %a
ret i64 %mul
}
define i64 @f13(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK: mult
+; ALL-LABEL: f13:
+
+; ACCMULDIV: mult ${{[45]}}, ${{[45]}}
+; GPRMULDIV: dmul $2, ${{[45]}}, ${{[45]}}
+
%mul = mul i64 %b, %a
ret i64 %mul
}
define i64 @f14(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK-LABEL: f14:
-; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mflo
+; ALL-LABEL: f14:
+; ALL-DAG: ld $[[P0:[0-9]+]], %got_disp(gll0)(
+; ALL-DAG: ld $[[P1:[0-9]+]], %got_disp(gll1)(
+; ALL-DAG: ld $[[T0:[0-9]+]], 0($[[P0]])
+; ALL-DAG: ld $[[T1:[0-9]+]], 0($[[P1]])
+
+; ACCMULDIV: ddiv $zero, $[[T0]], $[[T1]]
+; ACCMULDIV: teq $[[T1]], $zero, 7
+; ACCMULDIV: mflo $2
+
+; GPRMULDIV: ddiv $2, $[[T0]], $[[T1]]
+; GPRMULDIV: teq $[[T1]], $zero, 7
+
%0 = load i64* @gll0, align 8
%1 = load i64* @gll1, align 8
%div = sdiv i64 %0, %1
@@ -102,10 +131,19 @@ entry:
define i64 @f15() nounwind readnone {
entry:
-; CHECK-LABEL: f15:
-; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mflo
+; ALL-LABEL: f15:
+; ALL-DAG: ld $[[P0:[0-9]+]], %got_disp(gll0)(
+; ALL-DAG: ld $[[P1:[0-9]+]], %got_disp(gll1)(
+; ALL-DAG: ld $[[T0:[0-9]+]], 0($[[P0]])
+; ALL-DAG: ld $[[T1:[0-9]+]], 0($[[P1]])
+
+; ACCMULDIV: ddivu $zero, $[[T0]], $[[T1]]
+; ACCMULDIV: teq $[[T1]], $zero, 7
+; ACCMULDIV: mflo $2
+
+; GPRMULDIV: ddivu $2, $[[T0]], $[[T1]]
+; GPRMULDIV: teq $[[T1]], $zero, 7
+
%0 = load i64* @gll0, align 8
%1 = load i64* @gll1, align 8
%div = udiv i64 %0, %1
@@ -114,20 +152,30 @@ entry:
define i64 @f16(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK-LABEL: f16:
-; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mfhi
+; ALL-LABEL: f16:
+
+; ACCMULDIV: ddiv $zero, $4, $5
+; ACCMULDIV: teq $5, $zero, 7
+; ACCMULDIV: mfhi $2
+
+; GPRMULDIV: dmod $2, $4, $5
+; GPRMULDIV: teq $5, $zero, 7
+
%rem = srem i64 %a, %b
ret i64 %rem
}
define i64 @f17(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK-LABEL: f17:
-; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
-; CHECK: teq $[[R0]], $zero, 7
-; CHECK: mfhi
+; ALL-LABEL: f17:
+
+; ACCMULDIV: ddivu $zero, $4, $5
+; ACCMULDIV: teq $5, $zero, 7
+; ACCMULDIV: mfhi $2
+
+; GPRMULDIV: dmodu $2, $4, $5
+; GPRMULDIV: teq $5, $zero, 7
+
%rem = urem i64 %a, %b
ret i64 %rem
}
@@ -136,24 +184,26 @@ declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
define i64 @f18(i64 %X) nounwind readnone {
entry:
-; CHECK-LABEL: f18:
+; ALL-LABEL: f18:
; The MIPS4 version is too long to reasonably test. At least check that we don't get dclz.
-; MIPS4-NOT: dclz
+; MIPS4-NOT: dclz
+
+; HAS-DCLO: dclz $2, $4
-; MIPS64: dclz $2, $4
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %X, i1 true)
ret i64 %tmp1
}
define i64 @f19(i64 %X) nounwind readnone {
entry:
-; CHECK-LABEL: f19:
+; ALL-LABEL: f19:
; The MIPS4 version is too long to reasonably test. At least check that we don't get dclo.
-; MIPS4-NOT: dclo
+; MIPS4-NOT: dclo
+
+; HAS-DCLO: dclo $2, $4
-; MIPS64: dclo $2, $4
%neg = xor i64 %X, -1
%tmp1 = tail call i64 @llvm.ctlz.i64(i64 %neg, i1 true)
ret i64 %tmp1
@@ -161,8 +211,8 @@ entry:
define i64 @f20(i64 %a, i64 %b) nounwind readnone {
entry:
-; CHECK-LABEL: f20:
-; CHECK: nor
+; ALL-LABEL: f20:
+; ALL: nor $2, ${{[45]}}, ${{[45]}}
%or = or i64 %b, %a
%neg = xor i64 %or, -1
ret i64 %neg
diff --git a/test/CodeGen/Mips/mips64muldiv.ll b/test/CodeGen/Mips/mips64muldiv.ll
index 39c73e9..32d05a9 100644
--- a/test/CodeGen/Mips/mips64muldiv.ll
+++ b/test/CodeGen/Mips/mips64muldiv.ll
@@ -1,50 +1,79 @@
-; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ACC
+; RUN: llc -march=mips64el -mcpu=mips64r6 < %s | FileCheck %s -check-prefix=ALL -check-prefix=GPR
+
+; FileCheck prefixes:
+; ALL - All targets
+; ACC - Targets with accumulator-based mul/div (i.e. pre-MIPS64r6)
+; GPR - Targets with register-based mul/div (i.e. MIPS64r6)
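+;
+; A rough illustration of the two styles (operands are examples only):
+;   ACC: dmult $4, $5      ; 128-bit product goes to the HI/LO accumulator
+;        mflo  $2          ; low 64 bits are then moved into the result GPR
+;   GPR: dmul  $2, $4, $5  ; MIPS64r6 writes the low 64 bits straight to a GPR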
define i64 @m0(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: dmult
-; CHECK: mflo
+; ALL-LABEL: m0:
+; ACC: dmult ${{[45]}}, ${{[45]}}
+; ACC: mflo $2
+; GPR: dmul $2, ${{[45]}}, ${{[45]}}
%mul = mul i64 %a1, %a0
ret i64 %mul
}
define i64 @m1(i64 %a) nounwind readnone {
entry:
-; CHECK: dmult
-; CHECK: mfhi
+; ALL-LABEL: m1:
+; ALL: lui $[[T0:[0-9]+]], 21845
+; ALL: addiu $[[T0]], $[[T0]], 21845
+; ALL: dsll $[[T0]], $[[T0]], 16
+; ALL: addiu $[[T0]], $[[T0]], 21845
+; ALL: dsll $[[T0]], $[[T0]], 16
+; ALL: addiu $[[T0]], $[[T0]], 21846
+
+; ACC: dmult $4, $[[T0]]
+; ACC: mfhi $[[T1:[0-9]+]]
+; GPR: dmuh $[[T1:[0-9]+]], $4, $[[T0]]
+
+; ALL: dsrl $2, $[[T1]], 63
+; ALL: daddu $2, $[[T1]], $2
%div = sdiv i64 %a, 3
ret i64 %div
}
define i64 @d0(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: ddivu
-; CHECK: mflo
+; ALL-LABEL: d0:
+; ACC: ddivu $zero, $4, $5
+; ACC: mflo $2
+; GPR: ddivu $2, $4, $5
%div = udiv i64 %a0, %a1
ret i64 %div
}
define i64 @d1(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: ddiv
-; CHECK: mflo
+; ALL-LABEL: d1:
+; ACC: ddiv $zero, $4, $5
+; ACC: mflo $2
+; GPR: ddiv $2, $4, $5
%div = sdiv i64 %a0, %a1
ret i64 %div
}
define i64 @d2(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: ddivu
-; CHECK: mfhi
+; ALL-LABEL: d2:
+; ACC: ddivu $zero, $4, $5
+; ACC: mfhi $2
+; GPR: dmodu $2, $4, $5
%rem = urem i64 %a0, %a1
ret i64 %rem
}
define i64 @d3(i64 %a0, i64 %a1) nounwind readnone {
entry:
-; CHECK: ddiv
-; CHECK: mfhi
+; ALL-LABEL: d3:
+; ACC: ddiv $zero, $4, $5
+; ACC: mfhi $2
+; GPR: dmod $2, $4, $5
%rem = srem i64 %a0, %a1
ret i64 %rem
}
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index f4854f8..244b03d 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -1,33 +1,113 @@
-; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r2 \
-; RUN: < %s | FileCheck %s -check-prefix=LE-PIC
-; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=LE-STATIC
-; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=BE-PIC
+; Check that [sl]dc1 are normally emitted. MIPS32r2 should have [sl]dxc1 too.
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1-LDC1
; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
-; RUN: FileCheck %s -check-prefix=CHECK-LDC1-SDC1
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2-LDXC1
+; RUN: llc -march=mipsel -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6-LDC1
+
+; Check that -mno-ldc1-sdc1 disables [sl]dc1
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-PIC
+
+; Check again for big-endian
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-BE -check-prefix=32R1-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-BE -check-prefix=32R2-BE-PIC
+; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-BE -check-prefix=32R6-BE-PIC
+
+; Check again for the static relocation model
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R1 \
+; RUN: -check-prefix=32R1-LE -check-prefix=32R1-LE-STATIC
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R2 \
+; RUN: -check-prefix=32R2-LE -check-prefix=32R2-LE-STATIC
+; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 \
+; RUN: -mcpu=mips32r6 < %s | \
+; RUN: FileCheck %s -check-prefix=ALL -check-prefix=32R6 \
+; RUN: -check-prefix=32R6-LE -check-prefix=32R6-LE-STATIC
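+
+; A rough sketch of the two lowerings being checked (registers illustrative):
+;   with ldc1 available:   ldc1 $f0, 0($a0)   ; one 64-bit FPU load
+;   with -mno-ldc1-sdc1:   lw   $t0, 0($a0)   ; load the two 32-bit halves...
+;                          lw   $t1, 4($a0)
+;                          mtc1 $t0, $f0      ; ...pre-r2 fills the even/odd
+;                          mtc1 $t1, $f1      ; pair; r2+ uses mtc1 then mthc1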
@g0 = common global double 0.000000e+00, align 8
-; LE-PIC-LABEL: test_ldc1:
-; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; LE-PIC-DAG: mtc1 $[[R0]], $f0
-; LE-PIC-DAG: mtc1 $[[R1]], $f1
-; LE-STATIC-LABEL: test_ldc1:
-; LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
-; LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
-; LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
-; LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
-; LE-STATIC-DAG: mtc1 $[[R1]], $f0
-; LE-STATIC-DAG: mtc1 $[[R3]], $f1
-; BE-PIC-LABEL: test_ldc1:
-; BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; BE-PIC-DAG: mtc1 $[[R1]], $f0
-; BE-PIC-DAG: mtc1 $[[R0]], $f1
-; CHECK-LDC1-SDC1-LABEL: test_ldc1:
-; CHECK-LDC1-SDC1: ldc1 $f{{[0-9]+}}
+; ALL-LABEL: test_ldc1:
+
+; 32R1-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R1-LE-PIC-DAG: mtc1 $[[R1]], $f1
+
+; 32R2-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R2-LE-PIC-DAG: mthc1 $[[R1]], $f0
+
+; 32R6-LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: mtc1 $[[R0]], $f0
+; 32R6-LE-PIC-DAG: mthc1 $[[R1]], $f0
+
+; 32R1-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R1-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R1-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R1-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R1-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R1-LE-STATIC-DAG: mtc1 $[[R3]], $f1
+
+; 32R2-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R2-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R2-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R2-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R2-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R2-LE-STATIC-DAG: mthc1 $[[R3]], $f0
+
+; 32R6-LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; 32R6-LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; 32R6-LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; 32R6-LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; 32R6-LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; 32R6-LE-STATIC-DAG: mthc1 $[[R3]], $f0
+
+; 32R1-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R1-BE-PIC-DAG: mtc1 $[[R0]], $f1
+
+; 32R2-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R2-BE-PIC-DAG: mthc1 $[[R0]], $f0
+
+; 32R6-BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: mtc1 $[[R1]], $f0
+; 32R6-BE-PIC-DAG: mthc1 $[[R0]], $f0
+
+; 32R1-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})
define double @test_ldc1() {
entry:
@@ -35,25 +115,64 @@ entry:
ret double %0
}
-; LE-PIC-LABEL: test_sdc1:
-; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
-; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
-; LE-STATIC-LABEL: test_sdc1:
-; LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
-; LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
-; LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
-; LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
-; BE-PIC-LABEL: test_sdc1:
-; BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
-; BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
-; CHECK-LDC1-SDC1-LABEL: test_sdc1:
-; CHECK-LDC1-SDC1: sdc1 $f{{[0-9]+}}
+; ALL-LABEL: test_sdc1:
+
+; 32R1-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R1-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R2-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R2-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R6-LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-LE-PIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R6-LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R1-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R1-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R1-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R1-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R2-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R2-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R2-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R2-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R6-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-LE-STATIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; 32R6-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; 32R6-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; 32R6-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
+
+; 32R1-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R1-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R2-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R2-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R6-BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-BE-PIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; 32R6-BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
+
+; 32R1-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
define void @test_sdc1(double %a) {
entry:
@@ -61,14 +180,35 @@ entry:
ret void
}
+; ALL-LABEL: test_ldxc1:
+
+; 32R1-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R1-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R1-DAG: mtc1 $[[R0]], $f0
+; 32R1-DAG: mtc1 $[[R1]], $f1
+
+; 32R2-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R2-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R2-DAG: mtc1 $[[R0]], $f0
+; 32R2-DAG: mthc1 $[[R1]], $f0
-; LE-PIC-LABEL: test_ldxc1:
-; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
-; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
-; LE-PIC-DAG: mtc1 $[[R0]], $f0
-; LE-PIC-DAG: mtc1 $[[R1]], $f1
-; CHECK-LDC1-SDC1-LABEL: test_ldxc1:
-; CHECK-LDC1-SDC1: ldxc1 $f{{[0-9]+}}
+; 32R6-LE-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-LE-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-DAG: lw $[[R0:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-BE-DAG: lw $[[R1:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-DAG: mtc1 $[[R0]], $f0
+; 32R6-DAG: mthc1 $[[R1]], $f0
+
+; 32R1-LDC1: ldc1 $f0, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sll $[[OFFSET:[0-9]+]], $5, 3
+; 32R2-LDXC1: ldxc1 $f0, $[[OFFSET]]($4)
+
+; 32R6-LDC1: ldc1 $f0, 0(${{[0-9]+}})
define double @test_ldxc1(double* nocapture readonly %a, i32 %i) {
entry:
@@ -77,13 +217,29 @@ entry:
ret double %0
}
-; LE-PIC-LABEL: test_sdxc1:
-; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
-; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
-; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
-; CHECK-LDC1-SDC1-LABEL: test_sdxc1:
-; CHECK-LDC1-SDC1: sdxc1 $f{{[0-9]+}}
+; ALL-LABEL: test_sdxc1:
+
+; 32R1-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R1-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R1-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R1-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R2-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R2-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R2-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R6-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; 32R6-DAG: mfhc1 $[[R1:[0-9]+]], $f12
+; 32R6-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; 32R6-DAG: sw $[[R1]], 4(${{[0-9]+}})
+
+; 32R1-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
+
+; 32R2-LDXC1: sll $[[OFFSET:[0-9]+]], $7, 3
+; 32R2-LDXC1: sdxc1 $f{{[0-9]+}}, $[[OFFSET]]($6)
+
+; 32R6-LDC1: sdc1 $f{{[0-9]+}}, 0(${{[0-9]+}})
define void @test_sdxc1(double %b, double* nocapture %a, i32 %i) {
entry:
diff --git a/test/CodeGen/Mips/msa/special.ll b/test/CodeGen/Mips/msa/special.ll
index f65a14f..b9badf5 100644
--- a/test/CodeGen/Mips/msa/special.ll
+++ b/test/CodeGen/Mips/msa/special.ll
@@ -4,6 +4,10 @@
; RUN: FileCheck %s --check-prefix=MIPS32
; RUN: llc -march=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64 < %s | \
; RUN: FileCheck %s --check-prefix=MIPS64
+; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+msa < %s | \
+; RUN: FileCheck %s --check-prefix=MIPS32
+; RUN: llc -march=mips64 -mcpu=mips64r6 -mattr=+msa < %s | \
+; RUN: FileCheck %s --check-prefix=MIPS64
define i32 @llvm_mips_lsa_test(i32 %a, i32 %b) nounwind {
entry:
diff --git a/test/CodeGen/Mips/no-odd-spreg.ll b/test/CodeGen/Mips/no-odd-spreg.ll
new file mode 100644
index 0000000..b42ed6a
--- /dev/null
+++ b/test/CodeGen/Mips/no-odd-spreg.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
+
+; ODDSPREG: .module oddspreg
+; NOODDSPREG: .module nooddspreg
+
+define float @two_floats(float %a) {
+entry:
+ ; Clobber all except $f12 and $f13
+ ;
+	; The intention is that if odd single-precision registers are permitted, the
+	; allocator will choose $f12 and $f13 to avoid the spill/reload.
+	;
+	; On the other hand, if odd single-precision registers are not permitted, it
+	; will be forced to spill/reload either %a or %0.
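+	;
+	; A sketch of the expected allocation difference (exact registers may vary):
+	;   oddspreg:   %0 can live in the odd register $f13 across the clobber,
+	;               so no spill code is needed
+	;   nooddspreg: %0 must use an even register that the asm clobbers, forcing
+	;               a swc1/lwc1 spill/reload pair around it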
+
+ %0 = fadd float %a, 1.0
+ call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+ %1 = fadd float %a, %0
+ ret float %1
+}
+
+; ALL-LABEL: two_floats:
+; ODDSPREG: add.s $f13, $f12, ${{f[0-9]+}}
+; ODDSPREG-NOT: swc1
+; ODDSPREG-NOT: lwc1
+; ODDSPREG: add.s $f0, $f12, $f13
+
+; NOODDSPREG: add.s $[[T0:f[0-9]*[02468]]], $f12, ${{f[0-9]+}}
+; NOODDSPREG: swc1 $[[T0]],
+; NOODDSPREG: lwc1 $[[T1:f[0-9]*[02468]]],
+; NOODDSPREG: add.s $f0, $f12, $[[T1]]
+
+define double @two_doubles(double %a) {
+entry:
+ ; Clobber all except $f12 and $f13
+ ;
+	; -mno-odd-sp-reg doesn't need to affect double-precision values, so both
+	; cases use $f12 and $f13.
+
+ %0 = fadd double %a, 1.0
+ call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"()
+ %1 = fadd double %a, %0
+ ret double %1
+}
+
+; ALL-LABEL: two_doubles:
+; ALL: add.d $[[T0:f[0-9]+]], $f12, ${{f[0-9]+}}
+; ALL: add.d $f0, $f12, $[[T0]]
+
+
+; INVALID: -mattr=+nooddspreg is not currently permitted for a 32-bit FPU register file (FR=0 mode).
diff --git a/test/CodeGen/Mips/null-streamer.ll b/test/CodeGen/Mips/null-streamer.ll
new file mode 100644
index 0000000..56cebbf
--- /dev/null
+++ b/test/CodeGen/Mips/null-streamer.ll
@@ -0,0 +1,7 @@
+; Test the null streamer with a target streamer.
+; RUN: llc -O0 -filetype=null -mtriple=mips-linux < %s
+
+define i32 @main() {
+entry:
+ ret i32 0
+}
diff --git a/test/CodeGen/Mips/prevent-hoisting.ll b/test/CodeGen/Mips/prevent-hoisting.ll
new file mode 100644
index 0000000..da665c2
--- /dev/null
+++ b/test/CodeGen/Mips/prevent-hoisting.ll
@@ -0,0 +1,144 @@
+; RUN: llc -march=mipsel -O3 < %s | FileCheck %s
+
+
+; MIPS direct branches implicitly define register $at. This test makes sure
+; that the code hoisting optimization (which moves identical instructions from
+; the start of two basic blocks into their common predecessor block) takes this
+; into account and doesn't move the definition of $at into the predecessor
+; block (which would make $at live-in at the start of the successor block).
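+;
+; A minimal sketch of the hazard (registers and labels are illustrative only):
+;
+;   pred:
+;     sll $1, $8, 4    ; if hoisted here, $1 ($at) must stay live across...
+;     b   succ         ; ...this branch, which implicitly defines $at
+;   succ:
+;     (use of $1)      ; $at would be live-in here, which is not allowed
+;
+; Keeping the sll at the start of each successor block avoids the problem.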
+
+
+; CHECK-LABEL: readLumaCoeff8x8_CABAC
+
+; The check for the "addiu" instruction is added so that we can match the correct "b" instruction.
+; CHECK: addiu ${{[0-9]+}}, $zero, -1
+; CHECK: b $[[BB0:BB[0-9_]+]]
+
+; Check that the sll instruction that writes to $1 starts a basic block.
+; CHECK: {{BB[0-9_#]+}}:
+; CHECK-NEXT: sll $1, $[[R0:[0-9]+]], 4
+
+; Check that an identical sll instruction starts another basic block.
+; CHECK: [[BB0]]:
+; CHECK-NEXT: sll $1, $[[R0]], 4
+
+
+%struct.img_par = type { i32, i32, i32, i32, i32*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [16 x [16 x i16]], [6 x [32 x i32]], [16 x [16 x i32]], [4 x [12 x [4 x [4 x i32]]]], [16 x i32], i8**, i32*, i32***, i32**, i32, i32, i32, i32, %struct.Slice*, %struct.macroblock*, i32, i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32***, i32***, i32****, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [3 x [2 x i32]], i32, i32, i32, i32, %struct.timeb, %struct.timeb, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+%struct.Slice = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.datapartition*, %struct.MotionInfoContexts*, %struct.TextureInfoContexts*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (%struct.img_par*, %struct.inp_par*)*, i32, i32, i32, i32 }
+%struct.datapartition = type { %struct.Bitstream*, %struct.DecodingEnvironment, i32 (%struct.syntaxelement*, %struct.img_par*, %struct.datapartition*)* }
+%struct.Bitstream = type { i32, i32, i32, i32, i8*, i32 }
+%struct.DecodingEnvironment = type { i32, i32, i32, i32, i32, i8*, i32* }
+%struct.syntaxelement = type { i32, i32, i32, i32, i32, i32, i32, i32, void (i32, i32, i32*, i32*)*, void (%struct.syntaxelement*, %struct.img_par*, %struct.DecodingEnvironment*)* }
+%struct.MotionInfoContexts = type { [4 x [11 x %struct.BiContextType]], [2 x [9 x %struct.BiContextType]], [2 x [10 x %struct.BiContextType]], [2 x [6 x %struct.BiContextType]], [4 x %struct.BiContextType], [4 x %struct.BiContextType], [3 x %struct.BiContextType] }
+%struct.BiContextType = type { i16, i8 }
+%struct.TextureInfoContexts = type { [2 x %struct.BiContextType], [4 x %struct.BiContextType], [3 x [4 x %struct.BiContextType]], [10 x [4 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [5 x %struct.BiContextType]], [10 x [5 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]], [10 x [15 x %struct.BiContextType]] }
+%struct.inp_par = type { [1000 x i8], [1000 x i8], [1000 x i8], i32, i32, i32, i32, i32, i32, i32, i32 }
+%struct.macroblock = type { i32, [2 x i32], i32, i32, %struct.macroblock*, %struct.macroblock*, i32, [2 x [4 x [4 x [2 x i32]]]], i32, i64, i64, i32, i32, [4 x i8], [4 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+%struct.DecRefPicMarking_s = type { i32, i32, i32, i32, i32, %struct.DecRefPicMarking_s* }
+%struct.timeb = type { i32, i16, i16, i16 }
+
+@assignSE2partition = external global [0 x [20 x i32]]
+@FIELD_SCAN8x8 = external constant [64 x [2 x i8]]
+
+
+define void @readLumaCoeff8x8_CABAC(%struct.img_par* %img, i32 %b8) {
+
+ %1 = load i32* undef, align 4
+ br i1 false, label %2, label %3
+
+; <label>:2 ; preds = %0
+ br label %3
+
+; <label>:3 ; preds = %2, %0
+ br i1 undef, label %switch.lookup, label %4
+
+switch.lookup: ; preds = %3
+ br label %4
+
+; <label>:4 ; preds = %switch.lookup, %3
+ br i1 undef, label %5, label %6
+
+; <label>:5 ; preds = %4
+ br label %6
+
+; <label>:6 ; preds = %5, %4
+ %7 = phi [2 x i8]* [ getelementptr inbounds ([64 x [2 x i8]]* @FIELD_SCAN8x8, i32 0, i32 0), %4 ], [ null, %5 ]
+ br i1 undef, label %switch.lookup6, label %8
+
+switch.lookup6: ; preds = %6
+ br label %8
+
+; <label>:8 ; preds = %switch.lookup6, %6
+ br i1 undef, label %.loopexit, label %9
+
+; <label>:9 ; preds = %8
+ %10 = and i32 %b8, 1
+ %11 = shl nuw nsw i32 %10, 3
+ %12 = getelementptr inbounds %struct.Slice* null, i32 0, i32 9
+ br i1 undef, label %.preheader, label %.preheader11
+
+.preheader11: ; preds = %21, %9
+ %k.014 = phi i32 [ %27, %21 ], [ 0, %9 ]
+ %coef_ctr.013 = phi i32 [ %23, %21 ], [ -1, %9 ]
+ br i1 false, label %13, label %14
+
+; <label>:13 ; preds = %.preheader11
+ br label %15
+
+; <label>:14 ; preds = %.preheader11
+ br label %15
+
+; <label>:15 ; preds = %14, %13
+ %16 = getelementptr inbounds [0 x [20 x i32]]* @assignSE2partition, i32 0, i32 %1, i32 undef
+ %17 = load i32* %16, align 4
+ %18 = getelementptr inbounds %struct.datapartition* null, i32 %17, i32 2
+ %19 = load i32 (%struct.syntaxelement*, %struct.img_par*, %struct.datapartition*)** %18, align 4
+ %20 = call i32 %19(%struct.syntaxelement* undef, %struct.img_par* %img, %struct.datapartition* undef)
+ br i1 false, label %.loopexit, label %21
+
+; <label>:21 ; preds = %15
+ %22 = add i32 %coef_ctr.013, 1
+ %23 = add i32 %22, 0
+ %24 = getelementptr inbounds [2 x i8]* %7, i32 %23, i32 0
+ %25 = add nsw i32 0, %11
+ %26 = getelementptr inbounds %struct.img_par* %img, i32 0, i32 27, i32 undef, i32 %25
+ store i32 0, i32* %26, align 4
+ %27 = add nsw i32 %k.014, 1
+ %28 = icmp slt i32 %27, 65
+ br i1 %28, label %.preheader11, label %.loopexit
+
+.preheader: ; preds = %36, %9
+ %k.110 = phi i32 [ %45, %36 ], [ 0, %9 ]
+ %coef_ctr.29 = phi i32 [ %39, %36 ], [ -1, %9 ]
+ br i1 false, label %29, label %30
+
+; <label>:29 ; preds = %.preheader
+ br label %31
+
+; <label>:30 ; preds = %.preheader
+ br label %31
+
+; <label>:31 ; preds = %30, %29
+ %32 = getelementptr inbounds [0 x [20 x i32]]* @assignSE2partition, i32 0, i32 %1, i32 undef
+ %33 = load i32* %32, align 4
+ %34 = getelementptr inbounds %struct.datapartition* null, i32 %33
+ %35 = call i32 undef(%struct.syntaxelement* undef, %struct.img_par* %img, %struct.datapartition* %34)
+ br i1 false, label %.loopexit, label %36
+
+; <label>:36 ; preds = %31
+ %37 = load i32* undef, align 4
+ %38 = add i32 %coef_ctr.29, 1
+ %39 = add i32 %38, %37
+ %40 = getelementptr inbounds [2 x i8]* %7, i32 %39, i32 0
+ %41 = load i8* %40, align 1
+ %42 = zext i8 %41 to i32
+ %43 = add nsw i32 %42, %11
+ %44 = getelementptr inbounds %struct.img_par* %img, i32 0, i32 27, i32 undef, i32 %43
+ store i32 0, i32* %44, align 4
+ %45 = add nsw i32 %k.110, 1
+ %46 = icmp slt i32 %45, 65
+ br i1 %46, label %.preheader, label %.loopexit
+
+.loopexit: ; preds = %36, %31, %21, %15, %8
+ ret void
+}
diff --git a/test/CodeGen/Mips/select.ll b/test/CodeGen/Mips/select.ll
index 06e2a86..eb2198b 100644
--- a/test/CodeGen/Mips/select.ll
+++ b/test/CodeGen/Mips/select.ll
@@ -1,135 +1,705 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32R2
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6
+; RUN: llc < %s -march=mips64el -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64
+; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64R2
+; RUN: llc < %s -march=mips64el -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64R6
@d2 = external global double
@d3 = external global double
-define i32 @sel1(i32 %s, i32 %f0, i32 %f1) nounwind readnone {
+define i32 @i32_icmp_ne_i32_val(i32 %s, i32 %f0, i32 %f1) nounwind readnone {
entry:
-; CHECK: movn
+; ALL-LABEL: i32_icmp_ne_i32_val:
+
+; 32: movn $5, $6, $4
+; 32: move $2, $5
+
+; 32R2: movn $5, $6, $4
+; 32R2: move $2, $5
+
+; 32R6-DAG: seleqz $[[T0:[0-9]+]], $5, $4
+; 32R6-DAG: selnez $[[T1:[0-9]+]], $6, $4
+; 32R6: or $2, $[[T1]], $[[T0]]
+
+; 64: movn $5, $6, $4
+; 64: move $2, $5
+
+; 64R2: movn $5, $6, $4
+; 64R2: move $2, $5
+
+; 64R6-DAG: seleqz $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG: selnez $[[T1:[0-9]+]], $6, $4
+; 64R6: or $2, $[[T1]], $[[T0]]
+
%tobool = icmp ne i32 %s, 0
%cond = select i1 %tobool, i32 %f1, i32 %f0
ret i32 %cond
}
-define float @sel2(i32 %s, float %f0, float %f1) nounwind readnone {
+define i64 @i32_icmp_ne_i64_val(i32 %s, i64 %f0, i64 %f1) nounwind readnone {
+entry:
+; ALL-LABEL: i32_icmp_ne_i64_val:
+
+; 32-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32-DAG: movn $6, $[[F1]], $4
+; 32-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32: movn $7, $[[F1H]], $4
+; 32: move $2, $6
+; 32: move $3, $7
+
+; 32R2-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32R2-DAG: movn $6, $[[F1]], $4
+; 32R2-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32R2: movn $7, $[[F1H]], $4
+; 32R2: move $2, $6
+; 32R2: move $3, $7
+
+; 32R6-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32R6-DAG: seleqz $[[T0:[0-9]+]], $6, $4
+; 32R6-DAG: selnez $[[T1:[0-9]+]], $[[F1]], $4
+; 32R6: or $2, $[[T1]], $[[T0]]
+; 32R6-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32R6-DAG: seleqz $[[T0:[0-9]+]], $7, $4
+; 32R6-DAG: selnez $[[T1:[0-9]+]], $[[F1H]], $4
+; 32R6: or $3, $[[T1]], $[[T0]]
+
+; 64: movn $5, $6, $4
+; 64: move $2, $5
+
+; 64R2: movn $5, $6, $4
+; 64R2: move $2, $5
+
+; FIXME: This sll works around an implementation detail in the code generator
+; (setcc's result is i32 so bits 32-63 are undefined). It's not really
+; needed.
+; 64R6-DAG: sll $[[CC:[0-9]+]], $4, 0
+; 64R6-DAG: seleqz $[[T0:[0-9]+]], $5, $[[CC]]
+; 64R6-DAG: selnez $[[T1:[0-9]+]], $6, $[[CC]]
+; 64R6: or $2, $[[T1]], $[[T0]]
+
+ %tobool = icmp ne i32 %s, 0
+ %cond = select i1 %tobool, i64 %f1, i64 %f0
+ ret i64 %cond
+}
+
+define i64 @i64_icmp_ne_i64_val(i64 %s, i64 %f0, i64 %f1) nounwind readnone {
entry:
-; CHECK: movn.s
+; ALL-LABEL: i64_icmp_ne_i64_val:
+
+; 32-DAG: or $[[CC:[0-9]+]], $4
+; 32-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32-DAG: movn $6, $[[F1]], $[[CC]]
+; 32-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32: movn $7, $[[F1H]], $[[CC]]
+; 32: move $2, $6
+; 32: move $3, $7
+
+; 32R2-DAG: or $[[CC:[0-9]+]], $4
+; 32R2-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32R2-DAG: movn $6, $[[F1]], $[[CC]]
+; 32R2-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32R2: movn $7, $[[F1H]], $[[CC]]
+; 32R2: move $2, $6
+; 32R2: move $3, $7
+
+; 32R6-DAG: lw $[[F1:[0-9]+]], 16($sp)
+; 32R6-DAG: or $[[T2:[0-9]+]], $4, $5
+; 32R6-DAG: seleqz $[[T0:[0-9]+]], $6, $[[T2]]
+; 32R6-DAG: selnez $[[T1:[0-9]+]], $[[F1]], $[[T2]]
+; 32R6: or $2, $[[T1]], $[[T0]]
+; 32R6-DAG: lw $[[F1H:[0-9]+]], 20($sp)
+; 32R6-DAG: seleqz $[[T0:[0-9]+]], $7, $[[T2]]
+; 32R6-DAG: selnez $[[T1:[0-9]+]], $[[F1H]], $[[T2]]
+; 32R6: or $3, $[[T1]], $[[T0]]
+
+; 64: movn $5, $6, $4
+; 64: move $2, $5
+
+; 64R2: movn $5, $6, $4
+; 64R2: move $2, $5
+
+; 64R6-DAG: seleqz $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG: selnez $[[T1:[0-9]+]], $6, $4
+; 64R6: or $2, $[[T1]], $[[T0]]
+
+ %tobool = icmp ne i64 %s, 0
+ %cond = select i1 %tobool, i64 %f1, i64 %f0
+ ret i64 %cond
+}
+
+define float @i32_icmp_ne_f32_val(i32 %s, float %f0, float %f1) nounwind readnone {
+entry:
+; ALL-LABEL: i32_icmp_ne_f32_val:
+
+; 32-DAG: mtc1 $5, $[[F0:f[0-9]+]]
+; 32-DAG: mtc1 $6, $[[F1:f0]]
+; 32: movn.s $[[F1]], $[[F0]], $4
+
+; 32R2-DAG: mtc1 $5, $[[F0:f[0-9]+]]
+; 32R2-DAG: mtc1 $6, $[[F1:f0]]
+; 32R2: movn.s $[[F1]], $[[F0]], $4
+
+; 32R6-DAG: mtc1 $5, $[[F0:f[0-9]+]]
+; 32R6-DAG: mtc1 $6, $[[F1:f[0-9]+]]
+; 32R6: sltu $[[T0:[0-9]+]], $zero, $4
+; 32R6: mtc1 $[[T0]], $[[CC:f0]]
+; 32R6: sel.s $[[CC]], $[[F1]], $[[F0]]
+
+; 64: movn.s $f14, $f13, $4
+; 64: mov.s $f0, $f14
+
+; 64R2: movn.s $f14, $f13, $4
+; 64R2: mov.s $f0, $f14
+
+; 64R6: sltu $[[T0:[0-9]+]], $zero, $4
+; 64R6: mtc1 $[[T0]], $[[CC:f0]]
+; 64R6: sel.s $[[CC]], $f14, $f13
+
%tobool = icmp ne i32 %s, 0
%cond = select i1 %tobool, float %f0, float %f1
ret float %cond
}
-define double @sel2_1(i32 %s, double %f0, double %f1) nounwind readnone {
+define double @i32_icmp_ne_f64_val(i32 %s, double %f0, double %f1) nounwind readnone {
entry:
-; CHECK: movn.d
+; ALL-LABEL: i32_icmp_ne_f64_val:
+
+; 32-DAG: mtc1 $6, $[[F0:f[1-3]*[02468]+]]
+; 32-DAG: mtc1 $7, $[[F0H:f[1-3]*[13579]+]]
+; 32-DAG: ldc1 $[[F1:f0]], 16($sp)
+; 32: movn.d $[[F1]], $[[F0]], $4
+
+; 32R2-DAG: mtc1 $6, $[[F0:f[0-9]+]]
+; 32R2-DAG: mthc1 $7, $[[F0]]
+; 32R2-DAG: ldc1 $[[F1:f0]], 16($sp)
+; 32R2: movn.d $[[F1]], $[[F0]], $4
+
+; 32R6-DAG: mtc1 $6, $[[F0:f[0-9]+]]
+; 32R6-DAG: mthc1 $7, $[[F0]]
+; 32R6-DAG: sltu $[[T0:[0-9]+]], $zero, $4
+; 32R6-DAG: mtc1 $[[T0]], $[[CC:f0]]
+; 32R6-DAG: ldc1 $[[F1:f[0-9]+]], 16($sp)
+; 32R6: sel.d $[[CC]], $[[F1]], $[[F0]]
+
+; 64: movn.d $f14, $f13, $4
+; 64: mov.d $f0, $f14
+
+; 64R2: movn.d $f14, $f13, $4
+; 64R2: mov.d $f0, $f14
+
+; 64R6-DAG: sltu $[[T0:[0-9]+]], $zero, $4
+; 64R6-DAG: mtc1 $[[T0]], $[[CC:f0]]
+; 64R6: sel.d $[[CC]], $f14, $f13
+
%tobool = icmp ne i32 %s, 0
%cond = select i1 %tobool, double %f0, double %f1
ret double %cond
}
-define float @sel3(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
+define float @f32_fcmp_oeq_f32_val(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.eq.s
-; CHECK: movt.s
+; ALL-LABEL: f32_fcmp_oeq_f32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.eq.s $[[F2]], $[[F3]]
+; 32: movt.s $f14, $f12, $fcc0
+; 32: mov.s $f0, $f14
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.eq.s $[[F2]], $[[F3]]
+; 32R2: movt.s $f14, $f12, $fcc0
+; 32R2: mov.s $f0, $f14
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.eq.s $[[CC:f0]], $[[F2]], $[[F3]]
+; 32R6: sel.s $[[CC]], $f14, $f12
+
+; 64: c.eq.s $f14, $f15
+; 64: movt.s $f13, $f12, $fcc0
+; 64: mov.s $f0, $f13
+
+; 64R2: c.eq.s $f14, $f15
+; 64R2: movt.s $f13, $f12, $fcc0
+; 64R2: mov.s $f0, $f13
+
+; 64R6: cmp.eq.s $[[CC:f0]], $f14, $f15
+; 64R6: sel.s $[[CC]], $f13, $f12
+
%cmp = fcmp oeq float %f2, %f3
%cond = select i1 %cmp, float %f0, float %f1
ret float %cond
}
-define float @sel4(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
+define float @f32_fcmp_olt_f32_val(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.olt.s
-; CHECK: movt.s
+; ALL-LABEL: f32_fcmp_olt_f32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.olt.s $[[F2]], $[[F3]]
+; 32: movt.s $f14, $f12, $fcc0
+; 32: mov.s $f0, $f14
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.olt.s $[[F2]], $[[F3]]
+; 32R2: movt.s $f14, $f12, $fcc0
+; 32R2: mov.s $f0, $f14
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.lt.s $[[CC:f0]], $[[F2]], $[[F3]]
+; 32R6: sel.s $[[CC]], $f14, $f12
+
+; 64: c.olt.s $f14, $f15
+; 64: movt.s $f13, $f12, $fcc0
+; 64: mov.s $f0, $f13
+
+; 64R2: c.olt.s $f14, $f15
+; 64R2: movt.s $f13, $f12, $fcc0
+; 64R2: mov.s $f0, $f13
+
+; 64R6: cmp.lt.s $[[CC:f0]], $f14, $f15
+; 64R6: sel.s $[[CC]], $f13, $f12
+
%cmp = fcmp olt float %f2, %f3
%cond = select i1 %cmp, float %f0, float %f1
ret float %cond
}
-define float @sel5(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
+define float @f32_fcmp_ogt_f32_val(float %f0, float %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.ule.s
-; CHECK: movf.s
+; ALL-LABEL: f32_fcmp_ogt_f32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.ule.s $[[F2]], $[[F3]]
+; 32: movf.s $f14, $f12, $fcc0
+; 32: mov.s $f0, $f14
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.ule.s $[[F2]], $[[F3]]
+; 32R2: movf.s $f14, $f12, $fcc0
+; 32R2: mov.s $f0, $f14
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.lt.s $[[CC:f0]], $[[F3]], $[[F2]]
+; 32R6: sel.s $[[CC]], $f14, $f12
+
+; 64: c.ule.s $f14, $f15
+; 64: movf.s $f13, $f12, $fcc0
+; 64: mov.s $f0, $f13
+
+; 64R2: c.ule.s $f14, $f15
+; 64R2: movf.s $f13, $f12, $fcc0
+; 64R2: mov.s $f0, $f13
+
+; 64R6: cmp.lt.s $[[CC:f0]], $f15, $f14
+; 64R6: sel.s $[[CC]], $f13, $f12
+
%cmp = fcmp ogt float %f2, %f3
%cond = select i1 %cmp, float %f0, float %f1
ret float %cond
}
-define double @sel5_1(double %f0, double %f1, float %f2, float %f3) nounwind readnone {
+define double @f32_fcmp_ogt_f64_val(double %f0, double %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.ule.s
-; CHECK: movf.d
+; ALL-LABEL: f32_fcmp_ogt_f64_val:
+
+; 32-DAG: lwc1 $[[F2:f[0-9]+]], 16($sp)
+; 32-DAG: lwc1 $[[F3:f[0-9]+]], 20($sp)
+; 32: c.ule.s $[[F2]], $[[F3]]
+; 32: movf.d $f14, $f12, $fcc0
+; 32: mov.d $f0, $f14
+
+; 32R2-DAG: lwc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R2-DAG: lwc1 $[[F3:f[0-9]+]], 20($sp)
+; 32R2: c.ule.s $[[F2]], $[[F3]]
+; 32R2: movf.d $f14, $f12, $fcc0
+; 32R2: mov.d $f0, $f14
+
+; 32R6-DAG: lwc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R6-DAG: lwc1 $[[F3:f[0-9]+]], 20($sp)
+; 32R6: cmp.lt.s $[[CC:f0]], $[[F3]], $[[F2]]
+; 32R6: sel.d $[[CC]], $f14, $f12
+
+; 64: c.ule.s $f14, $f15
+; 64: movf.d $f13, $f12, $fcc0
+; 64: mov.d $f0, $f13
+
+; 64R2: c.ule.s $f14, $f15
+; 64R2: movf.d $f13, $f12, $fcc0
+; 64R2: mov.d $f0, $f13
+
+; 64R6: cmp.lt.s $[[CC:f0]], $f15, $f14
+; 64R6: sel.d $[[CC]], $f13, $f12
+
%cmp = fcmp ogt float %f2, %f3
%cond = select i1 %cmp, double %f0, double %f1
ret double %cond
}
-define double @sel6(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
+define double @f64_fcmp_oeq_f64_val(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
entry:
-; CHECK: c.eq.d
-; CHECK: movt.d
+; ALL-LABEL: f64_fcmp_oeq_f64_val:
+
+; 32-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32: c.eq.d $[[F2]], $[[F3]]
+; 32: movt.d $f14, $f12, $fcc0
+; 32: mov.d $f0, $f14
+
+; 32R2-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R2-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R2: c.eq.d $[[F2]], $[[F3]]
+; 32R2: movt.d $f14, $f12, $fcc0
+; 32R2: mov.d $f0, $f14
+
+; 32R6-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R6-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R6: cmp.eq.d $[[CC:f0]], $[[F2]], $[[F3]]
+; 32R6: sel.d $[[CC]], $f14, $f12
+
+; 64: c.eq.d $f14, $f15
+; 64: movt.d $f13, $f12, $fcc0
+; 64: mov.d $f0, $f13
+
+; 64R2: c.eq.d $f14, $f15
+; 64R2: movt.d $f13, $f12, $fcc0
+; 64R2: mov.d $f0, $f13
+
+; 64R6: cmp.eq.d $[[CC:f0]], $f14, $f15
+; 64R6: sel.d $[[CC]], $f13, $f12
+
%cmp = fcmp oeq double %f2, %f3
%cond = select i1 %cmp, double %f0, double %f1
ret double %cond
}
-define double @sel7(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
+define double @f64_fcmp_olt_f64_val(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
entry:
-; CHECK: c.olt.d
-; CHECK: movt.d
+; ALL-LABEL: f64_fcmp_olt_f64_val:
+
+; 32-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32: c.olt.d $[[F2]], $[[F3]]
+; 32: movt.d $f14, $f12, $fcc0
+; 32: mov.d $f0, $f14
+
+; 32R2-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R2-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R2: c.olt.d $[[F2]], $[[F3]]
+; 32R2: movt.d $f14, $f12, $fcc0
+; 32R2: mov.d $f0, $f14
+
+; 32R6-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R6-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R6: cmp.lt.d $[[CC:f0]], $[[F2]], $[[F3]]
+; 32R6: sel.d $[[CC]], $f14, $f12
+
+; 64: c.olt.d $f14, $f15
+; 64: movt.d $f13, $f12, $fcc0
+; 64: mov.d $f0, $f13
+
+; 64R2: c.olt.d $f14, $f15
+; 64R2: movt.d $f13, $f12, $fcc0
+; 64R2: mov.d $f0, $f13
+
+; 64R6: cmp.lt.d $[[CC:f0]], $f14, $f15
+; 64R6: sel.d $[[CC]], $f13, $f12
+
%cmp = fcmp olt double %f2, %f3
%cond = select i1 %cmp, double %f0, double %f1
ret double %cond
}
-define double @sel8(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
+define double @f64_fcmp_ogt_f64_val(double %f0, double %f1, double %f2, double %f3) nounwind readnone {
entry:
-; CHECK: c.ule.d
-; CHECK: movf.d
+; ALL-LABEL: f64_fcmp_ogt_f64_val:
+
+; 32-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32: c.ule.d $[[F2]], $[[F3]]
+; 32: movf.d $f14, $f12, $fcc0
+; 32: mov.d $f0, $f14
+
+; 32R2-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R2-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R2: c.ule.d $[[F2]], $[[F3]]
+; 32R2: movf.d $f14, $f12, $fcc0
+; 32R2: mov.d $f0, $f14
+
+; 32R6-DAG: ldc1 $[[F2:f[0-9]+]], 16($sp)
+; 32R6-DAG: ldc1 $[[F3:f[0-9]+]], 24($sp)
+; 32R6: cmp.lt.d $[[CC:f0]], $[[F3]], $[[F2]]
+; 32R6: sel.d $[[CC]], $f14, $f12
+
+; 64: c.ule.d $f14, $f15
+; 64: movf.d $f13, $f12, $fcc0
+; 64: mov.d $f0, $f13
+
+; 64R2: c.ule.d $f14, $f15
+; 64R2: movf.d $f13, $f12, $fcc0
+; 64R2: mov.d $f0, $f13
+
+; 64R6: cmp.lt.d $[[CC:f0]], $f15, $f14
+; 64R6: sel.d $[[CC]], $f13, $f12
+
%cmp = fcmp ogt double %f2, %f3
%cond = select i1 %cmp, double %f0, double %f1
ret double %cond
}
-define float @sel8_1(float %f0, float %f1, double %f2, double %f3) nounwind readnone {
+define float @f64_fcmp_ogt_f32_val(float %f0, float %f1, double %f2, double %f3) nounwind readnone {
entry:
-; CHECK: c.ule.d
-; CHECK: movf.s
+; ALL-LABEL: f64_fcmp_ogt_f32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[1-3]*[02468]+]]
+; 32-DAG: mtc1 $7, $[[F2H:f[1-3]*[13579]+]]
+; 32-DAG: ldc1 $[[F3:f[0-9]+]], 16($sp)
+; 32: c.ule.d $[[F2]], $[[F3]]
+; 32: movf.s $f14, $f12, $fcc0
+; 32: mov.s $f0, $f14
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mthc1 $7, $[[F2]]
+; 32R2-DAG: ldc1 $[[F3:f[0-9]+]], 16($sp)
+; 32R2: c.ule.d $[[F2]], $[[F3]]
+; 32R2: movf.s $f14, $f12, $fcc0
+; 32R2: mov.s $f0, $f14
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mthc1 $7, $[[F2]]
+; 32R6-DAG: ldc1 $[[F3:f[0-9]+]], 16($sp)
+; 32R6: cmp.lt.d $[[CC:f0]], $[[F3]], $[[F2]]
+; 32R6: sel.s $[[CC]], $f14, $f12
+
+; 64: c.ule.d $f14, $f15
+; 64: movf.s $f13, $f12, $fcc0
+; 64: mov.s $f0, $f13
+
+; 64R2: c.ule.d $f14, $f15
+; 64R2: movf.s $f13, $f12, $fcc0
+; 64R2: mov.s $f0, $f13
+
+; 64R6: cmp.lt.d $[[CC:f0]], $f15, $f14
+; 64R6: sel.s $[[CC]], $f13, $f12
+
%cmp = fcmp ogt double %f2, %f3
%cond = select i1 %cmp, float %f0, float %f1
ret float %cond
}
-define i32 @sel9(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_oeq_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.eq.s
-; CHECK: movt
+; ALL-LABEL: f32_fcmp_oeq_i32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.eq.s $[[F2]], $[[F3]]
+; 32: movt $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.eq.s $[[F2]], $[[F3]]
+; 32R2: movt $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.eq.s $[[CC:f[0-9]+]], $[[F2]], $[[F3]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64: c.eq.s $f14, $f15
+; 64: movt $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2: c.eq.s $f14, $f15
+; 64R2: movt $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6: cmp.eq.s $[[CC:f[0-9]+]], $f14, $f15
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
+
%cmp = fcmp oeq float %f2, %f3
%cond = select i1 %cmp, i32 %f0, i32 %f1
ret i32 %cond
}
-define i32 @sel10(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_olt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.olt.s
-; CHECK: movt
+; ALL-LABEL: f32_fcmp_olt_i32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.olt.s $[[F2]], $[[F3]]
+; 32: movt $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.olt.s $[[F2]], $[[F3]]
+; 32R2: movt $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.lt.s $[[CC:f[0-9]+]], $[[F2]], $[[F3]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64: c.olt.s $f14, $f15
+; 64: movt $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2: c.olt.s $f14, $f15
+; 64R2: movt $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6: cmp.lt.s $[[CC:f[0-9]+]], $f14, $f15
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
%cmp = fcmp olt float %f2, %f3
%cond = select i1 %cmp, i32 %f0, i32 %f1
ret i32 %cond
}
-define i32 @sel11(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_ogt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
entry:
-; CHECK: c.ule.s
-; CHECK: movf
+; ALL-LABEL: f32_fcmp_ogt_i32_val:
+
+; 32-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32: c.ule.s $[[F2]], $[[F3]]
+; 32: movf $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R2-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R2: c.ule.s $[[F2]], $[[F3]]
+; 32R2: movf $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: mtc1 $6, $[[F2:f[0-9]+]]
+; 32R6-DAG: mtc1 $7, $[[F3:f[0-9]+]]
+; 32R6: cmp.lt.s $[[CC:f[0-9]+]], $[[F3]], $[[F2]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64: c.ule.s $f14, $f15
+; 64: movf $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2: c.ule.s $f14, $f15
+; 64R2: movf $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6: cmp.lt.s $[[CC:f[0-9]+]], $f15, $f14
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
+
%cmp = fcmp ogt float %f2, %f3
%cond = select i1 %cmp, i32 %f0, i32 %f1
ret i32 %cond
}
-define i32 @sel12(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_oeq_i32_val(i32 %f0, i32 %f1) nounwind readonly {
entry:
-; CHECK: c.eq.d
-; CHECK: movt
+; ALL-LABEL: f64_fcmp_oeq_i32_val:
+
+; 32-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32: c.eq.d $[[TMP]], $[[TMP1]]
+; 32: movt $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R2-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R2-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R2-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R2: c.eq.d $[[TMP]], $[[TMP1]]
+; 32R2: movt $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R6-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R6-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R6-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R6: cmp.eq.d $[[CC:f[0-9]+]], $[[TMP]], $[[TMP1]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_oeq_i32_val)))
+; 64-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64: c.eq.d $[[TMP]], $[[TMP1]]
+; 64: movt $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_oeq_i32_val)))
+; 64R2-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R2-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R2-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R2: c.eq.d $[[TMP]], $[[TMP1]]
+; 64R2: movt $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_oeq_i32_val)))
+; 64R6-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R6-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R6-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R6: cmp.eq.d $[[CC:f[0-9]+]], $[[TMP]], $[[TMP1]]
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
+
%tmp = load double* @d2, align 8
%tmp1 = load double* @d3, align 8
%cmp = fcmp oeq double %tmp, %tmp1
@@ -137,10 +707,76 @@ entry:
ret i32 %cond
}
-define i32 @sel13(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_olt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
entry:
-; CHECK: c.olt.d
-; CHECK: movt
+; ALL-LABEL: f64_fcmp_olt_i32_val:
+
+; 32-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32: c.olt.d $[[TMP]], $[[TMP1]]
+; 32: movt $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R2-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R2-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R2-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R2: c.olt.d $[[TMP]], $[[TMP1]]
+; 32R2: movt $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R6-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R6-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R6-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R6: cmp.lt.d $[[CC:f[0-9]+]], $[[TMP]], $[[TMP1]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_olt_i32_val)))
+; 64-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64: c.olt.d $[[TMP]], $[[TMP1]]
+; 64: movt $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_olt_i32_val)))
+; 64R2-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R2-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R2-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R2: c.olt.d $[[TMP]], $[[TMP1]]
+; 64R2: movt $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_olt_i32_val)))
+; 64R6-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R6-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R6-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R6: cmp.lt.d $[[CC:f[0-9]+]], $[[TMP]], $[[TMP1]]
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
+
%tmp = load double* @d2, align 8
%tmp1 = load double* @d3, align 8
%cmp = fcmp olt double %tmp, %tmp1
@@ -148,10 +784,76 @@ entry:
ret i32 %cond
}
-define i32 @sel14(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_ogt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
entry:
-; CHECK: c.ule.d
-; CHECK: movf
+; ALL-LABEL: f64_fcmp_ogt_i32_val:
+
+; 32-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32: c.ule.d $[[TMP]], $[[TMP1]]
+; 32: movf $5, $4, $fcc0
+; 32: move $2, $5
+
+; 32R2-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R2-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R2-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R2-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R2: c.ule.d $[[TMP]], $[[TMP1]]
+; 32R2: movf $5, $4, $fcc0
+; 32R2: move $2, $5
+
+; 32R6-DAG: addiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(_gp_disp)
+; 32R6-DAG: addu $[[GOT:[0-9]+]], $[[T0]], $25
+; 32R6-DAG: lw $[[D2:[0-9]+]], %got(d2)($1)
+; 32R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 32R6-DAG: lw $[[D3:[0-9]+]], %got(d3)($1)
+; 32R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 32R6: cmp.lt.d $[[CC:f[0-9]+]], $[[TMP1]], $[[TMP]]
+; 32R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 32R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 32R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 32R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 32R6: or $2, $[[NE]], $[[EQ]]
+
+; 64-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_ogt_i32_val)))
+; 64-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64: c.ule.d $[[TMP]], $[[TMP1]]
+; 64: movf $5, $4, $fcc0
+; 64: move $2, $5
+
+; 64R2-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_ogt_i32_val)))
+; 64R2-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R2-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R2-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R2-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R2-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R2: c.ule.d $[[TMP]], $[[TMP1]]
+; 64R2: movf $5, $4, $fcc0
+; 64R2: move $2, $5
+
+; 64R6-DAG: daddiu $[[T0:[0-9]+]], ${{[0-9]+}}, %lo(%neg(%gp_rel(f64_fcmp_ogt_i32_val)))
+; 64R6-DAG: daddu $[[GOT:[0-9]+]], $[[T0]], $25
+; 64R6-DAG: ld $[[D2:[0-9]+]], %got_disp(d2)($1)
+; 64R6-DAG: ldc1 $[[TMP:f[0-9]+]], 0($[[D2]])
+; 64R6-DAG: ld $[[D3:[0-9]+]], %got_disp(d3)($1)
+; 64R6-DAG: ldc1 $[[TMP1:f[0-9]+]], 0($[[D3]])
+; 64R6: cmp.lt.d $[[CC:f[0-9]+]], $[[TMP1]], $[[TMP]]
+; 64R6: mfc1 $[[CCGPR:[0-9]+]], $[[CC]]
+; 64R6: andi $[[CCGPR]], $[[CCGPR]], 1
+; 64R6: seleqz $[[EQ:[0-9]+]], $5, $[[CCGPR]]
+; 64R6: selnez $[[NE:[0-9]+]], $4, $[[CCGPR]]
+; 64R6: or $2, $[[NE]], $[[EQ]]
+
%tmp = load double* @d2, align 8
%tmp1 = load double* @d3, align 8
%cmp = fcmp ogt double %tmp, %tmp1
diff --git a/test/CodeGen/Mips/selectcc.ll b/test/CodeGen/Mips/selectcc.ll
index aeef60e..9790a0a 100644
--- a/test/CodeGen/Mips/selectcc.ll
+++ b/test/CodeGen/Mips/selectcc.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=mipsel < %s
-; RUN: llc -march=mipsel -pre-RA-sched=source < %s | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -march=mipsel -mcpu=mips32 < %s
+; RUN: llc -march=mipsel -mcpu=mips32 -pre-RA-sched=source < %s | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -pre-RA-sched=source < %s | FileCheck %s --check-prefix=SOURCE-SCHED
@gf0 = external global float
@gf1 = external global float
@@ -16,13 +18,11 @@ entry:
; SOURCE-SCHED: lw
; SOURCE-SCHED: lui
; SOURCE-SCHED: sw
-; SOURCE-SCHED: addiu
-; SOURCE-SCHED: addiu
-; SOURCE-SCHED: c.olt.s
-; SOURCE-SCHED: movt
+; SOURCE-SCHED: lw
+; SOURCE-SCHED: lwc1
; SOURCE-SCHED: mtc1
+; SOURCE-SCHED: c.olt.s
; SOURCE-SCHED: jr
-
store float 0.000000e+00, float* @gf0, align 4
store float 1.000000e+00, float* @gf1, align 4
%cmp = fcmp olt float %a, %b
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index 80fbe87..b61f84e 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s
@foo = thread_local global i32 42
-@bar = hidden alias i32* @foo
+@bar = hidden thread_local alias i32* @foo
define i32* @zed() {
; CHECK-DAG: __tls_get_addr
diff --git a/test/CodeGen/Mips/zeroreg.ll b/test/CodeGen/Mips/zeroreg.ll
index e0e93e2..a1b6cb0 100644
--- a/test/CodeGen/Mips/zeroreg.ll
+++ b/test/CodeGen/Mips/zeroreg.ll
@@ -1,21 +1,109 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s
+; RUN: llc < %s -march=mipsel -mcpu=mips32 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=ALL -check-prefix=32-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips32r6 | FileCheck %s -check-prefix=ALL -check-prefix=32R6
+; RUN: llc < %s -march=mipsel -mcpu=mips4 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64r2 | FileCheck %s -check-prefix=ALL -check-prefix=64-CMOV
+; RUN: llc < %s -march=mipsel -mcpu=mips64r6 | FileCheck %s -check-prefix=ALL -check-prefix=64R6
@g1 = external global i32
-define i32 @foo0(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z0(i32 %s) nounwind readonly {
entry:
-; CHECK: movn ${{[0-9]+}}, $zero
+; ALL-LABEL: sel_icmp_nez_i32_z0:
+
+; 32-CMOV: lw $2, 0(${{[0-9]+}})
+; 32-CMOV: movn $2, $zero, $4
+
+; 32R6: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6: seleqz $2, $[[R0]], $4
+
+; 64-CMOV: lw $2, 0(${{[0-9]+}})
+; 64-CMOV: movn $2, $zero, $4
+
+; 64R6: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 64R6: seleqz $2, $[[R0]], $4
+
%tobool = icmp ne i32 %s, 0
%0 = load i32* @g1, align 4
%cond = select i1 %tobool, i32 0, i32 %0
ret i32 %cond
}
-define i32 @foo1(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z1(i32 %s) nounwind readonly {
entry:
-; CHECK: movz ${{[0-9]+}}, $zero
+; ALL-LABEL: sel_icmp_nez_i32_z1:
+
+; 32-CMOV: lw $2, 0(${{[0-9]+}})
+; 32-CMOV: movz $2, $zero, $4
+
+; 32R6: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6: selnez $2, $[[R0]], $4
+
+; 64-CMOV: lw $2, 0(${{[0-9]+}})
+; 64-CMOV: movz $2, $zero, $4
+
+; 64R6: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 64R6: selnez $2, $[[R0]], $4
+
%tobool = icmp ne i32 %s, 0
%0 = load i32* @g1, align 4
%cond = select i1 %tobool, i32 %0, i32 0
ret i32 %cond
}
+
+@g2 = external global i64
+
+define i64 @sel_icmp_nez_i64_z0(i64 %s) nounwind readonly {
+entry:
+; ALL-LABEL: sel_icmp_nez_i64_z0:
+
+; 32-CMOV-DAG: lw $[[R0:2]], 0(${{[0-9]+}})
+; 32-CMOV-DAG: lw $[[R1:3]], 4(${{[0-9]+}})
+; 32-CMOV-DAG: movn $[[R0]], $zero, $4
+; 32-CMOV-DAG: movn $[[R1]], $zero, $4
+
+; 32R6-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-DAG: or $[[CC:[0-9]+]], $4, $5
+; 32R6-DAG: seleqz $2, $[[R0]], $[[CC]]
+; 32R6-DAG: seleqz $3, $[[R1]], $[[CC]]
+
+; 64-CMOV: ld $2, 0(${{[0-9]+}})
+; 64-CMOV: movn $2, $zero, $4
+
+; 64R6: ld $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 64R6: seleqz $2, $[[R0]], $4
+
+ %tobool = icmp ne i64 %s, 0
+ %0 = load i64* @g2, align 4
+ %cond = select i1 %tobool, i64 0, i64 %0
+ ret i64 %cond
+}
+
+define i64 @sel_icmp_nez_i64_z1(i64 %s) nounwind readonly {
+entry:
+; ALL-LABEL: sel_icmp_nez_i64_z1:
+
+; 32-CMOV-DAG: lw $[[R0:2]], 0(${{[0-9]+}})
+; 32-CMOV-DAG: lw $[[R1:3]], 4(${{[0-9]+}})
+; 32-CMOV-DAG: movz $[[R0]], $zero, $4
+; 32-CMOV-DAG: movz $[[R1]], $zero, $4
+
+; 32R6-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 32R6-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; 32R6-DAG: or $[[CC:[0-9]+]], $4, $5
+; 32R6-DAG: selnez $2, $[[R0]], $[[CC]]
+; 32R6-DAG: selnez $3, $[[R1]], $[[CC]]
+
+; 64-CMOV: ld $2, 0(${{[0-9]+}})
+; 64-CMOV: movz $2, $zero, $4
+
+; 64R6: ld $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; 64R6: selnez $2, $[[R0]], $4
+
+ %tobool = icmp ne i64 %s, 0
+ %0 = load i64* @g2, align 4
+ %cond = select i1 %tobool, i64 %0, i64 0
+ ret i64 %cond
+}
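The checks above encode the MIPS32r6/64r6 select idiom once per case; stated compactly (a sketch, not part of the patch): R6 removes the movn/movz conditional moves, so a two-way select is built from masked halves once the condition is in a GPR.

;   selnez $t, $a, $cc        ; $t = ($cc != 0) ? $a : 0
;   seleqz $f, $b, $cc        ; $f = ($cc == 0) ? $b : 0
;   or     $d, $t, $f         ; $d = $cc ? $a : $b

When one arm of the select is zero, a single seleqz or selnez suffices, which is what the i32 and 64-bit i64 cases above check for; the 32-bit i64 cases first fold the two GPR halves of the condition with an or.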
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
index 0622aa3..c225abf 100644
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -74,13 +74,13 @@ define float @ld_st_shared_f32(i32 %i, float %v) {
ret float %sum5
}
-; Verifies nvptx-favor-non-generic keeps addrspacecasts between pointers of
-; different element types.
+; When hoisting an addrspacecast between different pointer types, replace the
+; addrspacecast with a bitcast.
define i32 @ld_int_from_float() {
; IR-LABEL: @ld_int_from_float
-; IR: addrspacecast
+; IR: load i32 addrspace(3)* bitcast (float addrspace(3)* @scalar to i32 addrspace(3)*)
; PTX-LABEL: ld_int_from_float(
-; PTX: cvta.shared.u{{(32|64)}}
+; PTX: ld.shared.u{{(32|64)}}
%1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
ret i32 %1
}
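For context, an illustration of the rewrite the updated checks expect (behavior inferred from the checks, not part of the patch): when the cast only changes the pointee type within addrspace(3), hoisting it as a bitcast keeps the access in the shared window, so no cvta.shared round-trip is emitted. In the typed-pointer syntax used in this file:

;   %p = bitcast float addrspace(3)* @scalar to i32 addrspace(3)*
;   %v = load i32 addrspace(3)* %p, align 4    ; selects ld.shared.u32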
diff --git a/test/CodeGen/NVPTX/arg-lowering.ll b/test/CodeGen/NVPTX/arg-lowering.ll
new file mode 100644
index 0000000..f7b8a14
--- /dev/null
+++ b/test/CodeGen/NVPTX/arg-lowering.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: .visible .func (.param .align 16 .b8 func_retval0[16]) foo0(
+; CHECK: .param .align 4 .b8 foo0_param_0[8]
+define <4 x float> @foo0({float, float} %arg0) {
+ ret <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[8]) foo1(
+; CHECK: .param .align 8 .b8 foo1_param_0[16]
+define <2 x float> @foo1({float, float, i64} %arg0) {
+ ret <2 x float> <float 1.0, float 1.0>
+}
diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll
new file mode 100644
index 0000000..10ab73d
--- /dev/null
+++ b/test/CodeGen/NVPTX/atomics.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: atom0
+define i32 @atom0(i32* %addr, i32 %val) {
+; CHECK: atom.add.u32
+ %ret = atomicrmw add i32* %addr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom1
+define i64 @atom1(i64* %addr, i64 %val) {
+; CHECK: atom.add.u64
+ %ret = atomicrmw add i64* %addr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom2
+define i32 @atom2(i32* %subr, i32 %val) {
+; CHECK: neg.s32
+; CHECK: atom.add.u32
+ %ret = atomicrmw sub i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom3
+define i64 @atom3(i64* %subr, i64 %val) {
+; CHECK: neg.s64
+; CHECK: atom.add.u64
+ %ret = atomicrmw sub i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom4
+define i32 @atom4(i32* %subr, i32 %val) {
+; CHECK: atom.and.b32
+ %ret = atomicrmw and i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom5
+define i64 @atom5(i64* %subr, i64 %val) {
+; CHECK: atom.and.b64
+ %ret = atomicrmw and i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+;; NAND not yet supported
+;define i32 @atom6(i32* %subr, i32 %val) {
+; %ret = atomicrmw nand i32* %subr, i32 %val seq_cst
+; ret i32 %ret
+;}
+
+;define i64 @atom7(i64* %subr, i64 %val) {
+; %ret = atomicrmw nand i64* %subr, i64 %val seq_cst
+; ret i64 %ret
+;}
+
+; CHECK: atom8
+define i32 @atom8(i32* %subr, i32 %val) {
+; CHECK: atom.or.b32
+ %ret = atomicrmw or i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom9
+define i64 @atom9(i64* %subr, i64 %val) {
+; CHECK: atom.or.b64
+ %ret = atomicrmw or i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom10
+define i32 @atom10(i32* %subr, i32 %val) {
+; CHECK: atom.xor.b32
+ %ret = atomicrmw xor i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom11
+define i64 @atom11(i64* %subr, i64 %val) {
+; CHECK: atom.xor.b64
+ %ret = atomicrmw xor i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom12
+define i32 @atom12(i32* %subr, i32 %val) {
+; CHECK: atom.max.s32
+ %ret = atomicrmw max i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom13
+define i64 @atom13(i64* %subr, i64 %val) {
+; CHECK: atom.max.s64
+ %ret = atomicrmw max i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom14
+define i32 @atom14(i32* %subr, i32 %val) {
+; CHECK: atom.min.s32
+ %ret = atomicrmw min i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom15
+define i64 @atom15(i64* %subr, i64 %val) {
+; CHECK: atom.min.s64
+ %ret = atomicrmw min i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom16
+define i32 @atom16(i32* %subr, i32 %val) {
+; CHECK: atom.max.u32
+ %ret = atomicrmw umax i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom17
+define i64 @atom17(i64* %subr, i64 %val) {
+; CHECK: atom.max.u64
+ %ret = atomicrmw umax i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
+
+; CHECK: atom18
+define i32 @atom18(i32* %subr, i32 %val) {
+; CHECK: atom.min.u32
+ %ret = atomicrmw umin i32* %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+; CHECK: atom19
+define i64 @atom19(i64* %subr, i64 %val) {
+; CHECK: atom.min.u64
+ %ret = atomicrmw umin i64* %subr, i64 %val seq_cst
+ ret i64 %ret
+}
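One detail behind the sub cases above (an observation, not part of the patch): PTX has no atomic subtract, so the backend uses a - b = a + (-b), negating the operand and reusing atom.add:

;   neg.s32      %r2, %r1            ; %r2 = -val
;   atom.add.u32 %r3, [addr], %r2    ; returns the old value (schematic operands)

NAND admits no such reduction to a single atom operation, which is why the atom6/atom7 cases remain commented out.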
diff --git a/test/CodeGen/NVPTX/bfe.ll b/test/CodeGen/NVPTX/bfe.ll
new file mode 100644
index 0000000..2e816fe
--- /dev/null
+++ b/test/CodeGen/NVPTX/bfe.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: bfe0
+define i32 @bfe0(i32 %a) {
+; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4
+; CHECK-NOT: shr
+; CHECK-NOT: and
+ %val0 = ashr i32 %a, 4
+ %val1 = and i32 %val0, 15
+ ret i32 %val1
+}
+
+; CHECK: bfe1
+define i32 @bfe1(i32 %a) {
+; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 3, 3
+; CHECK-NOT: shr
+; CHECK-NOT: and
+ %val0 = ashr i32 %a, 3
+ %val1 = and i32 %val0, 7
+ ret i32 %val1
+}
+
+; CHECK: bfe2
+define i32 @bfe2(i32 %a) {
+; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 3
+; CHECK-NOT: shr
+; CHECK-NOT: and
+ %val0 = ashr i32 %a, 5
+ %val1 = and i32 %val0, 7
+ ret i32 %val1
+}
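The pattern these three cases exercise, stated once (not part of the patch): a right shift by N followed by a mask of 2^W - 1 extracts a W-bit field at position N, i.e. one bfe.u32 with pos = N and width = W:

;   and (ashr %a, N), (2^W - 1)   ->   bfe.u32 %r, %a, N, W

For bfe2: shift 5, mask 7 = 0b111, so pos = 5 and width = log2(7 + 1) = 3. The mask discards whatever the arithmetic shift sign-extended, so the unsigned bfe form is safe even though the IR uses ashr.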
diff --git a/test/CodeGen/NVPTX/envreg.ll b/test/CodeGen/NVPTX/envreg.ll
new file mode 100644
index 0000000..a341b49
--- /dev/null
+++ b/test/CodeGen/NVPTX/envreg.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg0()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg1()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg2()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg3()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg4()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg5()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg6()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg7()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg8()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg9()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg10()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg11()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg12()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg13()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg14()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg15()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg16()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg17()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg18()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg19()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg20()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg21()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg22()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg23()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg24()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg25()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg26()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg27()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg28()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg29()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg30()
+declare i32 @llvm.nvvm.read.ptx.sreg.envreg31()
+
+
+; CHECK: foo
+define i32 @foo() {
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg0
+ %val0 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg0()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg1
+ %val1 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg1()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg2
+ %val2 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg2()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg3
+ %val3 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg3()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg4
+ %val4 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg4()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg5
+ %val5 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg5()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg6
+ %val6 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg6()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg7
+ %val7 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg7()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg8
+ %val8 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg8()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg9
+ %val9 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg9()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg10
+ %val10 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg10()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg11
+ %val11 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg11()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg12
+ %val12 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg12()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg13
+ %val13 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg13()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg14
+ %val14 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg14()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg15
+ %val15 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg15()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg16
+ %val16 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg16()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg17
+ %val17 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg17()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg18
+ %val18 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg18()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg19
+ %val19 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg19()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg20
+ %val20 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg20()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg21
+ %val21 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg21()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg22
+ %val22 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg22()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg23
+ %val23 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg23()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg24
+ %val24 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg24()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg25
+ %val25 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg25()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg26
+ %val26 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg26()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg27
+ %val27 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg27()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg28
+ %val28 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg28()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg29
+ %val29 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg29()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg30
+ %val30 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg30()
+; CHECK: mov.b32 %r{{[0-9]+}}, %envreg31
+ %val31 = tail call i32 @llvm.nvvm.read.ptx.sreg.envreg31()
+
+
+ %ret0 = add i32 %val0, %val1
+ %ret1 = add i32 %ret0, %val2
+ %ret2 = add i32 %ret1, %val3
+ %ret3 = add i32 %ret2, %val4
+ %ret4 = add i32 %ret3, %val5
+ %ret5 = add i32 %ret4, %val6
+ %ret6 = add i32 %ret5, %val7
+ %ret7 = add i32 %ret6, %val8
+ %ret8 = add i32 %ret7, %val9
+ %ret9 = add i32 %ret8, %val10
+ %ret10 = add i32 %ret9, %val11
+ %ret11 = add i32 %ret10, %val12
+ %ret12 = add i32 %ret11, %val13
+ %ret13 = add i32 %ret12, %val14
+ %ret14 = add i32 %ret13, %val15
+ %ret15 = add i32 %ret14, %val16
+ %ret16 = add i32 %ret15, %val17
+ %ret17 = add i32 %ret16, %val18
+ %ret18 = add i32 %ret17, %val19
+ %ret19 = add i32 %ret18, %val20
+ %ret20 = add i32 %ret19, %val21
+ %ret21 = add i32 %ret20, %val22
+ %ret22 = add i32 %ret21, %val23
+ %ret23 = add i32 %ret22, %val24
+ %ret24 = add i32 %ret23, %val25
+ %ret25 = add i32 %ret24, %val26
+ %ret26 = add i32 %ret25, %val27
+ %ret27 = add i32 %ret26, %val28
+ %ret28 = add i32 %ret27, %val29
+ %ret29 = add i32 %ret28, %val30
+ %ret30 = add i32 %ret29, %val31
+
+ ret i32 %ret30
+}
diff --git a/test/CodeGen/NVPTX/gvar-init.ll b/test/CodeGen/NVPTX/gvar-init.ll
new file mode 100644
index 0000000..8c95942
--- /dev/null
+++ b/test/CodeGen/NVPTX/gvar-init.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; Error out if an initializer is given for an address space that does not support initializers
+; XFAIL: *
+@g0 = addrspace(3) global i32 42
diff --git a/test/CodeGen/NVPTX/imad.ll b/test/CodeGen/NVPTX/imad.ll
new file mode 100644
index 0000000..67421c7
--- /dev/null
+++ b/test/CodeGen/NVPTX/imad.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: imad
+define i32 @imad(i32 %a, i32 %b, i32 %c) {
+; CHECK: mad.lo.s32
+ %val0 = mul i32 %a, %b
+ %val1 = add i32 %val0, %c
+ ret i32 %val1
+}
diff --git a/test/CodeGen/NVPTX/inline-asm.ll b/test/CodeGen/NVPTX/inline-asm.ll
index d76eb42..6f0578d 100644
--- a/test/CodeGen/NVPTX/inline-asm.ll
+++ b/test/CodeGen/NVPTX/inline-asm.ll
@@ -7,3 +7,10 @@ entry:
%0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
ret float %0
}
+
+define i32 @foo(i1 signext %cond, i32 %a, i32 %b) #0 {
+entry:
+; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
+ %0 = tail call i32 asm "selp.b32 $0, $1, $2, $3;", "=r,r,r,b"(i32 %a, i32 %b, i1 %cond)
+ ret i32 %0
+}
diff --git a/test/CodeGen/NVPTX/isspacep.ll b/test/CodeGen/NVPTX/isspacep.ll
new file mode 100644
index 0000000..47fa7a6
--- /dev/null
+++ b/test/CodeGen/NVPTX/isspacep.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+declare i1 @llvm.nvvm.isspacep.const(i8*) readnone noinline
+declare i1 @llvm.nvvm.isspacep.global(i8*) readnone noinline
+declare i1 @llvm.nvvm.isspacep.local(i8*) readnone noinline
+declare i1 @llvm.nvvm.isspacep.shared(i8*) readnone noinline
+
+; CHECK: is_const
+define i1 @is_const(i8* %addr) {
+; CHECK: isspacep.const
+ %v = tail call i1 @llvm.nvvm.isspacep.const(i8* %addr)
+ ret i1 %v
+}
+
+; CHECK: is_global
+define i1 @is_global(i8* %addr) {
+; CHECK: isspacep.global
+ %v = tail call i1 @llvm.nvvm.isspacep.global(i8* %addr)
+ ret i1 %v
+}
+
+; CHECK: is_local
+define i1 @is_local(i8* %addr) {
+; CHECK: isspacep.local
+ %v = tail call i1 @llvm.nvvm.isspacep.local(i8* %addr)
+ ret i1 %v
+}
+
+; CHECK: is_shared
+define i1 @is_shared(i8* %addr) {
+; CHECK: isspacep.shared
+ %v = tail call i1 @llvm.nvvm.isspacep.shared(i8* %addr)
+ ret i1 %v
+}
+
diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll
index 81a82b2..9cc6675 100644
--- a/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/test/CodeGen/NVPTX/ldu-i8.ll
@@ -2,13 +2,15 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-declare i8 @llvm.nvvm.ldu.global.i.i8(i8*)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*)
define i8 @foo(i8* %a) {
; Ensure we properly truncate off the high-order 24 bits
; CHECK: ldu.global.u8
; CHECK: cvt.u32.u16
; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255
- %val = tail call i8 @llvm.nvvm.ldu.global.i.i8(i8* %a)
+ %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0
ret i8 %val
}
+
+!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll
new file mode 100644
index 0000000..3b0619f
--- /dev/null
+++ b/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+
+
+; CHECK: func0
+define i8 @func0(i8 addrspace(1)* %ptr) {
+; CHECK: ldu.global.u8
+ %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+ ret i8 %val
+}
+
+; CHECK: func1
+define i32 @func1(i32 addrspace(1)* %ptr) {
+; CHECK: ldu.global.u32
+ %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+ ret i32 %val
+}
+
+; CHECK: func2
+define i8 @func2(i8 addrspace(1)* %ptr) {
+; CHECK: ld.global.nc.u8
+ %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+ ret i8 %val
+}
+
+; CHECK: func3
+define i32 @func3(i32 addrspace(1)* %ptr) {
+; CHECK: ld.global.nc.u32
+ %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+ ret i32 %val
+}
+
+
+
+!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 26cadc4..55707ea 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -7,9 +7,9 @@ define void @reg_plus_offset(i32* %a) {
; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
%p2 = getelementptr i32* %a, i32 8
- %t1 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p2), !align !1
+ %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1
%p3 = getelementptr i32* %a, i32 9
- %t2 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p3), !align !1
+ %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1
%t3 = mul i32 %t1, %t2
store i32 %t3, i32* %a
ret void
@@ -17,5 +17,5 @@ define void @reg_plus_offset(i32* %a) {
!1 = metadata !{ i32 4 }
-declare i32 @llvm.nvvm.ldu.global.i.i32(i32*)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/test/CodeGen/NVPTX/lit.local.cfg b/test/CodeGen/NVPTX/lit.local.cfg
index 85cf8c2..2cb98eb 100644
--- a/test/CodeGen/NVPTX/lit.local.cfg
+++ b/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'NVPTX' in targets:
+if not 'NVPTX' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/NVPTX/managed.ll b/test/CodeGen/NVPTX/managed.ll
new file mode 100644
index 0000000..4d7e781
--- /dev/null
+++ b/test/CodeGen/NVPTX/managed.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: .visible .global .align 4 .u32 device_g;
+@device_g = addrspace(1) global i32 zeroinitializer
+; CHECK: .visible .global .attribute(.managed) .align 4 .u32 managed_g;
+@managed_g = addrspace(1) global i32 zeroinitializer
+
+
+!nvvm.annotations = !{!0}
+!0 = metadata !{i32 addrspace(1)* @managed_g, metadata !"managed", i32 1}
diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll
new file mode 100644
index 0000000..927946c
--- /dev/null
+++ b/test/CodeGen/NVPTX/mulwide.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: mulwide16
+define i32 @mulwide16(i16 %a, i16 %b) {
+; CHECK: mul.wide.s16
+ %val0 = sext i16 %a to i32
+ %val1 = sext i16 %b to i32
+ %val2 = mul i32 %val0, %val1
+ ret i32 %val2
+}
+
+; CHECK: mulwideu16
+define i32 @mulwideu16(i16 %a, i16 %b) {
+; CHECK: mul.wide.u16
+ %val0 = zext i16 %a to i32
+ %val1 = zext i16 %b to i32
+ %val2 = mul i32 %val0, %val1
+ ret i32 %val2
+}
+
+; CHECK: mulwide32
+define i64 @mulwide32(i32 %a, i32 %b) {
+; CHECK: mul.wide.s32
+ %val0 = sext i32 %a to i64
+ %val1 = sext i32 %b to i64
+ %val2 = mul i64 %val0, %val1
+ ret i64 %val2
+}
+
+; CHECK: mulwideu32
+define i64 @mulwideu32(i32 %a, i32 %b) {
+; CHECK: mul.wide.u32
+ %val0 = zext i32 %a to i64
+ %val1 = zext i32 %b to i64
+ %val2 = mul i64 %val0, %val1
+ ret i64 %val2
+}
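The selection rule behind all four cases (a summary, not part of the patch): a product of two operands extended from half width cannot overflow the wide type, so the extend-then-multiply pair maps to a single widening multiply whose signedness follows the extension:

;   mul (sext iN a), (sext iN b)  ->  mul.wide.sN
;   mul (zext iN a), (zext iN b)  ->  mul.wide.uN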
diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll
index 0d02194..21e9c69 100644
--- a/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=0 -O2 | FileCheck %s --check-prefix=USE_MUL_0
; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=1 -O2 | FileCheck %s --check-prefix=USE_MUL_1
-@str = private addrspace(4) unnamed_addr constant [8 x i8] c"USE_MUL\00"
+@str = private unnamed_addr addrspace(4) constant [8 x i8] c"USE_MUL\00"
declare i32 @__nvvm_reflect(i8*)
declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*)
@@ -32,3 +32,17 @@ exit:
%ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
ret float %ret
}
+
+declare i32 @llvm.nvvm.reflect.p0i8(i8*)
+
+; USE_MUL_0: define i32 @intrinsic
+; USE_MUL_1: define i32 @intrinsic
+define i32 @intrinsic() {
+; USE_MUL_0-NOT: call i32 @llvm.nvvm.reflect
+; USE_MUL_0: ret i32 0
+; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect
+; USE_MUL_1: ret i32 1
+ %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+ %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr)
+ ret i32 %reflect
+}
diff --git a/test/CodeGen/NVPTX/rotate.ll b/test/CodeGen/NVPTX/rotate.ll
new file mode 100644
index 0000000..dfc8b4f
--- /dev/null
+++ b/test/CodeGen/NVPTX/rotate.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+
+
+declare i32 @llvm.nvvm.rotate.b32(i32, i32)
+declare i64 @llvm.nvvm.rotate.b64(i64, i32)
+declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
+
+; SM20: rotate32
+; SM35: rotate32
+define i32 @rotate32(i32 %a, i32 %b) {
+; SM20: shl.b32
+; SM20: sub.s32
+; SM20: shr.b32
+; SM20: add.u32
+; SM35: shf.l.wrap.b32
+ %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
+ ret i32 %val
+}
+
+; SM20: rotate64
+; SM35: rotate64
+define i64 @rotate64(i64 %a, i32 %b) {
+; SM20: shl.b64
+; SM20: sub.u32
+; SM20: shr.b64
+; SM20: add.u64
+; SM35: shf.l.wrap.b32
+; SM35: shf.l.wrap.b32
+ %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
+ ret i64 %val
+}
+
+; SM20: rotateright64
+; SM35: rotateright64
+define i64 @rotateright64(i64 %a, i32 %b) {
+; SM20: shr.b64
+; SM20: sub.u32
+; SM20: shl.b64
+; SM20: add.u64
+; SM35: shf.r.wrap.b32
+; SM35: shf.r.wrap.b32
+ %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
+ ret i64 %val
+}
+
+; SM20: rotl0
+; SM35: rotl0
+define i32 @rotl0(i32 %x) {
+; SM20: shl.b32
+; SM20: shr.b32
+; SM20: add.u32
+; SM35: shf.l.wrap.b32
+ %t0 = shl i32 %x, 8
+ %t1 = lshr i32 %x, 24
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
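A note on rotl0 (not part of the patch): (x << 8) | (x >> 24) is a rotate-left by 8 for i32, since the shift amounts sum to the bit width. sm_35 expresses a rotate as one funnel shift, roughly:

;   shf.l.wrap.b32 %d, %x, %x, %n    ; rotl32(x, n); operand order per PTX shf (a sketch)

sm_20 has no funnel shift, so each rotate expands into the shl/shr plus sub/add sequences checked above, and the 64-bit rotates need two 32-bit funnel shifts even on sm_35.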
diff --git a/test/CodeGen/NVPTX/shift-parts.ll b/test/CodeGen/NVPTX/shift-parts.ll
new file mode 100644
index 0000000..748297c
--- /dev/null
+++ b/test/CodeGen/NVPTX/shift-parts.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: shift_parts_left_128
+define void @shift_parts_left_128(i128* %val, i128* %amtptr) {
+; CHECK: shl.b64
+; CHECK: mov.u32
+; CHECK: sub.s32
+; CHECK: shr.u64
+; CHECK: or.b64
+; CHECK: add.s32
+; CHECK: shl.b64
+; CHECK: setp.gt.s32
+; CHECK: selp.b64
+; CHECK: shl.b64
+ %amt = load i128* %amtptr
+ %a = load i128* %val
+ %val0 = shl i128 %a, %amt
+ store i128 %val0, i128* %val
+ ret void
+}
+
+; CHECK: shift_parts_right_128
+define void @shift_parts_right_128(i128* %val, i128* %amtptr) {
+; CHECK: shr.u64
+; CHECK: sub.s32
+; CHECK: shl.b64
+; CHECK: or.b64
+; CHECK: add.s32
+; CHECK: shr.s64
+; CHECK: setp.gt.s32
+; CHECK: selp.b64
+; CHECK: shr.s64
+ %amt = load i128* %amtptr
+ %a = load i128* %val
+ %val0 = ashr i128 %a, %amt
+ store i128 %val0, i128* %val
+ ret void
+}
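What these checks capture (a summary, not part of the patch): a variable i128 shift is expanded into two 64-bit halves with a compare-and-select for amounts of 64 or more. For the left shift:

;   lo' = (amt < 64) ? (lo << amt)                      : 0
;   hi' = (amt < 64) ? (hi << amt) | (lo >> (64 - amt)) : lo << (amt - 64)

The setp.gt.s32/selp.b64 pair implements the amt >= 64 choice; the arithmetic right shift mirrors this with shr.s64 and sign bits in place of zeros.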
diff --git a/test/CodeGen/NVPTX/weak-global.ll b/test/CodeGen/NVPTX/weak-global.ll
new file mode 100644
index 0000000..2bef4c5
--- /dev/null
+++ b/test/CodeGen/NVPTX/weak-global.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; CHECK: .weak .global .align 4 .u32 g
+@g = common addrspace(1) global i32 zeroinitializer
+
+define i32 @func0() {
+ %val = load i32 addrspace(1)* @g
+ ret i32 %val
+}
diff --git a/test/CodeGen/NVPTX/weak-linkage.ll b/test/CodeGen/NVPTX/weak-linkage.ll
new file mode 100644
index 0000000..7a13357
--- /dev/null
+++ b/test/CodeGen/NVPTX/weak-linkage.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: .weak .func foo
+define weak void @foo() {
+ ret void
+}
+
+; CHECK: .visible .func bar
+define void @bar() {
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/Atomics-32.ll b/test/CodeGen/PowerPC/Atomics-32.ll
index b5c03e2..b7f23b1 100644
--- a/test/CodeGen/PowerPC/Atomics-32.ll
+++ b/test/CodeGen/PowerPC/Atomics-32.ll
@@ -529,63 +529,73 @@ define void @test_compare_and_swap() nounwind {
entry:
%0 = load i8* @uc, align 1
%1 = load i8* @sc, align 1
- %2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
+ %pair2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
+ %2 = extractvalue { i8, i1 } %pair2, 0
store i8 %2, i8* @sc, align 1
%3 = load i8* @uc, align 1
%4 = load i8* @sc, align 1
- %5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
+ %pair5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
+ %5 = extractvalue { i8, i1 } %pair5, 0
store i8 %5, i8* @uc, align 1
%6 = load i8* @uc, align 1
%7 = zext i8 %6 to i16
%8 = load i8* @sc, align 1
%9 = sext i8 %8 to i16
%10 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
+ %pair11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
+ %11 = extractvalue { i16, i1 } %pair11, 0
store i16 %11, i16* @ss, align 2
%12 = load i8* @uc, align 1
%13 = zext i8 %12 to i16
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
%16 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
+ %pair17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
+ %17 = extractvalue { i16, i1 } %pair17, 0
store i16 %17, i16* @us, align 2
%18 = load i8* @uc, align 1
%19 = zext i8 %18 to i32
%20 = load i8* @sc, align 1
%21 = sext i8 %20 to i32
%22 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
+ %pair23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
+ %23 = extractvalue { i32, i1 } %pair23, 0
store i32 %23, i32* @si, align 4
%24 = load i8* @uc, align 1
%25 = zext i8 %24 to i32
%26 = load i8* @sc, align 1
%27 = sext i8 %26 to i32
%28 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
+ %pair29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
+ %29 = extractvalue { i32, i1 } %pair29, 0
store i32 %29, i32* @ui, align 4
%30 = load i8* @uc, align 1
%31 = zext i8 %30 to i32
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i32
%34 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
- %35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic monotonic
+ %pair35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic monotonic
+ %35 = extractvalue { i32, i1 } %pair35, 0
store i32 %35, i32* @sl, align 4
%36 = load i8* @uc, align 1
%37 = zext i8 %36 to i32
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i32
%40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
- %41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic monotonic
+ %pair41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic monotonic
+ %41 = extractvalue { i32, i1 } %pair41, 0
store i32 %41, i32* @ul, align 4
%42 = load i8* @uc, align 1
%43 = load i8* @sc, align 1
- %44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
+ %pair44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
+ %44 = extractvalue { i8, i1 } %pair44, 0
%45 = icmp eq i8 %44, %42
%46 = zext i1 %45 to i32
store i32 %46, i32* @ui, align 4
%47 = load i8* @uc, align 1
%48 = load i8* @sc, align 1
- %49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic monotonic
+ %pair49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic monotonic
+ %49 = extractvalue { i8, i1 } %pair49, 0
%50 = icmp eq i8 %49, %47
%51 = zext i1 %50 to i32
store i32 %51, i32* @ui, align 4
@@ -594,7 +604,8 @@ entry:
%54 = load i8* @sc, align 1
%55 = sext i8 %54 to i16
%56 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
- %57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic monotonic
+ %pair57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic monotonic
+ %57 = extractvalue { i16, i1 } %pair57, 0
%58 = icmp eq i16 %57, %53
%59 = zext i1 %58 to i32
store i32 %59, i32* @ui, align 4
@@ -603,7 +614,8 @@ entry:
%62 = load i8* @sc, align 1
%63 = sext i8 %62 to i16
%64 = bitcast i8* bitcast (i16* @us to i8*) to i16*
- %65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic monotonic
+ %pair65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic monotonic
+ %65 = extractvalue { i16, i1 } %pair65, 0
%66 = icmp eq i16 %65, %61
%67 = zext i1 %66 to i32
store i32 %67, i32* @ui, align 4
@@ -612,7 +624,8 @@ entry:
%70 = load i8* @sc, align 1
%71 = sext i8 %70 to i32
%72 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic monotonic
+ %pair73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic monotonic
+ %73 = extractvalue { i32, i1 } %pair73, 0
%74 = icmp eq i32 %73, %69
%75 = zext i1 %74 to i32
store i32 %75, i32* @ui, align 4
@@ -621,7 +634,8 @@ entry:
%78 = load i8* @sc, align 1
%79 = sext i8 %78 to i32
%80 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic monotonic
+ %pair81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic monotonic
+ %81 = extractvalue { i32, i1 } %pair81, 0
%82 = icmp eq i32 %81, %77
%83 = zext i1 %82 to i32
store i32 %83, i32* @ui, align 4
@@ -630,7 +644,8 @@ entry:
%86 = load i8* @sc, align 1
%87 = sext i8 %86 to i32
%88 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
- %89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic monotonic
+ %pair89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic monotonic
+ %89 = extractvalue { i32, i1 } %pair89, 0
%90 = icmp eq i32 %89, %85
%91 = zext i1 %90 to i32
store i32 %91, i32* @ui, align 4
@@ -639,7 +654,8 @@ entry:
%94 = load i8* @sc, align 1
%95 = sext i8 %94 to i32
%96 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
- %97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic monotonic
+ %pair97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic monotonic
+ %97 = extractvalue { i32, i1 } %pair97, 0
%98 = icmp eq i32 %97, %93
%99 = zext i1 %98 to i32
store i32 %99, i32* @ui, align 4
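The mechanical rewrite applied throughout this file, summarized once: cmpxchg now returns a { value, success } pair rather than a bare value, so each old result is recovered with an extractvalue at index 0:

;   %pair = cmpxchg i32* %p, i32 %old, i32 %new monotonic monotonic
;   %val  = extractvalue { i32, i1 } %pair, 0    ; the previously stored value
;   %ok   = extractvalue { i32, i1 } %pair, 1    ; i1 success flag (unused here)

The same change appears in atomic-1.ll and atomic-2.ll below.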
diff --git a/test/CodeGen/PowerPC/Frames-alloca.ll b/test/CodeGen/PowerPC/Frames-alloca.ll
index 4588bc0..c701fef 100644
--- a/test/CodeGen/PowerPC/Frames-alloca.ll
+++ b/test/CodeGen/PowerPC/Frames-alloca.ll
@@ -12,15 +12,15 @@
; CHECK-PPC32-NOFP: stw r31, -4(r1)
; CHECK-PPC32-NOFP: lwz r1, 0(r1)
; CHECK-PPC32-NOFP: lwz r31, -4(r1)
-; CHECK-PPC32-RS: stwu r1, -80(r1)
-; CHECK-PPC32-RS-NOFP: stwu r1, -80(r1)
+; CHECK-PPC32-RS: stwu r1, -48(r1)
+; CHECK-PPC32-RS-NOFP: stwu r1, -48(r1)
; CHECK-PPC64: std r31, -8(r1)
-; CHECK-PPC64: stdu r1, -128(r1)
+; CHECK-PPC64: stdu r1, -64(r1)
; CHECK-PPC64: ld r1, 0(r1)
; CHECK-PPC64: ld r31, -8(r1)
; CHECK-PPC64-NOFP: std r31, -8(r1)
-; CHECK-PPC64-NOFP: stdu r1, -128(r1)
+; CHECK-PPC64-NOFP: stdu r1, -64(r1)
; CHECK-PPC64-NOFP: ld r1, 0(r1)
; CHECK-PPC64-NOFP: ld r31, -8(r1)
diff --git a/test/CodeGen/PowerPC/Frames-large.ll b/test/CodeGen/PowerPC/Frames-large.ll
index d07fea7..0ccea42 100644
--- a/test/CodeGen/PowerPC/Frames-large.ll
+++ b/test/CodeGen/PowerPC/Frames-large.ll
@@ -15,9 +15,9 @@ define i32* @f1() nounwind {
; PPC32-NOFP: _f1:
; PPC32-NOFP: lis r0, -1
-; PPC32-NOFP: ori r0, r0, 32704
+; PPC32-NOFP: ori r0, r0, 32736
; PPC32-NOFP: stwux r1, r1, r0
-; PPC32-NOFP: addi r3, r1, 68
+; PPC32-NOFP: addi r3, r1, 36
; PPC32-NOFP: lwz r1, 0(r1)
; PPC32-NOFP: blr
@@ -25,10 +25,10 @@ define i32* @f1() nounwind {
; PPC32-FP: _f1:
; PPC32-FP: lis r0, -1
; PPC32-FP: stw r31, -4(r1)
-; PPC32-FP: ori r0, r0, 32704
+; PPC32-FP: ori r0, r0, 32736
; PPC32-FP: stwux r1, r1, r0
; PPC32-FP: mr r31, r1
-; PPC32-FP: addi r3, r31, 64
+; PPC32-FP: addi r3, r31, 32
; PPC32-FP: lwz r1, 0(r1)
; PPC32-FP: lwz r31, -4(r1)
; PPC32-FP: blr
@@ -36,9 +36,9 @@ define i32* @f1() nounwind {
; PPC64-NOFP: _f1:
; PPC64-NOFP: lis r0, -1
-; PPC64-NOFP: ori r0, r0, 32656
+; PPC64-NOFP: ori r0, r0, 32720
; PPC64-NOFP: stdux r1, r1, r0
-; PPC64-NOFP: addi r3, r1, 116
+; PPC64-NOFP: addi r3, r1, 52
; PPC64-NOFP: ld r1, 0(r1)
; PPC64-NOFP: blr
@@ -46,10 +46,10 @@ define i32* @f1() nounwind {
; PPC64-FP: _f1:
; PPC64-FP: lis r0, -1
; PPC64-FP: std r31, -8(r1)
-; PPC64-FP: ori r0, r0, 32640
+; PPC64-FP: ori r0, r0, 32704
; PPC64-FP: stdux r1, r1, r0
; PPC64-FP: mr r31, r1
-; PPC64-FP: addi r3, r31, 124
+; PPC64-FP: addi r3, r31, 60
; PPC64-FP: ld r1, 0(r1)
; PPC64-FP: ld r31, -8(r1)
; PPC64-FP: blr
diff --git a/test/CodeGen/PowerPC/Frames-small.ll b/test/CodeGen/PowerPC/Frames-small.ll
index 0f6bd10..28c1a5b 100644
--- a/test/CodeGen/PowerPC/Frames-small.ll
+++ b/test/CodeGen/PowerPC/Frames-small.ll
@@ -1,25 +1,25 @@
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -o %t1
; RUN: not grep "stw r31, -4(r1)" %t1
-; RUN: grep "stwu r1, -16448(r1)" %t1
-; RUN: grep "addi r1, r1, 16448" %t1
+; RUN: grep "stwu r1, -16416(r1)" %t1
+; RUN: grep "addi r1, r1, 16416" %t1
; RUN: llc < %s -march=ppc32 | \
; RUN: not grep "lwz r31, -4(r1)"
; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \
; RUN: -o %t2
; RUN: grep "stw r31, -4(r1)" %t2
-; RUN: grep "stwu r1, -16448(r1)" %t2
-; RUN: grep "addi r1, r1, 16448" %t2
+; RUN: grep "stwu r1, -16416(r1)" %t2
+; RUN: grep "addi r1, r1, 16416" %t2
; RUN: grep "lwz r31, -4(r1)" %t2
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -o %t3
; RUN: not grep "std r31, -8(r1)" %t3
-; RUN: grep "stdu r1, -16496(r1)" %t3
-; RUN: grep "addi r1, r1, 16496" %t3
+; RUN: grep "stdu r1, -16432(r1)" %t3
+; RUN: grep "addi r1, r1, 16432" %t3
; RUN: not grep "ld r31, -8(r1)" %t3
; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim \
; RUN: -o %t4
; RUN: grep "std r31, -8(r1)" %t4
-; RUN: grep "stdu r1, -16512(r1)" %t4
-; RUN: grep "addi r1, r1, 16512" %t4
+; RUN: grep "stdu r1, -16448(r1)" %t4
+; RUN: grep "addi r1, r1, 16448" %t4
; RUN: grep "ld r31, -8(r1)" %t4
define i32* @f1() {
diff --git a/test/CodeGen/PowerPC/atomic-1.ll b/test/CodeGen/PowerPC/atomic-1.ll
index 083df47..997a016 100644
--- a/test/CodeGen/PowerPC/atomic-1.ll
+++ b/test/CodeGen/PowerPC/atomic-1.ll
@@ -11,7 +11,8 @@ define i32 @exchange_and_add(i32* %mem, i32 %val) nounwind {
define i32 @exchange_and_cmp(i32* %mem) nounwind {
; CHECK-LABEL: exchange_and_cmp:
; CHECK: lwarx
- %tmp = cmpxchg i32* %mem, i32 0, i32 1 monotonic monotonic
+ %tmppair = cmpxchg i32* %mem, i32 0, i32 1 monotonic monotonic
+ %tmp = extractvalue { i32, i1 } %tmppair, 0
; CHECK: stwcx.
; CHECK: stwcx.
ret i32 %tmp
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 261335e..843250f 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -11,7 +11,8 @@ define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
define i64 @exchange_and_cmp(i64* %mem) nounwind {
; CHECK-LABEL: exchange_and_cmp:
; CHECK: ldarx
- %tmp = cmpxchg i64* %mem, i64 0, i64 1 monotonic monotonic
+ %tmppair = cmpxchg i64* %mem, i64 0, i64 1 monotonic monotonic
+ %tmp = extractvalue { i64, i1 } %tmppair, 0
; CHECK: stdcx.
; CHECK: stdcx.
ret i64 %tmp
diff --git a/test/CodeGen/PowerPC/early-ret2.ll b/test/CodeGen/PowerPC/early-ret2.ll
index a8e456f..1784777 100644
--- a/test/CodeGen/PowerPC/early-ret2.ll
+++ b/test/CodeGen/PowerPC/early-ret2.ll
@@ -11,7 +11,7 @@ while.body.lr.ph: ; preds = %entry
br i1 undef, label %while.end, label %while.body
while.body: ; preds = %while.body, %while.body.lr.ph
- br i1 false, label %while.end, label %while.body, !llvm.vectorizer.already_vectorized !0
+ br i1 false, label %while.end, label %while.body, !llvm.loop.vectorize.already_vectorized !0
while.end: ; preds = %while.body, %while.body.lr.ph, %entry
ret void
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll
index db0d8ed..ac41e8c 100644
--- a/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll
+++ b/test/CodeGen/PowerPC/fast-isel-conversion-p5.ll
@@ -116,18 +116,6 @@ entry:
ret void
}
-define void @fptoui_float_i64(float %a) nounwind ssp {
-entry:
-; ELF64: fptoui_float_i64
- %b.addr = alloca i64, align 4
- %conv = fptoui float %a to i64
-; ELF64: fctiduz
-; ELF64: stfd
-; ELF64: ld
- store i64 %conv, i64* %b.addr, align 4
- ret void
-}
-
define void @fptoui_double_i32(double %a) nounwind ssp {
entry:
; ELF64: fptoui_double_i32
@@ -140,14 +128,3 @@ entry:
ret void
}
-define void @fptoui_double_i64(double %a) nounwind ssp {
-entry:
-; ELF64: fptoui_double_i64
- %b.addr = alloca i64, align 8
- %conv = fptoui double %a to i64
-; ELF64: fctiduz
-; ELF64: stfd
-; ELF64: ld
- store i64 %conv, i64* %b.addr, align 8
- ret void
-}
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll
index a31c312..5e00675 100644
--- a/test/CodeGen/PowerPC/fast-isel-conversion.ll
+++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll
@@ -1,15 +1,24 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 | FileCheck %s --check-prefix=PPC970
+
+;; Tests for 970 don't use -fast-isel-abort because we intentionally punt
+;; to SelectionDAG in some cases.
; Test sitofp
define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp {
entry:
; ELF64: sitofp_single_i64
+; PPC970: sitofp_single_i64
%b.addr = alloca float, align 4
%conv = sitofp i64 %a to float
; ELF64: std
; ELF64: lfd
; ELF64: fcfids
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -17,11 +26,16 @@ entry:
define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp {
entry:
; ELF64: sitofp_single_i32
+; PPC970: sitofp_single_i32
%b.addr = alloca float, align 4
%conv = sitofp i32 %a to float
; ELF64: std
; ELF64: lfiwax
; ELF64: fcfids
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -29,12 +43,18 @@ entry:
define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp {
entry:
; ELF64: sitofp_single_i16
+; PPC970: sitofp_single_i16
%b.addr = alloca float, align 4
%conv = sitofp i16 %a to float
; ELF64: extsh
; ELF64: std
; ELF64: lfd
; ELF64: fcfids
+; PPC970: extsh
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -42,12 +62,18 @@ entry:
define void @sitofp_single_i8(i8 %a) nounwind ssp {
entry:
; ELF64: sitofp_single_i8
+; PPC970: sitofp_single_i8
%b.addr = alloca float, align 4
%conv = sitofp i8 %a to float
; ELF64: extsb
; ELF64: std
; ELF64: lfd
; ELF64: fcfids
+; PPC970: extsb
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -55,11 +81,15 @@ entry:
define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
entry:
; ELF64: sitofp_double_i32
+; PPC970: sitofp_double_i32
%b.addr = alloca double, align 8
%conv = sitofp i32 %a to double
; ELF64: std
; ELF64: lfiwax
; ELF64: fcfid
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -67,11 +97,15 @@ entry:
define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
entry:
; ELF64: sitofp_double_i64
+; PPC970: sitofp_double_i64
%b.addr = alloca double, align 8
%conv = sitofp i64 %a to double
; ELF64: std
; ELF64: lfd
; ELF64: fcfid
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -79,12 +113,17 @@ entry:
define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
entry:
; ELF64: sitofp_double_i16
+; PPC970: sitofp_double_i16
%b.addr = alloca double, align 8
%conv = sitofp i16 %a to double
; ELF64: extsh
; ELF64: std
; ELF64: lfd
; ELF64: fcfid
+; PPC970: extsh
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -92,12 +131,17 @@ entry:
define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
entry:
; ELF64: sitofp_double_i8
+; PPC970: sitofp_double_i8
%b.addr = alloca double, align 8
%conv = sitofp i8 %a to double
; ELF64: extsb
; ELF64: std
; ELF64: lfd
; ELF64: fcfid
+; PPC970: extsb
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -107,11 +151,13 @@ entry:
define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp {
entry:
; ELF64: uitofp_single_i64
+; PPC970: uitofp_single_i64
%b.addr = alloca float, align 4
%conv = uitofp i64 %a to float
; ELF64: std
; ELF64: lfd
; ELF64: fcfidus
+; PPC970-NOT: fcfidus
store float %conv, float* %b.addr, align 4
ret void
}
@@ -119,11 +165,14 @@ entry:
define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp {
entry:
; ELF64: uitofp_single_i32
+; PPC970: uitofp_single_i32
%b.addr = alloca float, align 4
%conv = uitofp i32 %a to float
; ELF64: std
; ELF64: lfiwzx
; ELF64: fcfidus
+; PPC970-NOT: lfiwzx
+; PPC970-NOT: fcfidus
store float %conv, float* %b.addr, align 4
ret void
}
@@ -131,12 +180,18 @@ entry:
define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp {
entry:
; ELF64: uitofp_single_i16
+; PPC970: uitofp_single_i16
%b.addr = alloca float, align 4
%conv = uitofp i16 %a to float
; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
; ELF64: std
; ELF64: lfd
; ELF64: fcfidus
+; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -144,12 +199,18 @@ entry:
define void @uitofp_single_i8(i8 %a) nounwind ssp {
entry:
; ELF64: uitofp_single_i8
+; PPC970: uitofp_single_i8
%b.addr = alloca float, align 4
%conv = uitofp i8 %a to float
; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
; ELF64: std
; ELF64: lfd
; ELF64: fcfidus
+; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
+; PPC970: frsp
store float %conv, float* %b.addr, align 4
ret void
}
@@ -157,11 +218,13 @@ entry:
define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp {
entry:
; ELF64: uitofp_double_i64
+; PPC970: uitofp_double_i64
%b.addr = alloca double, align 8
%conv = uitofp i64 %a to double
; ELF64: std
; ELF64: lfd
; ELF64: fcfidu
+; PPC970-NOT: fcfidu
store double %conv, double* %b.addr, align 8
ret void
}
@@ -169,11 +232,14 @@ entry:
define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp {
entry:
; ELF64: uitofp_double_i32
+; PPC970: uitofp_double_i32
%b.addr = alloca double, align 8
%conv = uitofp i32 %a to double
; ELF64: std
; ELF64: lfiwzx
; ELF64: fcfidu
+; PPC970-NOT: lfiwzx
+; PPC970-NOT: fcfidu
store double %conv, double* %b.addr, align 8
ret void
}
@@ -181,12 +247,17 @@ entry:
define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp {
entry:
; ELF64: uitofp_double_i16
+; PPC970: uitofp_double_i16
%b.addr = alloca double, align 8
%conv = uitofp i16 %a to double
; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
; ELF64: std
; ELF64: lfd
; ELF64: fcfidu
+; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -194,12 +265,17 @@ entry:
define void @uitofp_double_i8(i8 %a, double %b) nounwind ssp {
entry:
; ELF64: uitofp_double_i8
+; PPC970: uitofp_double_i8
%b.addr = alloca double, align 8
%conv = uitofp i8 %a to double
; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
; ELF64: std
; ELF64: lfd
; ELF64: fcfidu
+; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
+; PPC970: std
+; PPC970: lfd
+; PPC970: fcfid
store double %conv, double* %b.addr, align 8
ret void
}
@@ -209,11 +285,15 @@ entry:
define void @fptosi_float_i32(float %a) nounwind ssp {
entry:
; ELF64: fptosi_float_i32
+; PPC970: fptosi_float_i32
%b.addr = alloca i32, align 4
%conv = fptosi float %a to i32
; ELF64: fctiwz
; ELF64: stfd
; ELF64: lwa
+; PPC970: fctiwz
+; PPC970: stfd
+; PPC970: lwa
store i32 %conv, i32* %b.addr, align 4
ret void
}
@@ -221,11 +301,15 @@ entry:
define void @fptosi_float_i64(float %a) nounwind ssp {
entry:
; ELF64: fptosi_float_i64
+; PPC970: fptosi_float_i64
%b.addr = alloca i64, align 4
%conv = fptosi float %a to i64
; ELF64: fctidz
; ELF64: stfd
; ELF64: ld
+; PPC970: fctidz
+; PPC970: stfd
+; PPC970: ld
store i64 %conv, i64* %b.addr, align 4
ret void
}
@@ -233,11 +317,15 @@ entry:
define void @fptosi_double_i32(double %a) nounwind ssp {
entry:
; ELF64: fptosi_double_i32
+; PPC970: fptosi_double_i32
%b.addr = alloca i32, align 8
%conv = fptosi double %a to i32
; ELF64: fctiwz
; ELF64: stfd
; ELF64: lwa
+; PPC970: fctiwz
+; PPC970: stfd
+; PPC970: lwa
store i32 %conv, i32* %b.addr, align 8
ret void
}
@@ -245,11 +333,15 @@ entry:
define void @fptosi_double_i64(double %a) nounwind ssp {
entry:
; ELF64: fptosi_double_i64
+; PPC970: fptosi_double_i64
%b.addr = alloca i64, align 8
%conv = fptosi double %a to i64
; ELF64: fctidz
; ELF64: stfd
; ELF64: ld
+; PPC970: fctidz
+; PPC970: stfd
+; PPC970: ld
store i64 %conv, i64* %b.addr, align 8
ret void
}
@@ -259,11 +351,15 @@ entry:
define void @fptoui_float_i32(float %a) nounwind ssp {
entry:
; ELF64: fptoui_float_i32
+; PPC970: fptoui_float_i32
%b.addr = alloca i32, align 4
%conv = fptoui float %a to i32
; ELF64: fctiwuz
; ELF64: stfd
; ELF64: lwz
+; PPC970: fctidz
+; PPC970: stfd
+; PPC970: lwz
store i32 %conv, i32* %b.addr, align 4
ret void
}
@@ -271,11 +367,13 @@ entry:
define void @fptoui_float_i64(float %a) nounwind ssp {
entry:
; ELF64: fptoui_float_i64
+; PPC970: fptoui_float_i64
%b.addr = alloca i64, align 4
%conv = fptoui float %a to i64
; ELF64: fctiduz
; ELF64: stfd
; ELF64: ld
+; PPC970-NOT: fctiduz
store i64 %conv, i64* %b.addr, align 4
ret void
}
@@ -283,11 +381,15 @@ entry:
define void @fptoui_double_i32(double %a) nounwind ssp {
entry:
; ELF64: fptoui_double_i32
+; PPC970: fptoui_double_i32
%b.addr = alloca i32, align 8
%conv = fptoui double %a to i32
; ELF64: fctiwuz
; ELF64: stfd
; ELF64: lwz
+; PPC970: fctidz
+; PPC970: stfd
+; PPC970: lwz
store i32 %conv, i32* %b.addr, align 8
ret void
}
@@ -295,11 +397,13 @@ entry:
define void @fptoui_double_i64(double %a) nounwind ssp {
entry:
; ELF64: fptoui_double_i64
+; PPC970: fptoui_double_i64
%b.addr = alloca i64, align 8
%conv = fptoui double %a to i64
; ELF64: fctiduz
; ELF64: stfd
; ELF64: ld
+; PPC970-NOT: fctiduz
store i64 %conv, i64* %b.addr, align 8
ret void
}
diff --git a/test/CodeGen/PowerPC/func-addr.ll b/test/CodeGen/PowerPC/func-addr.ll
new file mode 100644
index 0000000..4533c62
--- /dev/null
+++ b/test/CodeGen/PowerPC/func-addr.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple powerpc64-linux < %s | FileCheck %s
+; RUN: llc -O0 -mtriple powerpc64-linux < %s | FileCheck %s
+
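+; Check that the address of @foo is materialized from the TOC
+; (foo@toc@ha / foo@toc@l) when passed as a call argument, both with
+; optimization and at -O0.
+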
+define void @foo() {
+ ret void
+}
+declare i32 @bar(i8*)
+
+; CHECK-LABEL: {{^}}zed:
+; CHECK: addis 3, 2, foo@toc@ha
+; CHECK-NEXT: addi 3, 3, foo@toc@l
+; CHECK-NEXT: bl bar
+
+define void @zed() {
+ call i32 @bar(i8* bitcast (void ()* @foo to i8*))
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s
index 1e3fb8f..97dfbb5 100644
--- a/test/CodeGen/PowerPC/hello-reloc.s
+++ b/test/CodeGen/PowerPC/hello-reloc.s
@@ -62,17 +62,17 @@ L_.str: ; @.str
; DARWIN-G4-DUMP:AddressSize: 32bit
; DARWIN-G4-DUMP:Relocations [
; DARWIN-G4-DUMP: Section __text {
-; DARWIN-G4-DUMP: 0x34 1 2 0 PPC_RELOC_BR24 0 -
-; DARWIN-G4-DUMP: 0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
-; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
-; DARWIN-G4-DUMP: 0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
-; DARWIN-G4-DUMP: 0x60 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: 0x34 1 2 0 PPC_RELOC_BR24 0 0x3
+; DARWIN-G4-DUMP: 0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x74
+; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x14
+; DARWIN-G4-DUMP: 0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x74
+; DARWIN-G4-DUMP: 0x60 0 2 n/a PPC_RELOC_PAIR 1 0x14
; DARWIN-G4-DUMP: }
; DARWIN-G4-DUMP: Section __picsymbolstub1 {
-; DARWIN-G4-DUMP: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
-; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
-; DARWIN-G4-DUMP: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
-; DARWIN-G4-DUMP: 0x18 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x70
+; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x58
+; DARWIN-G4-DUMP: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x70
+; DARWIN-G4-DUMP: 0x18 0 2 n/a PPC_RELOC_PAIR 1 0x58
; DARWIN-G4-DUMP: }
; DARWIN-G4-DUMP: Section __la_symbol_ptr {
; DARWIN-G4-DUMP: 0x0 0 2 1 PPC_RELOC_VANILLA 0 dyld_stub_binding_helper
diff --git a/test/CodeGen/PowerPC/lit.local.cfg b/test/CodeGen/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/CodeGen/PowerPC/lit.local.cfg
+++ b/test/CodeGen/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/PowerPC/ppc64-altivec-abi.ll b/test/CodeGen/PowerPC/ppc64-altivec-abi.ll
new file mode 100644
index 0000000..0bed329
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-altivec-abi.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ppc64 -mattr=+altivec | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Verify that in the 64-bit Linux ABI, vector arguments take up space
+; in the parameter save area.
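+; (With the parameter save area at offset 48, the layout below works out
+; to: %a at 48, %b at 64 after 16-byte alignment, %c at 80, %d at 96, and
+; %e at 112 -- which is what the 112(1) accesses check.)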
+
+define i64 @callee(i64 %a, <4 x i32> %b, i64 %c, <4 x i32> %d, i64 %e) {
+entry:
+ ret i64 %e
+}
+; CHECK-LABEL: callee:
+; CHECK: ld 3, 112(1)
+
+define void @caller(i64 %x, <4 x i32> %y) {
+entry:
+ tail call void @test(i64 %x, <4 x i32> %y, i64 %x, <4 x i32> %y, i64 %x)
+ ret void
+}
+; CHECK-LABEL: caller:
+; CHECK: std 3, 112(1)
+
+declare void @test(i64, <4 x i32>, i64, <4 x i32>, i64)
+
diff --git a/test/CodeGen/PowerPC/ppc64-byval-align.ll b/test/CodeGen/PowerPC/ppc64-byval-align.ll
new file mode 100644
index 0000000..0e73cf2
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-byval-align.ll
@@ -0,0 +1,56 @@
+; RUN: llc -O1 < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s
+
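+; A byval argument marked 'align 16' must land at a 16-byte-aligned offset
+; in the parameter save area: starting at 48, %x fills 48..111 and %y
+; takes 112, so %z is rounded up from 120 to 128 -- the offset the checks
+; below expect.
+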
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.test = type { i64, [8 x i8] }
+%struct.pad = type { [8 x i64] }
+
+@gt = common global %struct.test zeroinitializer, align 16
+@gp = common global %struct.pad zeroinitializer, align 8
+
+define signext i32 @callee1(i32 signext %x, %struct.test* byval align 16 nocapture readnone %y, i32 signext %z) {
+entry:
+ ret i32 %z
+}
+; CHECK-LABEL: @callee1
+; CHECK: mr 3, 7
+; CHECK: blr
+
+declare signext i32 @test1(i32 signext, %struct.test* byval align 16, i32 signext)
+define void @caller1(i32 signext %z) {
+entry:
+ %call = tail call signext i32 @test1(i32 signext 0, %struct.test* byval align 16 @gt, i32 signext %z)
+ ret void
+}
+; CHECK-LABEL: @caller1
+; CHECK: mr [[REG:[0-9]+]], 3
+; CHECK: mr 7, [[REG]]
+; CHECK: bl test1
+
+define i64 @callee2(%struct.pad* byval nocapture readnone %x, i32 signext %y, %struct.test* byval align 16 nocapture readonly %z) {
+entry:
+ %x1 = getelementptr inbounds %struct.test* %z, i64 0, i32 0
+ %0 = load i64* %x1, align 16
+ ret i64 %0
+}
+; CHECK-LABEL: @callee2
+; CHECK: ld [[REG:[0-9]+]], 128(1)
+; CHECK: mr 3, [[REG]]
+; CHECK: blr
+
+declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)
+define void @caller2(i64 %z) {
+entry:
+ %tmp = alloca %struct.test, align 16
+ %.compoundliteral.sroa.0.0..sroa_idx = getelementptr inbounds %struct.test* %tmp, i64 0, i32 0
+ store i64 %z, i64* %.compoundliteral.sroa.0.0..sroa_idx, align 16
+ %call = call i64 @test2(%struct.pad* byval @gp, i32 signext 0, %struct.test* byval align 16 %tmp)
+ ret void
+}
+; CHECK-LABEL: @caller2
+; CHECK: std 3, [[OFF:[0-9]+]](1)
+; CHECK: ld [[REG:[0-9]+]], [[OFF]](1)
+; CHECK: std [[REG]], 128(1)
+; CHECK: bl test2
+
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
index 1f3bb71..31794be 100644
--- a/test/CodeGen/PowerPC/ppc64-calls.ll
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -42,12 +42,18 @@ define void @test_indirect(void ()* nocapture %fp) nounwind {
ret void
}
-; Absolute vales should be have the TOC restore 'nop'
+; Absolute values must use the regular indirect call sequence.
+; The main purpose of this test is to ensure that BLA is not
+; used on 64-bit SVR4 (as it is, e.g., on Darwin).
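+; Under the 64-bit SVR4 (ELFv1) ABI, 1024 is treated as the address of a
+; function descriptor: the entry point is loaded from 1024(0), the TOC
+; pointer from 1032(0), and the environment pointer from 1040(0), the call
+; goes through CTR, and the caller's TOC is restored afterwards from its
+; 40(1) save slot.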
define void @test_abs() nounwind {
; CHECK-LABEL: test_abs:
tail call void inttoptr (i64 1024 to void ()*)() nounwind
-; CHECK: bla 1024
-; CHECK-NEXT: nop
+; CHECK: ld [[FP:[0-9]+]], 1024(0)
+; CHECK: ld 11, 1040(0)
+; CHECK: ld 2, 1032(0)
+; CHECK-NEXT: mtctr [[FP]]
+; CHECK-NEXT: bctrl
+; CHECK-NEXT: ld 2, 40(1)
ret void
}
diff --git a/test/CodeGen/PowerPC/ppc64-smallarg.ll b/test/CodeGen/PowerPC/ppc64-smallarg.ll
new file mode 100644
index 0000000..0d5b078
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-smallarg.ll
@@ -0,0 +1,59 @@
+; Verify that small structures and float arguments are passed in the
+; least significant part of a stack slot doubleword.
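+; (On big-endian that is the high-address half: the byval %x slot spans
+; 120..127, putting its four bytes at 124, and the 14th float argument's
+; slot is 48 + 13*8 = 152, putting the float at 156.)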
+
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.large_arg = type { [8 x i64] }
+%struct.small_arg = type { i16, i8 }
+
+@gl = common global %struct.large_arg zeroinitializer, align 8
+@gs = common global %struct.small_arg zeroinitializer, align 2
+@gf = common global float 0.000000e+00, align 4
+
+define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) {
+entry:
+ %0 = bitcast %struct.small_arg* %x to i32*
+ %1 = bitcast %struct.small_arg* %agg.result to i32*
+ %2 = load i32* %0, align 2
+ store i32 %2, i32* %1, align 2
+ ret void
+}
+; CHECK: @callee1
+; CHECK: lwz {{[0-9]+}}, 124(1)
+; CHECK: blr
+
+define void @caller1() {
+entry:
+ %tmp = alloca %struct.small_arg, align 2
+ call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs)
+ ret void
+}
+; CHECK: @caller1
+; CHECK: stw {{[0-9]+}}, 124(1)
+; CHECK: bl test1
+
+declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
+
+define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) {
+entry:
+ ret float %x
+}
+; CHECK: @callee2
+; CHECK: lfs {{[0-9]+}}, 156(1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+ %0 = load float* @gf, align 4
+ %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0)
+ ret void
+}
+; CHECK: @caller2
+; CHECK: stfs {{[0-9]+}}, 156(1)
+; CHECK: bl test2
+
+declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
+
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
new file mode 100644
index 0000000..fcb1e92
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -0,0 +1,59 @@
+; Verify that small structures and float arguments are passed in the
+; least significant part of a stack slot doubleword.
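+; (On little-endian the least significant part is the low-address half,
+; so the same accesses land at the start of each slot: 120 and 152
+; instead of the big-endian 124 and 156.)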
+
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%struct.large_arg = type { [8 x i64] }
+%struct.small_arg = type { i16, i8 }
+
+@gl = common global %struct.large_arg zeroinitializer, align 8
+@gs = common global %struct.small_arg zeroinitializer, align 2
+@gf = common global float 0.000000e+00, align 4
+
+define void @callee1(%struct.small_arg* noalias nocapture sret %agg.result, %struct.large_arg* byval nocapture readnone %pad, %struct.small_arg* byval nocapture readonly %x) {
+entry:
+ %0 = bitcast %struct.small_arg* %x to i32*
+ %1 = bitcast %struct.small_arg* %agg.result to i32*
+ %2 = load i32* %0, align 2
+ store i32 %2, i32* %1, align 2
+ ret void
+}
+; CHECK: @callee1
+; CHECK: lwz {{[0-9]+}}, 120(1)
+; CHECK: blr
+
+define void @caller1() {
+entry:
+ %tmp = alloca %struct.small_arg, align 2
+ call void @test1(%struct.small_arg* sret %tmp, %struct.large_arg* byval @gl, %struct.small_arg* byval @gs)
+ ret void
+}
+; CHECK: @caller1
+; CHECK: stw {{[0-9]+}}, 120(1)
+; CHECK: bl test1
+
+declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
+
+define float @callee2(float %pad1, float %pad2, float %pad3, float %pad4, float %pad5, float %pad6, float %pad7, float %pad8, float %pad9, float %pad10, float %pad11, float %pad12, float %pad13, float %x) {
+entry:
+ ret float %x
+}
+; CHECK: @callee2
+; CHECK: lfs {{[0-9]+}}, 152(1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+ %0 = load float* @gf, align 4
+ %call = tail call float @test2(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float %0)
+ ret void
+}
+; CHECK: @caller2
+; CHECK: stfs {{[0-9]+}}, 152(1)
+; CHECK: bl test2
+
+declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
+
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll
new file mode 100644
index 0000000..2a5f13a
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -0,0 +1,154 @@
+; RUN: llc -mcpu=pwr7 -mattr=+altivec < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
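+; A ppc_fp128 is a pair of doubles carried in (f1, f2); these tests pin
+; down the little-endian mapping between that register pair and the two
+; in-memory doublewords at offsets 0 and 8, for arguments, return values,
+; constants, and bitcasts.
+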
+@g = common global ppc_fp128 0xM00000000000000000000000000000000, align 16
+
+define void @callee(ppc_fp128 %x) {
+entry:
+ %x.addr = alloca ppc_fp128, align 16
+ store ppc_fp128 %x, ppc_fp128* %x.addr, align 16
+ %0 = load ppc_fp128* %x.addr, align 16
+ store ppc_fp128 %0, ppc_fp128* @g, align 16
+ ret void
+}
+; CHECK: @callee
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK: stfd 2, 8([[REG]])
+; CHECK: stfd 1, 0([[REG]])
+; CHECK: blr
+
+define void @caller() {
+entry:
+ %0 = load ppc_fp128* @g, align 16
+ call void @test(ppc_fp128 %0)
+ ret void
+}
+; CHECK: @caller
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK: lfd 2, 8([[REG]])
+; CHECK: lfd 1, 0([[REG]])
+; CHECK: bl test
+
+declare void @test(ppc_fp128)
+
+define void @caller_const() {
+entry:
+ call void @test(ppc_fp128 0xM3FF00000000000000000000000000000)
+ ret void
+}
+; CHECK: .LCPI[[LC:[0-9]+]]_0:
+; CHECK: .long 1065353216
+; CHECK: .LCPI[[LC]]_1:
+; CHECK: .long 0
+; CHECK: @caller_const
+; CHECK: addi [[REG0:[0-9]+]], {{[0-9]+}}, .LCPI[[LC]]_0
+; CHECK: addi [[REG1:[0-9]+]], {{[0-9]+}}, .LCPI[[LC]]_1
+; CHECK: lfs 1, 0([[REG0]])
+; CHECK: lfs 2, 0([[REG1]])
+; CHECK: bl test
+
+define ppc_fp128 @result() {
+entry:
+ %0 = load ppc_fp128* @g, align 16
+ ret ppc_fp128 %0
+}
+; CHECK: @result
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK: lfd 1, 0([[REG]])
+; CHECK: lfd 2, 8([[REG]])
+; CHECK: blr
+
+define void @use_result() {
+entry:
+ %call = tail call ppc_fp128 @test_result() #3
+ store ppc_fp128 %call, ppc_fp128* @g, align 16
+ ret void
+}
+; CHECK: @use_result
+; CHECK: bl test_result
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK: stfd 2, 8([[REG]])
+; CHECK: stfd 1, 0([[REG]])
+; CHECK: blr
+
+declare ppc_fp128 @test_result()
+
+define void @caller_result() {
+entry:
+ %call = tail call ppc_fp128 @test_result()
+ tail call void @test(ppc_fp128 %call)
+ ret void
+}
+; CHECK: @caller_result
+; CHECK: bl test_result
+; CHECK-NEXT: nop
+; CHECK-NEXT: bl test
+; CHECK-NEXT: nop
+
+define i128 @convert_from(ppc_fp128 %x) {
+entry:
+ %0 = bitcast ppc_fp128 %x to i128
+ ret i128 %0
+}
+; CHECK: @convert_from
+; CHECK: stfd 1, [[OFF1:.*]](1)
+; CHECK: stfd 2, [[OFF2:.*]](1)
+; CHECK: ld 3, [[OFF1]](1)
+; CHECK: ld 4, [[OFF2]](1)
+; CHECK: blr
+
+define ppc_fp128 @convert_to(i128 %x) {
+entry:
+ %0 = bitcast i128 %x to ppc_fp128
+ ret ppc_fp128 %0
+}
+; CHECK: @convert_to
+; CHECK: std 3, [[OFF1:.*]](1)
+; CHECK: std 4, [[OFF2:.*]](1)
+; CHECK: lfd 1, [[OFF1]](1)
+; CHECK: lfd 2, [[OFF2]](1)
+; CHECK: blr
+
+define ppc_fp128 @convert_to2(i128 %x) {
+entry:
+ %shl = shl i128 %x, 1
+ %0 = bitcast i128 %shl to ppc_fp128
+ ret ppc_fp128 %0
+}
+
+; CHECK: @convert_to
+; CHECK: std 3, [[OFF1:.*]](1)
+; CHECK: std 4, [[OFF2:.*]](1)
+; CHECK: lfd 1, [[OFF1]](1)
+; CHECK: lfd 2, [[OFF2]](1)
+; CHECK: blr
+
+define double @convert_vector(<4 x i32> %x) {
+entry:
+ %cast = bitcast <4 x i32> %x to ppc_fp128
+ %conv = fptrunc ppc_fp128 %cast to double
+ ret double %conv
+}
+; CHECK: @convert_vector
+; CHECK: addi [[REG:[0-9]+]], 1, [[OFF:.*]]
+; CHECK: stvx 2, 0, [[REG]]
+; CHECK: lfd 1, [[OFF]](1)
+; CHECK: blr
+
+declare void @llvm.va_start(i8*)
+
+define double @vararg(i32 %a, ...) {
+entry:
+ %va = alloca i8*, align 8
+ %va1 = bitcast i8** %va to i8*
+ call void @llvm.va_start(i8* %va1)
+ %arg = va_arg i8** %va, ppc_fp128
+ %conv = fptrunc ppc_fp128 %arg to double
+ ret double %conv
+}
+; CHECK: @vararg
+; CHECK: lfd 1, 0({{[0-9]+}})
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/resolvefi-basereg.ll b/test/CodeGen/PowerPC/resolvefi-basereg.ll
new file mode 100644
index 0000000..62c2d13
--- /dev/null
+++ b/test/CodeGen/PowerPC/resolvefi-basereg.ll
@@ -0,0 +1,362 @@
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+
+; Due to a bug in resolveFrameIndex we ended up with invalid addresses
+; containing a base register 0. Verify that this no longer happens.
+; CHECK-NOT: (0)
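+; (In a D-form address such as "ld 3, 16(0)", a base of 0 means the
+; literal value zero rather than r0, so any frame access printed with a
+; (0) base would be an absolute address -- never correct for a stack
+; slot.)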
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.Info = type { i32, i32, i8*, i8*, i8*, [32 x i8*], i64, [32 x i64], i64, i64, i64, [32 x i64] }
+%struct.S1998 = type { [2 x i32*], i64, i64, double, i16, i32, [29 x %struct.anon], i16, i8, i32, [8 x i8] }
+%struct.anon = type { [16 x double], i32, i16, i32, [3 x i8], [6 x i8], [4 x i32], i8 }
+
+@info = global %struct.Info zeroinitializer, align 8
+@fails = global i32 0, align 4
+@intarray = global [256 x i32] zeroinitializer, align 4
+@s1998 = global %struct.S1998 zeroinitializer, align 16
+@a1998 = external global [5 x %struct.S1998]
+
+define void @test1998() {
+entry:
+ %i = alloca i32, align 4
+ %j = alloca i32, align 4
+ %tmp = alloca i32, align 4
+ %agg.tmp = alloca %struct.S1998, align 16
+ %agg.tmp111 = alloca %struct.S1998, align 16
+ %agg.tmp112 = alloca %struct.S1998, align 16
+ %agg.tmp113 = alloca %struct.S1998, align 16
+ %agg.tmp114 = alloca %struct.S1998, align 16
+ %agg.tmp115 = alloca %struct.S1998, align 16
+ %agg.tmp116 = alloca %struct.S1998, align 16
+ %agg.tmp117 = alloca %struct.S1998, align 16
+ %agg.tmp118 = alloca %struct.S1998, align 16
+ %agg.tmp119 = alloca %struct.S1998, align 16
+ call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.S1998* @s1998 to i8*), i8 0, i64 5168, i32 16, i1 false)
+ call void @llvm.memset.p0i8.i64(i8* bitcast ([5 x %struct.S1998]* @a1998 to i8*), i8 0, i64 25840, i32 16, i1 false)
+ call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.Info* @info to i8*), i8 0, i64 832, i32 8, i1 false)
+ store i8* bitcast (%struct.S1998* @s1998 to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 2), align 8
+ store i8* bitcast ([5 x %struct.S1998]* @a1998 to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 3), align 8
+ store i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 3) to i8*), i8** getelementptr inbounds (%struct.Info* @info, i32 0, i32 4), align 8
+ store i64 5168, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 6), align 8
+ store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 8), align 8
+ store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 9), align 8
+ store i64 16, i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 10), align 8
+ %0 = load i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 8), align 8
+ %sub = sub i64 %0, 1
+ %and = and i64 ptrtoint (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 3) to i64), %sub
+ %tobool = icmp ne i64 %and, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %1 = load i32* @fails, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* @fails, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ store i32 0, i32* %i, align 4
+ store i32 0, i32* %j, align 4
+ %2 = load i32* %i, align 4
+ %idxprom = sext i32 %2 to i64
+ %arrayidx = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom
+ store i8* bitcast (i32** getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 0, i64 1) to i8*), i8** %arrayidx, align 8
+ %3 = load i32* %i, align 4
+ %idxprom1 = sext i32 %3 to i64
+ %arrayidx2 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom1
+ store i64 8, i64* %arrayidx2, align 8
+ %4 = load i32* %i, align 4
+ %idxprom3 = sext i32 %4 to i64
+ %arrayidx4 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom3
+ store i64 8, i64* %arrayidx4, align 8
+ store i32* getelementptr inbounds ([256 x i32]* @intarray, i32 0, i64 190), i32** getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 0, i64 1), align 8
+ store i32* getelementptr inbounds ([256 x i32]* @intarray, i32 0, i64 241), i32** getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 0, i64 1), align 8
+ %5 = load i32* %i, align 4
+ %inc5 = add nsw i32 %5, 1
+ store i32 %inc5, i32* %i, align 4
+ %6 = load i32* %i, align 4
+ %idxprom6 = sext i32 %6 to i64
+ %arrayidx7 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom6
+ store i8* bitcast (i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 1) to i8*), i8** %arrayidx7, align 8
+ %7 = load i32* %i, align 4
+ %idxprom8 = sext i32 %7 to i64
+ %arrayidx9 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom8
+ store i64 8, i64* %arrayidx9, align 8
+ %8 = load i32* %i, align 4
+ %idxprom10 = sext i32 %8 to i64
+ %arrayidx11 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom10
+ store i64 8, i64* %arrayidx11, align 8
+ store i64 -3866974208859106459, i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 1), align 8
+ store i64 -185376695371304091, i64* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 1), align 8
+ %9 = load i32* %i, align 4
+ %inc12 = add nsw i32 %9, 1
+ store i32 %inc12, i32* %i, align 4
+ %10 = load i32* %i, align 4
+ %idxprom13 = sext i32 %10 to i64
+ %arrayidx14 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom13
+ store i8* bitcast (i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 2) to i8*), i8** %arrayidx14, align 8
+ %11 = load i32* %i, align 4
+ %idxprom15 = sext i32 %11 to i64
+ %arrayidx16 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom15
+ store i64 8, i64* %arrayidx16, align 8
+ %12 = load i32* %i, align 4
+ %idxprom17 = sext i32 %12 to i64
+ %arrayidx18 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom17
+ store i64 8, i64* %arrayidx18, align 8
+ store i64 -963638028680427187, i64* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 2), align 8
+ store i64 7510542175772455554, i64* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 2), align 8
+ %13 = load i32* %i, align 4
+ %inc19 = add nsw i32 %13, 1
+ store i32 %inc19, i32* %i, align 4
+ %14 = load i32* %i, align 4
+ %idxprom20 = sext i32 %14 to i64
+ %arrayidx21 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom20
+ store i8* bitcast (double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 3) to i8*), i8** %arrayidx21, align 8
+ %15 = load i32* %i, align 4
+ %idxprom22 = sext i32 %15 to i64
+ %arrayidx23 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom22
+ store i64 8, i64* %arrayidx23, align 8
+ %16 = load i32* %i, align 4
+ %idxprom24 = sext i32 %16 to i64
+ %arrayidx25 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom24
+ store i64 16, i64* %arrayidx25, align 8
+ store double 0xC0F8783300000000, double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 3), align 16
+ store double 0xC10DF3CCC0000000, double* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 3), align 16
+ %17 = load i32* %i, align 4
+ %inc26 = add nsw i32 %17, 1
+ store i32 %inc26, i32* %i, align 4
+ %18 = load i32* %i, align 4
+ %idxprom27 = sext i32 %18 to i64
+ %arrayidx28 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom27
+ store i8* bitcast (i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 4) to i8*), i8** %arrayidx28, align 8
+ %19 = load i32* %i, align 4
+ %idxprom29 = sext i32 %19 to i64
+ %arrayidx30 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom29
+ store i64 2, i64* %arrayidx30, align 8
+ %20 = load i32* %i, align 4
+ %idxprom31 = sext i32 %20 to i64
+ %arrayidx32 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom31
+ store i64 2, i64* %arrayidx32, align 8
+ store i16 -15897, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 4), align 2
+ store i16 30935, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 4), align 2
+ %21 = load i32* %i, align 4
+ %inc33 = add nsw i32 %21, 1
+ store i32 %inc33, i32* %i, align 4
+ store i32 -419541644, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 5), align 4
+ store i32 2125926812, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 5), align 4
+ %22 = load i32* %j, align 4
+ %inc34 = add nsw i32 %22, 1
+ store i32 %inc34, i32* %j, align 4
+ %23 = load i32* %i, align 4
+ %idxprom35 = sext i32 %23 to i64
+ %arrayidx36 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom35
+ store i8* bitcast (double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 0, i64 0) to i8*), i8** %arrayidx36, align 8
+ %24 = load i32* %i, align 4
+ %idxprom37 = sext i32 %24 to i64
+ %arrayidx38 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom37
+ store i64 8, i64* %arrayidx38, align 8
+ %25 = load i32* %i, align 4
+ %idxprom39 = sext i32 %25 to i64
+ %arrayidx40 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom39
+ store i64 8, i64* %arrayidx40, align 8
+ store double 0xC0FC765780000000, double* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 0, i64 0), align 8
+ store double 0xC1025CD7A0000000, double* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 0, i64 0), align 8
+ %26 = load i32* %i, align 4
+ %inc41 = add nsw i32 %26, 1
+ store i32 %inc41, i32* %i, align 4
+ %bf.load = load i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 1), align 8
+ %bf.clear = and i32 %bf.load, 7
+ %bf.set = or i32 %bf.clear, 16
+ store i32 %bf.set, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 1), align 8
+ %bf.load42 = load i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 1), align 8
+ %bf.clear43 = and i32 %bf.load42, 7
+ %bf.set44 = or i32 %bf.clear43, 24
+ store i32 %bf.set44, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 1), align 8
+ %27 = load i32* %j, align 4
+ %inc45 = add nsw i32 %27, 1
+ store i32 %inc45, i32* %j, align 4
+ %bf.load46 = load i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 2), align 4
+ %bf.clear47 = and i16 %bf.load46, 127
+ store i16 %bf.clear47, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 2), align 4
+ %bf.load48 = load i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 2), align 4
+ %bf.clear49 = and i16 %bf.load48, 127
+ store i16 %bf.clear49, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 2), align 4
+ %28 = load i32* %j, align 4
+ %inc50 = add nsw i32 %28, 1
+ store i32 %inc50, i32* %j, align 4
+ %bf.load51 = load i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 3), align 8
+ %bf.clear52 = and i32 %bf.load51, 63
+ store i32 %bf.clear52, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 3), align 8
+ %bf.load53 = load i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 3), align 8
+ %bf.clear54 = and i32 %bf.load53, 63
+ %bf.set55 = or i32 %bf.clear54, 64
+ store i32 %bf.set55, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 3), align 8
+ %29 = load i32* %j, align 4
+ %inc56 = add nsw i32 %29, 1
+ store i32 %inc56, i32* %j, align 4
+ %bf.load57 = load i24* bitcast ([3 x i8]* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 4) to i24*), align 4
+ %bf.clear58 = and i24 %bf.load57, 63
+ store i24 %bf.clear58, i24* bitcast ([3 x i8]* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 4) to i24*), align 4
+ %bf.load59 = load i24* bitcast ([3 x i8]* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 4) to i24*), align 4
+ %bf.clear60 = and i24 %bf.load59, 63
+ store i24 %bf.clear60, i24* bitcast ([3 x i8]* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 4) to i24*), align 4
+ %30 = load i32* %j, align 4
+ %inc61 = add nsw i32 %30, 1
+ store i32 %inc61, i32* %j, align 4
+ %31 = load i32* %i, align 4
+ %idxprom62 = sext i32 %31 to i64
+ %arrayidx63 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom62
+ store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 5), i8** %arrayidx63, align 8
+ %32 = load i32* %i, align 4
+ %idxprom64 = sext i32 %32 to i64
+ %arrayidx65 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom64
+ store i64 1, i64* %arrayidx65, align 8
+ %33 = load i32* %i, align 4
+ %idxprom66 = sext i32 %33 to i64
+ %arrayidx67 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom66
+ store i64 1, i64* %arrayidx67, align 8
+ store i8 -83, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 5), align 1
+ store i8 -67, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 5, i64 5), align 1
+ %34 = load i32* %i, align 4
+ %inc68 = add nsw i32 %34, 1
+ store i32 %inc68, i32* %i, align 4
+ %35 = load i32* %i, align 4
+ %idxprom69 = sext i32 %35 to i64
+ %arrayidx70 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom69
+ store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 1), i8** %arrayidx70, align 8
+ %36 = load i32* %i, align 4
+ %idxprom71 = sext i32 %36 to i64
+ %arrayidx72 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom71
+ store i64 1, i64* %arrayidx72, align 8
+ %37 = load i32* %i, align 4
+ %idxprom73 = sext i32 %37 to i64
+ %arrayidx74 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom73
+ store i64 1, i64* %arrayidx74, align 8
+ store i8 34, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 5, i64 1), align 1
+ store i8 64, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 5, i64 1), align 1
+ %38 = load i32* %i, align 4
+ %inc75 = add nsw i32 %38, 1
+ store i32 %inc75, i32* %i, align 4
+ %39 = load i32* %i, align 4
+ %idxprom76 = sext i32 %39 to i64
+ %arrayidx77 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom76
+ store i8* bitcast (i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 6, i64 3) to i8*), i8** %arrayidx77, align 8
+ %40 = load i32* %i, align 4
+ %idxprom78 = sext i32 %40 to i64
+ %arrayidx79 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom78
+ store i64 4, i64* %arrayidx79, align 8
+ %41 = load i32* %i, align 4
+ %idxprom80 = sext i32 %41 to i64
+ %arrayidx81 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom80
+ store i64 4, i64* %arrayidx81, align 8
+ store i32 -3, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 6, i64 3), align 4
+ store i32 -3, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 6, i64 3), align 4
+ %42 = load i32* %i, align 4
+ %inc82 = add nsw i32 %42, 1
+ store i32 %inc82, i32* %i, align 4
+ %43 = load i32* %i, align 4
+ %idxprom83 = sext i32 %43 to i64
+ %arrayidx84 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom83
+ store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 7), i8** %arrayidx84, align 8
+ %44 = load i32* %i, align 4
+ %idxprom85 = sext i32 %44 to i64
+ %arrayidx86 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom85
+ store i64 1, i64* %arrayidx86, align 8
+ %45 = load i32* %i, align 4
+ %idxprom87 = sext i32 %45 to i64
+ %arrayidx88 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom87
+ store i64 1, i64* %arrayidx88, align 8
+ store i8 106, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 6, i64 4, i32 7), align 1
+ store i8 -102, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 6, i64 4, i32 7), align 1
+ %46 = load i32* %i, align 4
+ %inc89 = add nsw i32 %46, 1
+ store i32 %inc89, i32* %i, align 4
+ %47 = load i32* %i, align 4
+ %idxprom90 = sext i32 %47 to i64
+ %arrayidx91 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom90
+ store i8* bitcast (i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 7) to i8*), i8** %arrayidx91, align 8
+ %48 = load i32* %i, align 4
+ %idxprom92 = sext i32 %48 to i64
+ %arrayidx93 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom92
+ store i64 2, i64* %arrayidx93, align 8
+ %49 = load i32* %i, align 4
+ %idxprom94 = sext i32 %49 to i64
+ %arrayidx95 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom94
+ store i64 2, i64* %arrayidx95, align 8
+ store i16 29665, i16* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 7), align 2
+ store i16 7107, i16* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 7), align 2
+ %50 = load i32* %i, align 4
+ %inc96 = add nsw i32 %50, 1
+ store i32 %inc96, i32* %i, align 4
+ %51 = load i32* %i, align 4
+ %idxprom97 = sext i32 %51 to i64
+ %arrayidx98 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom97
+ store i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 8), i8** %arrayidx98, align 8
+ %52 = load i32* %i, align 4
+ %idxprom99 = sext i32 %52 to i64
+ %arrayidx100 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom99
+ store i64 1, i64* %arrayidx100, align 8
+ %53 = load i32* %i, align 4
+ %idxprom101 = sext i32 %53 to i64
+ %arrayidx102 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom101
+ store i64 1, i64* %arrayidx102, align 8
+ store i8 52, i8* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 8), align 1
+ store i8 -86, i8* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 8), align 1
+ %54 = load i32* %i, align 4
+ %inc103 = add nsw i32 %54, 1
+ store i32 %inc103, i32* %i, align 4
+ %55 = load i32* %i, align 4
+ %idxprom104 = sext i32 %55 to i64
+ %arrayidx105 = getelementptr inbounds [32 x i8*]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 5), i32 0, i64 %idxprom104
+ store i8* bitcast (i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 9) to i8*), i8** %arrayidx105, align 8
+ %56 = load i32* %i, align 4
+ %idxprom106 = sext i32 %56 to i64
+ %arrayidx107 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 7), i32 0, i64 %idxprom106
+ store i64 4, i64* %arrayidx107, align 8
+ %57 = load i32* %i, align 4
+ %idxprom108 = sext i32 %57 to i64
+ %arrayidx109 = getelementptr inbounds [32 x i64]* getelementptr inbounds (%struct.Info* @info, i32 0, i32 11), i32 0, i64 %idxprom108
+ store i64 4, i64* %arrayidx109, align 8
+ store i32 -54118453, i32* getelementptr inbounds (%struct.S1998* @s1998, i32 0, i32 9), align 4
+ store i32 1668755823, i32* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2, i32 9), align 4
+ %58 = load i32* %i, align 4
+ %inc110 = add nsw i32 %58, 1
+ store i32 %inc110, i32* %i, align 4
+ store i32 %inc110, i32* %tmp
+ %59 = load i32* %tmp
+ %60 = load i32* %i, align 4
+ store i32 %60, i32* getelementptr inbounds (%struct.Info* @info, i32 0, i32 0), align 4
+ %61 = load i32* %j, align 4
+ store i32 %61, i32* getelementptr inbounds (%struct.Info* @info, i32 0, i32 1), align 4
+ %62 = bitcast %struct.S1998* %agg.tmp111 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %62, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
+ %63 = bitcast %struct.S1998* %agg.tmp112 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %63, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
+ call void @check1998(%struct.S1998* sret %agg.tmp, %struct.S1998* byval align 16 %agg.tmp111, %struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 1), %struct.S1998* byval align 16 %agg.tmp112)
+ call void @checkx1998(%struct.S1998* byval align 16 %agg.tmp)
+ %64 = bitcast %struct.S1998* %agg.tmp113 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %64, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
+ %65 = bitcast %struct.S1998* %agg.tmp114 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %65, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
+ %66 = bitcast %struct.S1998* %agg.tmp115 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %66, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
+ call void (i32, ...)* @check1998va(i32 signext 1, double 1.000000e+00, %struct.S1998* byval align 16 %agg.tmp113, i64 2, %struct.S1998* byval align 16 %agg.tmp114, %struct.S1998* byval align 16 %agg.tmp115)
+ %67 = bitcast %struct.S1998* %agg.tmp116 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %67, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
+ %68 = bitcast %struct.S1998* %agg.tmp117 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %68, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
+ %69 = bitcast %struct.S1998* %agg.tmp118 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %69, i8* bitcast (%struct.S1998* getelementptr inbounds ([5 x %struct.S1998]* @a1998, i32 0, i64 2) to i8*), i64 5168, i32 16, i1 false)
+ %70 = bitcast %struct.S1998* %agg.tmp119 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %70, i8* bitcast (%struct.S1998* @s1998 to i8*), i64 5168, i32 16, i1 false)
+ call void (i32, ...)* @check1998va(i32 signext 2, %struct.S1998* byval align 16 %agg.tmp116, %struct.S1998* byval align 16 %agg.tmp117, ppc_fp128 0xM40000000000000000000000000000000, %struct.S1998* byval align 16 %agg.tmp118, %struct.S1998* byval align 16 %agg.tmp119)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+declare void @check1998(%struct.S1998* sret, %struct.S1998* byval align 16, %struct.S1998*, %struct.S1998* byval align 16)
+declare void @check1998va(i32 signext, ...)
+declare void @checkx1998(%struct.S1998* byval align 16 %arg)
+
diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll
index 7c51b67..bee3ac3 100644
--- a/test/CodeGen/PowerPC/svr4-redzone.ll
+++ b/test/CodeGen/PowerPC/svr4-redzone.ll
@@ -36,4 +36,4 @@ entry:
; PPC32: stwu 1, -240(1)
; PPC64-LABEL: bigstack:
-; PPC64: stdu 1, -352(1)
+; PPC64: stdu 1, -288(1)
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 4bce8c8..2733089 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -36,7 +36,7 @@ define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
-; Adicional tests for v16i8 since it is a altivec native type
+; Additional tests for v16i8 since it is an altivec native type
define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
%cmp = icmp eq <16 x i8> %x, %y
@@ -165,7 +165,7 @@ define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone {
; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
-; Adicional tests for v8i16 since it is an altivec native type
+; Additional tests for v8i16 since it is an altivec native type
define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
entry:
@@ -298,7 +298,7 @@ define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone {
; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
-; Adicional tests for v4si32 since it is an altivec native type
+; Additional tests for v4si32 since it is an altivec native type
define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
entry:
@@ -449,7 +449,7 @@ entry:
; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
-; Adicional tests for v4f32 since it is a altivec native type
+; Additional tests for v4f32 since it is an altivec native type
define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone {
entry:
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index d7ed64a..304a84d 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5
+; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
target triple = "powerpc-apple-darwin8"
@@ -8,6 +10,8 @@ target triple = "powerpc-apple-darwin8"
define void @foo(i32 %x, ...) {
entry:
+; CHECK: foo:
+; CHECK-LE: foo:
%x_addr = alloca i32 ; <i32*> [#uses=1]
%ap = alloca i8* ; <i8**> [#uses=3]
%ap.0 = alloca i8* ; <i8**> [#uses=3]
@@ -27,6 +31,10 @@ entry:
%tmp8 = getelementptr %struct.u16qi* %tmp6, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp9 = getelementptr %struct.u16qi* %tmp7, i32 0, i32 0 ; <<16 x i8>*> [#uses=1]
%tmp10 = load <16 x i8>* %tmp9, align 4 ; <<16 x i8>> [#uses=1]
+; CHECK: lvsl
+; CHECK: vperm
+; CHECK-LE: lvsr
+; CHECK-LE: vperm
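+; The misaligned access is realigned using the usual Altivec idiom: a
+; permute control vector from lvsl (big-endian) or lvsr (little-endian)
+; feeding a vperm of two aligned loads.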
store <16 x i8> %tmp10, <16 x i8>* %tmp8, align 4
br label %return
diff --git a/test/CodeGen/PowerPC/vec_mul.ll b/test/CodeGen/PowerPC/vec_mul.ll
index c376751..8a44815 100644
--- a/test/CodeGen/PowerPC/vec_mul.ll
+++ b/test/CodeGen/PowerPC/vec_mul.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
%tmp = load <4 x i32>* %X ; <<4 x i32>> [#uses=1]
@@ -9,6 +11,9 @@ define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
; CHECK-LABEL: test_v4i32:
; CHECK: vmsumuhm
; CHECK-NOT: mullw
+; CHECK-LE-LABEL: test_v4i32:
+; CHECK-LE: vmsumuhm
+; CHECK-LE-NOT: mullw
define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
%tmp = load <8 x i16>* %X ; <<8 x i16>> [#uses=1]
@@ -19,6 +24,9 @@ define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
; CHECK-LABEL: test_v8i16:
; CHECK: vmladduhm
; CHECK-NOT: mullw
+; CHECK-LE-LABEL: test_v8i16:
+; CHECK-LE: vmladduhm
+; CHECK-LE-NOT: mullw
define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
%tmp = load <16 x i8>* %X ; <<16 x i8>> [#uses=1]
@@ -30,6 +38,11 @@ define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
; CHECK: vmuloub
; CHECK: vmuleub
; CHECK-NOT: mullw
+; CHECK-LE-LABEL: test_v16i8:
+; CHECK-LE: vmuloub [[REG1:[0-9]+]]
+; CHECK-LE: vmuleub [[REG2:[0-9]+]]
+; CHECK-LE: vperm {{[0-9]+}}, [[REG2]], [[REG1]]
+; CHECK-LE-NOT: mullw
define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
%tmp = load <4 x float>* %X
@@ -44,3 +57,7 @@ define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
; CHECK: vspltisw [[ZNEG:[0-9]+]], -1
; CHECK: vslw {{[0-9]+}}, [[ZNEG]], [[ZNEG]]
; CHECK: vmaddfp
+; CHECK-LE-LABEL: test_float:
+; CHECK-LE: vspltisw [[ZNEG:[0-9]+]], -1
+; CHECK-LE: vslw {{[0-9]+}}, [[ZNEG]], [[ZNEG]]
+; CHECK-LE: vmaddfp
diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll
new file mode 100644
index 0000000..635721c
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s
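+
+; Check that these shufflevector patterns are still matched to the Altivec
+; pack (vpkuhum/vpkuwum), merge (vmrg[lh][bhw]), and shift-double (vsldoi)
+; instructions under little-endian element numbering.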
+
+define void @VPKUHUM_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VPKUHUM_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK: vpkuhum
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VPKUHUM_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VPKUHUM_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK: vpkuhum
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VPKUWUM_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VPKUWUM_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29>
+; CHECK: vpkuwum
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VPKUWUM_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VPKUWUM_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
+; CHECK: vpkuwum
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLB_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGLB_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+; CHECK: vmrglb
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLB_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGLB_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; CHECK: vmrglb
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHB_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGHB_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+; CHECK: vmrghb
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHB_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGHB_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+; CHECK: vmrghb
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLH_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGLH_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
+; CHECK: vmrglh
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLH_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGLH_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 4, i32 5, i32 4, i32 5, i32 6, i32 7, i32 6, i32 7>
+; CHECK: vmrglh
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHH_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGHH_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+; CHECK: vmrghh
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHH_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGHH_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13, i32 14, i32 15, i32 14, i32 15>
+; CHECK: vmrghh
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLW_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGLW_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23>
+; CHECK: vmrglw
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGLW_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGLW_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; CHECK: vmrglw
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHW_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VMRGHW_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+; CHECK: vmrghw
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VMRGHW_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VMRGHW_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+; CHECK: vmrghw
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
+define void @VSLDOI_xy(<16 x i8>* %A, <16 x i8>* %B) {
+entry:
+; CHECK: VSLDOI_xy:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+; CHECK: vsldoi
+ store <16 x i8> %tmp3, <16 x i8>* %A
+ ret void
+}
+
+define void @VSLDOI_xx(<16 x i8>* %A) {
+entry:
+; CHECK: VSLDOI_xx:
+ %tmp = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+; CHECK: vsldoi
+ store <16 x i8> %tmp2, <16 x i8>* %A
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/vperm-instcombine.ll b/test/CodeGen/PowerPC/vperm-instcombine.ll
new file mode 100644
index 0000000..d9084c8
--- /dev/null
+++ b/test/CodeGen/PowerPC/vperm-instcombine.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define <16 x i8> @foo() nounwind ssp {
+; CHECK: @foo
+;; Arguments are {0,1,...,15},{16,17,...,31},{30,28,26,...,0}
+ %1 = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> <i32 50462976, i32 117835012, i32 185207048, i32 252579084>, <4 x i32> <i32 319951120, i32 387323156, i32 454695192, i32 522067228>, <16 x i8> <i8 30, i8 28, i8 26, i8 24, i8 22, i8 20, i8 18, i8 16, i8 14, i8 12, i8 10, i8 8, i8 6, i8 4, i8 2, i8 0>)
+ %2 = bitcast <4 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+;; Revised arguments are {16,17,...,31},{0,1,...,15},{1,3,5,...,31}
+;; optimized into the following:
+; CHECK: ret <16 x i8> <i8 17, i8 19, i8 21, i8 23, i8 25, i8 27, i8 29, i8 31, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>
+}
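+
+;; In other words, for little-endian the two source vectors are swapped
+;; and each mask byte m is rewritten as 31 - m: the original leading index
+;; 30 becomes 1, which selects byte 17 of the swapped concatenation
+;; {16..31, 0..15} -- the first byte of the folded result above.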
+
+declare <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32>, <4 x i32>, <16 x i8>)
diff --git a/test/CodeGen/PowerPC/vperm-lowering.ll b/test/CodeGen/PowerPC/vperm-lowering.ll
new file mode 100644
index 0000000..d55d26c
--- /dev/null
+++ b/test/CodeGen/PowerPC/vperm-lowering.ll
@@ -0,0 +1,66 @@
+; RUN: llc -O0 -fast-isel=false -mcpu=ppc64 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define <16 x i8> @foo() nounwind ssp {
+ %1 = shufflevector <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 1, i32 6, i32 11>
+ ret <16 x i8> %1
+}
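+
+; On little-endian, shuffle lowering builds the vperm control vector by
+; rewriting each shuffle index i as 31 - i (vperm numbers bytes in
+; big-endian order) and swapping the two source operands: hence .LCPI0_0
+; below holds 31, 26, 21, ... and the vperm takes the {16..31} constant
+; (.LCPI0_2) as its first input.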
+
+; CHECK: .LCPI0_0:
+; CHECK: .byte 31
+; CHECK: .byte 26
+; CHECK: .byte 21
+; CHECK: .byte 16
+; CHECK: .byte 11
+; CHECK: .byte 6
+; CHECK: .byte 1
+; CHECK: .byte 28
+; CHECK: .byte 23
+; CHECK: .byte 18
+; CHECK: .byte 13
+; CHECK: .byte 8
+; CHECK: .byte 3
+; CHECK: .byte 30
+; CHECK: .byte 25
+; CHECK: .byte 20
+; CHECK: .LCPI0_1:
+; CHECK: .byte 0
+; CHECK: .byte 1
+; CHECK: .byte 2
+; CHECK: .byte 3
+; CHECK: .byte 4
+; CHECK: .byte 5
+; CHECK: .byte 6
+; CHECK: .byte 7
+; CHECK: .byte 8
+; CHECK: .byte 9
+; CHECK: .byte 10
+; CHECK: .byte 11
+; CHECK: .byte 12
+; CHECK: .byte 13
+; CHECK: .byte 14
+; CHECK: .byte 15
+; CHECK: .LCPI0_2:
+; CHECK: .byte 16
+; CHECK: .byte 17
+; CHECK: .byte 18
+; CHECK: .byte 19
+; CHECK: .byte 20
+; CHECK: .byte 21
+; CHECK: .byte 22
+; CHECK: .byte 23
+; CHECK: .byte 24
+; CHECK: .byte 25
+; CHECK: .byte 26
+; CHECK: .byte 27
+; CHECK: .byte 28
+; CHECK: .byte 29
+; CHECK: .byte 30
+; CHECK: .byte 31
+; CHECK: foo:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l
+; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
+; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index c9eaeda..f733d90 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -70,9 +70,9 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
}
; SI-LABEL: @trunc_i64_add_to_i32
-; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]],
-; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]],
-; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
+; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI-NOT: ADDC
; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: BUFFER_STORE_DWORD [[VRESULT]],
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index ee9bc83..cf11481 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,13 +1,12 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-;EG-CHECK: @test2
-;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @test2
+; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test2
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +17,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-CHECK: @test4
-;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @test4
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test4
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -38,3 +36,75 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @s_and_i32
+; SI: S_AND_B32
+define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %and = and i32 %a, %b
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_and_constant_i32
+; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
+define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_and_i32
+; SI: V_AND_B32
+define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %and = and i32 %a, %b
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_and_constant_i32
+; SI: V_AND_B32
+define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_and_i64
+; SI: S_AND_B64
+define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %and = and i64 %a, %b
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @s_and_constant_i64
+; SI: S_AND_B64
+define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %and = and i64 %a, 281474976710655
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_and_i64
+; SI: V_AND_B32
+; SI: V_AND_B32
+define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+ %a = load i64 addrspace(1)* %aptr, align 8
+ %b = load i64 addrspace(1)* %bptr, align 8
+ %and = and i64 %a, %b
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_and_constant_i64
+; SI: V_AND_B32
+; SI: V_AND_B32
+define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64 addrspace(1)* %aptr, align 8
+ %and = and i64 %a, 1234567
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index c2362da..3230353 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -10,7 +10,12 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
; SI-LABEL: @test_private_array_ptr_calc:
; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
-; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
+;
+; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
+; alloca to a vector. It currently fails because it does not know how
+; to interpret:
+; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
+; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%alloca = alloca [4 x i32], i32 4, align 16
%tid = call i32 @llvm.SI.tid() readnone
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
new file mode 100644
index 0000000..eb9539e
--- /dev/null
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset:
+; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7
+; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; SI: DS_CMPST_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]], 0x10, [M0]
+; SI: S_ENDPGM
+define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset:
+; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: S_MOV_B64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
+; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
+; SI-DAG: V_MOV_B32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: V_MOV_B32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; SI-DAG: V_MOV_B32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; SI: DS_CMPST_RTN_B64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}}, 0x20, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index cb0242c..c26f9cd 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -1,23 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; R600-CHECK-LABEL: @atomic_add_local
-; R600-CHECK: LDS_ADD *
-; SI-CHECK-LABEL: @atomic_add_local
-; SI-CHECK: DS_ADD_U32_RTN
+; FUNC-LABEL: @atomic_add_local
+; R600: LDS_ADD *
+; SI: DS_ADD_RTN_U32
define void @atomic_add_local(i32 addrspace(3)* %local) {
-entry:
- %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
-; R600-CHECK-LABEL: @atomic_add_ret_local
-; R600-CHECK: LDS_ADD_RET *
-; SI-CHECK-LABEL: @atomic_add_ret_local
-; SI-CHECK: DS_ADD_U32_RTN
+; FUNC-LABEL: @atomic_add_local_const_offset
+; R600: LDS_ADD *
+; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+ %gep = getelementptr i32 addrspace(3)* %local, i32 4
+ %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: @atomic_add_ret_local
+; R600: LDS_ADD_RET *
+; SI: DS_ADD_RTN_U32
define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-entry:
- %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
- store i32 %0, i32 addrspace(1)* %out
+ %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @atomic_add_ret_local_const_offset
+; R600: LDS_ADD_RET *
+; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %gep = getelementptr i32 addrspace(3)* %local, i32 5
+ %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index 7c26e52..3569d91 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -1,23 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; R600-CHECK-LABEL: @atomic_sub_local
-; R600-CHECK: LDS_SUB *
-; SI-CHECK-LABEL: @atomic_sub_local
-; SI-CHECK: DS_SUB_U32_RTN
+; FUNC-LABEL: @atomic_sub_local
+; R600: LDS_SUB *
+; SI: DS_SUB_RTN_U32
define void @atomic_sub_local(i32 addrspace(3)* %local) {
-entry:
- %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
-; R600-CHECK-LABEL: @atomic_sub_ret_local
-; R600-CHECK: LDS_SUB_RET *
-; SI-CHECK-LABEL: @atomic_sub_ret_local
-; SI-CHECK: DS_SUB_U32_RTN
+; FUNC-LABEL: @atomic_sub_local_const_offset
+; R600: LDS_SUB *
+; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+ %gep = getelementptr i32 addrspace(3)* %local, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: @atomic_sub_ret_local
+; R600: LDS_SUB_RET *
+; SI: DS_SUB_RTN_U32
define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
-entry:
- %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
- store i32 %0, i32 addrspace(1)* %out
+ %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @atomic_sub_ret_local_const_offset
+; R600: LDS_SUB_RET *
+; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %gep = getelementptr i32 addrspace(3)* %local, i32 5
+ %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
index 6b68376..511e8ef 100644
--- a/test/CodeGen/R600/big_alu.ll
+++ b/test/CodeGen/R600/big_alu.ll
@@ -101,7 +101,7 @@ IF137: ; preds = %main_body
%88 = insertelement <4 x float> %87, float %32, i32 2
%89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
%90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
- %91 = call float @llvm.AMDGPU.rsq(float %90)
+ %91 = call float @llvm.AMDGPU.rsq.f32(float %90)
%92 = fmul float %30, %91
%93 = fmul float %31, %91
%94 = fmul float %32, %91
@@ -344,7 +344,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
%325 = insertelement <4 x float> %324, float %318, i32 2
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
%327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
- %328 = call float @llvm.AMDGPU.rsq(float %327)
+ %328 = call float @llvm.AMDGPU.rsq.f32(float %327)
%329 = fmul float %314, %328
%330 = fmul float %316, %328
%331 = fmul float %318, %328
@@ -377,7 +377,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
%358 = insertelement <4 x float> %357, float %45, i32 2
%359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
%360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
- %361 = call float @llvm.AMDGPU.rsq(float %360)
+ %361 = call float @llvm.AMDGPU.rsq.f32(float %360)
%362 = fmul float %45, %361
%363 = call float @fabs(float %362)
%364 = fmul float %176, 0x3FECCCCCC0000000
@@ -403,7 +403,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
%384 = insertelement <4 x float> %383, float %45, i32 2
%385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
%386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
- %387 = call float @llvm.AMDGPU.rsq(float %386)
+ %387 = call float @llvm.AMDGPU.rsq.f32(float %386)
%388 = fmul float %45, %387
%389 = call float @fabs(float %388)
%390 = fmul float %176, 0x3FF51EB860000000
@@ -1041,7 +1041,7 @@ IF179: ; preds = %ENDIF175
%896 = insertelement <4 x float> %895, float %45, i32 2
%897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
%898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
- %899 = call float @llvm.AMDGPU.rsq(float %898)
+ %899 = call float @llvm.AMDGPU.rsq.f32(float %898)
%900 = fmul float %45, %899
%901 = call float @fabs(float %900)
%902 = fmul float %176, 0x3FECCCCCC0000000
@@ -1150,7 +1150,7 @@ ENDIF178: ; preds = %ENDIF175, %IF179
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #1
+declare float @llvm.AMDGPU.rsq.f32(float) #1
; Function Attrs: readnone
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index 5bfc008..0be79e6 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; This test just checks that the compiler doesn't crash.
-; CHECK-LABEL: @v32i8_to_v8i32
-; CHECK: S_ENDPGM
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+; FUNC-LABEL: @v32i8_to_v8i32
+; SI: S_ENDPGM
define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
entry:
%1 = load <32 x i8> addrspace(2)* %0
@@ -15,12 +17,8 @@ entry:
ret void
}
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-
-; CHECK-LABEL: @i8ptr_v16i8ptr
-; CHECK: S_ENDPGM
+; FUNC-LABEL: @i8ptr_v16i8ptr
+; SI: S_ENDPGM
define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
@@ -28,3 +26,53 @@ entry:
store <16 x i8> %1, <16 x i8> addrspace(1)* %out
ret void
}
+
+define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %load = load float addrspace(1)* %in, align 4
+ %bc = bitcast float %load to <2 x i16>
+ store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+ %load = load <2 x i16> addrspace(1)* %in, align 4
+ %bc = bitcast <2 x i16> %load to float
+ store float %bc, float addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+ %load = load <4 x i8> addrspace(1)* %in, align 4
+ %bc = bitcast <4 x i8> %load to i32
+ store i32 %bc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %bc = bitcast i32 %load to <4 x i8>
+ store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @bitcast_v2i32_to_f64
+; SI: S_ENDPGM
+define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %val = load <2 x i32> addrspace(1)* %in, align 8
+ %add = add <2 x i32> %val, <i32 4, i32 9>
+ %bc = bitcast <2 x i32> %add to double
+ store double %bc, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @bitcast_f64_to_v2i32
+; SI: S_ENDPGM
+define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+ %val = load double addrspace(1)* %in, align 8
+ %add = fadd double %val, 4.0
+ %bc = bitcast double %add to <2 x i32>
+ store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll
new file mode 100644
index 0000000..6aebe85
--- /dev/null
+++ b/test/CodeGen/R600/bswap.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=r600 -mcpu=SI < %s
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
+
+define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
+ store i32 %bswap, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+ %val = load <2 x i32> addrspace(1)* %in, align 8
+ %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
+ store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+ %val = load <4 x i32> addrspace(1)* %in, align 16
+ %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
+ store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %val = load i64 addrspace(1)* %in, align 8
+ %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
+ store i64 %bswap, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+ %val = load <2 x i64> addrspace(1)* %in, align 16
+ %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
+ store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
+ ret void
+}
+
+define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+ %val = load <4 x i64> addrspace(1)* %in, align 32
+ %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
+ store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
new file mode 100644
index 0000000..15b5188
--- /dev/null
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+; FUNC-LABEL: @s_ctlz_zero_undef_i32:
+; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
+; SI: S_FLBIT_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI: S_ENDPGM
+define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctlz_zero_undef_i32:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_FFBH_U32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32 addrspace(1)* %valptr, align 4
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctlz_zero_undef_v2i32:
+; SI: BUFFER_LOAD_DWORDX2
+; SI: V_FFBH_U32_e32
+; SI: V_FFBH_U32_e32
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <2 x i32> addrspace(1)* %valptr, align 8
+ %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
+ store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_ctlz_zero_undef_v4i32:
+; SI: BUFFER_LOAD_DWORDX4
+; SI: V_FFBH_U32_e32
+; SI: V_FFBH_U32_e32
+; SI: V_FFBH_U32_e32
+; SI: V_FFBH_U32_e32
+; SI: BUFFER_STORE_DWORDX4
+; SI: S_ENDPGM
+define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <4 x i32> addrspace(1)* %valptr, align 16
+ %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
+ store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
new file mode 100644
index 0000000..15be8e1
--- /dev/null
+++ b/test/CodeGen/R600/ctpop.ll
@@ -0,0 +1,284 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
+declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
+
+; FUNC-LABEL: @s_ctpop_i32:
+; SI: S_LOAD_DWORD [[SVAL:s[0-9]+]],
+; SI: S_BCNT1_I32_B32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ store i32 %ctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XXX - Why 0 in register?
+; FUNC-LABEL: @v_ctpop_i32:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
+; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ store i32 %ctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_add_chain_i32
+; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
+; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
+; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
+; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
+; SI-NOT: ADD
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+ %val0 = load i32 addrspace(1)* %in0, align 4
+ %val1 = load i32 addrspace(1)* %in1, align 4
+ %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
+ %add = add i32 %ctpop0, %ctpop1
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v2i32:
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i32> addrspace(1)* %in, align 8
+ %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
+ store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v4i32:
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i32> addrspace(1)* %in, align 16
+ %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
+ store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v8i32:
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <8 x i32> addrspace(1)* %in, align 32
+ %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
+ store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v16i32:
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: V_BCNT_U32_B32_e32
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <16 x i32> addrspace(1)* %in, align 32
+ %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
+ store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_inline_constant:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, 4
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_inline_constant_inv:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 4, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_literal:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_MOV_B32_e32 [[LIT:v[0-9]+]], 0x1869f
+; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, 99999
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_var:
+; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, %const
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_var_inv:
+; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
+; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], {{.*}} + 0x0
+; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} + 0x10
+; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %gep = getelementptr i32 addrspace(1)* %constptr, i32 4
+ %const = load i32 addrspace(1)* %gep, align 4
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: @ctpop_i32_in_br
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+; EG: BCNT_INT
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32 addrspace(1)* %in
+ %2 = call i32 @llvm.ctpop.i32(i32 %1)
+ br label %endif
+
+else:
+ %3 = getelementptr i32 addrspace(1)* %in, i32 1
+ %4 = load i32 addrspace(1)* %3
+ br label %endif
+
+endif:
+ %5 = phi i32 [%2, %if], [%4, %else]
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
new file mode 100644
index 0000000..b36ecc6
--- /dev/null
+++ b/test/CodeGen/R600/ctpop64.ll
@@ -0,0 +1,122 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
+declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+
+; FUNC-LABEL: @s_ctpop_i64:
+; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
+; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI: S_ENDPGM
+define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_i64:
+; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
+; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
+; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %val = load i64 addrspace(1)* %in, align 8
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_ctpop_v2i64:
+; SI: S_BCNT1_I32_B64
+; SI: S_BCNT1_I32_B64
+; SI: S_ENDPGM
+define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @s_ctpop_v4i64:
+; SI: S_BCNT1_I32_B64
+; SI: S_BCNT1_I32_B64
+; SI: S_BCNT1_I32_B64
+; SI: S_BCNT1_I32_B64
+; SI: S_ENDPGM
+define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v2i64:
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: S_ENDPGM
+define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i64> addrspace(1)* %in, align 16
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_ctpop_v4i64:
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: V_BCNT_U32_B32
+; SI: S_ENDPGM
+define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i64> addrspace(1)* %in, align 32
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: @ctpop_i64_in_br
+; SI: V_BCNT_U32_B32_e64 [[BCNT_LO:v[0-9]+]], v{{[0-9]+}}, 0
+; SI: V_BCNT_U32_B32_e32 v[[BCNT:[0-9]+]], v{{[0-9]+}}, [[BCNT_LO]]
+; SI: V_MOV_B32_e32 v[[ZERO:[0-9]+]], 0
+; SI: BUFFER_STORE_DWORDX2 v[
+; SI: [[BCNT]]:[[ZERO]]]
+; SI: S_ENDPGM
+define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64 addrspace(1)* %in
+ %2 = call i64 @llvm.ctpop.i64(i64 %1)
+ br label %endif
+
+else:
+ %3 = getelementptr i64 addrspace(1)* %in, i32 1
+ %4 = load i64 addrspace(1)* %3
+ br label %endif
+
+endif:
+ %5 = phi i64 [%2, %if], [%4, %else]
+ store i64 %5, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
new file mode 100644
index 0000000..cf44f8e
--- /dev/null
+++ b/test/CodeGen/R600/cttz_zero_undef.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+
+; FUNC-LABEL: @s_cttz_zero_undef_i32:
+; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
+; SI: S_FF1_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI: S_ENDPGM
+define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %cttz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_cttz_zero_undef_i32:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_FFBL_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32 addrspace(1)* %valptr, align 4
+ %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %cttz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_cttz_zero_undef_v2i32:
+; SI: BUFFER_LOAD_DWORDX2
+; SI: V_FFBL_B32_e32
+; SI: V_FFBL_B32_e32
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <2 x i32> addrspace(1)* %valptr, align 8
+ %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
+ store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @v_cttz_zero_undef_v4i32:
+; SI: BUFFER_LOAD_DWORDX4
+; SI: V_FFBL_B32_e32
+; SI: V_FFBL_B32_e32
+; SI: V_FFBL_B32_e32
+; SI: V_FFBL_B32_e32
+; SI: BUFFER_STORE_DWORDX4
+; SI: S_ENDPGM
+define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <4 x i32> addrspace(1)* %valptr, align 16
+ %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
+ store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll
new file mode 100644
index 0000000..fe97a44
--- /dev/null
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -0,0 +1,171 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @load_i8_to_f32:
+; SI: BUFFER_LOAD_UBYTE [[LOADREG:v[0-9]+]],
+; SI-NOT: BFE
+; SI-NOT: LSHR
+; SI: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
+; SI: BUFFER_STORE_DWORD [[CONV]],
+define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+ %load = load i8 addrspace(1)* %in, align 1
+ %cvt = uitofp i8 %load to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @load_v2i8_to_v2f32:
+; SI: BUFFER_LOAD_USHORT [[LOADREG:v[0-9]+]],
+; SI-NOT: BFE
+; SI-NOT: LSHR
+; SI-NOT: AND
+; SI-DAG: V_CVT_F32_UBYTE1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <2 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <2 x i8> %load to <2 x float>
+ store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @load_v3i8_to_v3f32:
+; SI-NOT: BFE
+; SI-NOT: V_CVT_F32_UBYTE3_e32
+; SI-DAG: V_CVT_F32_UBYTE2_e32
+; SI-DAG: V_CVT_F32_UBYTE1_e32
+; SI-DAG: V_CVT_F32_UBYTE0_e32
+; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <3 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @load_v4i8_to_v4f32:
+; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
+; SI-NOT: BFE
+; SI-NOT: LSHR
+; SI-DAG: V_CVT_F32_UBYTE3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: BUFFER_STORE_DWORDX4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; XXX - This should really still be able to use the V_CVT_F32_UBYTE0
+; for each component, but computeKnownBits doesn't handle vectors very
+; well.
+
+; SI-LABEL: @load_v4i8_to_v4f32_2_uses:
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_CVT_F32_UBYTE0_e32
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_CVT_F32_UBYTE0_e32
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_CVT_F32_UBYTE0_e32
+; SI: BUFFER_LOAD_UBYTE
+; SI: V_CVT_F32_UBYTE0_e32
+
+; XXX - replace with this when v4i8 loads aren't scalarized anymore.
+; XSI: BUFFER_LOAD_DWORD
+; XSI: V_CVT_F32_U32_e32
+; XSI: V_CVT_F32_U32_e32
+; XSI: V_CVT_F32_U32_e32
+; XSI: V_CVT_F32_U32_e32
+; SI: S_ENDPGM
+define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8> addrspace(1)* %in, align 4
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
+ store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
+ ret void
+}
+
+; Make sure this doesn't crash.
+; SI-LABEL: @load_v7i8_to_v7f32:
+; SI: S_ENDPGM
+define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <7 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <7 x i8> %load to <7 x float>
+ store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @load_v8i8_to_v8f32:
+; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; SI-NOT: BFE
+; SI-NOT: LSHR
+; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-NOT: BFE
+; SI-NOT: LSHR
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <8 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <8 x i8> %load to <8 x float>
+ store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @i8_zext_inreg_i32_to_f32:
+; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
+; SI: V_ADD_I32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
+; SI-NEXT: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[ADD]]
+; SI: BUFFER_STORE_DWORD [[CONV]],
+define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 2
+ %inreg = and i32 %add, 255
+ %cvt = uitofp i32 %inreg to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @i8_zext_inreg_hi1_to_f32:
+define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %load = load i32 addrspace(1)* %in, align 4
+ %inreg = and i32 %load, 65280
+ %shr = lshr i32 %inreg, 8
+ %cvt = uitofp i32 %shr to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; We don't catch these cases because of the zext, but instcombine removes
+; them, so it shouldn't really matter.
+define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+ %load = load i8 addrspace(1)* %in, align 1
+ %ext = zext i8 %load to i32
+ %cvt = uitofp i32 %ext to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8> addrspace(1)* %in, align 1
+ %ext = zext <4 x i8> %load to <4 x i32>
+ %cvt = uitofp <4 x i32> %ext to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll
new file mode 100644
index 0000000..214b2c2
--- /dev/null
+++ b/test/CodeGen/R600/default-fp-mode.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @test_kernel
+; SI: FloatMode: 240
+; SI: IeeeMode: 0
+define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll
index b8b945f..458363a 100644
--- a/test/CodeGen/R600/fceil.ll
+++ b/test/CodeGen/R600/fceil.ll
@@ -1,84 +1,131 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare double @llvm.ceil.f64(double) nounwind readnone
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
-declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone
-declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
-declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
-declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
+declare float @llvm.ceil.f32(float) nounwind readnone
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
-; CI-LABEL: @fceil_f64:
-; CI: V_CEIL_F64_e32
-define void @fceil_f64(double addrspace(1)* %out, double %x) {
- %y = call double @llvm.ceil.f64(double %x) nounwind readnone
- store double %y, double addrspace(1)* %out
+; FUNC-LABEL: @fceil_f32:
+; SI: V_CEIL_F32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_f32(float addrspace(1)* %out, float %x) {
+ %y = call float @llvm.ceil.f32(float %x) nounwind readnone
+ store float %y, float addrspace(1)* %out
ret void
}
-; CI-LABEL: @fceil_v2f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
- %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
- store <2 x double> %y, <2 x double> addrspace(1)* %out
+; FUNC-LABEL: @fceil_v2f32:
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+ %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
+ store <2 x float> %y, <2 x float> addrspace(1)* %out
ret void
}
-; FIXME-CI-LABEL: @fceil_v3f64:
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
-; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
-; store <3 x double> %y, <3 x double> addrspace(1)* %out
-; ret void
-; }
+; FUNC-LABEL: @fceil_v3f32:
+; FIXME-SI: V_CEIL_F32_e32
+; FIXME-SI: V_CEIL_F32_e32
+; FIXME-SI: V_CEIL_F32_e32
+; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+ %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
+ store <3 x float> %y, <3 x float> addrspace(1)* %out
+ ret void
+}
-; CI-LABEL: @fceil_v4f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
- %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
- store <4 x double> %y, <4 x double> addrspace(1)* %out
+; FUNC-LABEL: @fceil_v4f32:
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+ %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
+ store <4 x float> %y, <4 x float> addrspace(1)* %out
ret void
}
-; CI-LABEL: @fceil_v8f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
- %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
- store <8 x double> %y, <8 x double> addrspace(1)* %out
+; FUNC-LABEL: @fceil_v8f32:
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+ %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
+ store <8 x float> %y, <8 x float> addrspace(1)* %out
ret void
}
-; CI-LABEL: @fceil_v16f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
- %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
- store <16 x double> %y, <16 x double> addrspace(1)* %out
+; FUNC-LABEL: @fceil_v16f32:
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; SI: V_CEIL_F32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+ %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
+ store <16 x float> %y, <16 x float> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
new file mode 100644
index 0000000..b42aefa
--- /dev/null
+++ b/test/CodeGen/R600/fceil64.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.ceil.f64(double) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: @fceil_f64:
+; CI: V_CEIL_F64_e32
+; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: S_LSHR_B64
+; SI: S_NOT_B64
+; SI: S_AND_B64
+; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: CMP_LT_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: CMP_GT_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: CMP_GT_F64
+; SI: CNDMASK_B32
+; SI: CMP_NE_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: V_ADD_F64
+define void @fceil_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.ceil.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fceil_v2f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-FUNC-LABEL: @fceil_v3f64:
+; FIXME-CI: V_CEIL_F64_e32
+; FIXME-CI: V_CEIL_F64_e32
+; FIXME-CI: V_CEIL_F64_e32
+; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: @fceil_v4f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fceil_v8f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fceil_v16f64:
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+; CI: V_CEIL_F64_e32
+define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll
new file mode 100644
index 0000000..7b4425b
--- /dev/null
+++ b/test/CodeGen/R600/fcopysign.f32.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+; Identify the sign argument by its higher kernarg offset (0xc) vs. the magnitude (0xb).
+; FUNC-LABEL: @test_copysign_f32:
+; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc
+; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
+; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+
+; EG: BFI_INT
+define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+ %result = call float @llvm.copysign.f32(float %mag, float %sign)
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
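+
+; Editor's note (hedged sketch): the V_BFI_B32 with mask 0x7fffffff is the
+; classic integer copysign: keep the magnitude's low 31 bits, take the sign's
+; top bit. An IR equivalent of the selected pattern:
+;   %m = bitcast float %mag to i32
+;   %s = bitcast float %sign to i32
+;   %lo = and i32 %m, 2147483647    ; 0x7fffffff
+;   %hi = and i32 %s, -2147483648   ; 0x80000000
+;   %r = or i32 %lo, %hi
+;   %res = bitcast i32 %r to float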
+
+; FUNC-LABEL: @test_copysign_v2f32:
+; SI: S_ENDPGM
+
+; EG: BFI_INT
+; EG: BFI_INT
+define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+ %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
+ store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_copysign_v4f32:
+; SI: S_ENDPGM
+
+; EG: BFI_INT
+; EG: BFI_INT
+; EG: BFI_INT
+; EG: BFI_INT
+define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+ %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
+ store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll
new file mode 100644
index 0000000..ea7a6db
--- /dev/null
+++ b/test/CodeGen/R600/fcopysign.f64.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.copysign.f64(double, double) nounwind readnone
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
+
+; FUNC-LABEL: @test_copysign_f64:
+; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; SI: V_MOV_B32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; SI: S_ENDPGM
+define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+ %result = call double @llvm.copysign.f64(double %mag, double %sign)
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_copysign_v2f64:
+; SI: S_ENDPGM
+define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+ %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
+ store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_copysign_v4f64:
+; SI: S_ENDPGM
+define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+ %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
+ store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 51d2b89..31c6116 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare double @llvm.floor.f64(double) nounwind readnone
declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
@@ -7,15 +8,34 @@ declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
-; CI-LABEL: @ffloor_f64:
+; FUNC-LABEL: @ffloor_f64:
; CI: V_FLOOR_F64_e32
+
+; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: S_LSHR_B64
+; SI: S_NOT_B64
+; SI: S_AND_B64
+; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: CMP_LT_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: CMP_GT_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: CMP_LT_F64
+; SI: CNDMASK_B32
+; SI: CMP_NE_I32
+; SI: CNDMASK_B32
+; SI: CNDMASK_B32
+; SI: V_ADD_F64
define void @ffloor_f64(double addrspace(1)* %out, double %x) {
%y = call double @llvm.floor.f64(double %x) nounwind readnone
store double %y, double addrspace(1)* %out
ret void
}
-; CI-LABEL: @ffloor_v2f64:
+; FUNC-LABEL: @ffloor_v2f64:
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
@@ -24,7 +44,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
ret void
}
-; FIXME-CI-LABEL: @ffloor_v3f64:
+; FIXME-FUNC-LABEL: @ffloor_v3f64:
; FIXME-CI: V_FLOOR_F64_e32
; FIXME-CI: V_FLOOR_F64_e32
; FIXME-CI: V_FLOOR_F64_e32
@@ -34,7 +54,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
; ret void
; }
-; CI-LABEL: @ffloor_v4f64:
+; FUNC-LABEL: @ffloor_v4f64:
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
@@ -45,7 +65,7 @@ define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
ret void
}
-; CI-LABEL: @ffloor_v8f64:
+; FUNC-LABEL: @ffloor_v8f64:
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
@@ -60,7 +80,7 @@ define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
ret void
}
-; CI-LABEL: @ffloor_v16f64:
+; FUNC-LABEL: @ffloor_v16f64:
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
; CI: V_FLOOR_F64_e32
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
index 51e9d29..d72ffec 100644
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll
@@ -1,8 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; CHECK: @fma_f32
-; CHECK: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+; FUNC-LABEL: @fma_f32
+; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) {
%r0 = load float addrspace(1)* %in1
@@ -13,11 +20,36 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
ret void
}
-declare float @llvm.fma.f32(float, float, float)
+; FUNC-LABEL: @fma_v2f32
+; SI: V_FMA_F32
+; SI: V_FMA_F32
+define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+ <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
+ %r0 = load <2 x float> addrspace(1)* %in1
+ %r1 = load <2 x float> addrspace(1)* %in2
+ %r2 = load <2 x float> addrspace(1)* %in3
+ %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+ store <2 x float> %r3, <2 x float> addrspace(1)* %out
+ ret void
+}
-; CHECK: @fma_f64
-; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; FUNC-LABEL: @fma_v4f32
+; SI: V_FMA_F32
+; SI: V_FMA_F32
+; SI: V_FMA_F32
+; SI: V_FMA_F32
+define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+ <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
+ %r0 = load <4 x float> addrspace(1)* %in1
+ %r1 = load <4 x float> addrspace(1)* %in2
+ %r2 = load <4 x float> addrspace(1)* %in3
+ %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+ store <4 x float> %r3, <4 x float> addrspace(1)* %out
+ ret void
+}
+; FUNC-LABEL: @fma_f64
+; SI: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double addrspace(1)* %in1
@@ -28,4 +60,30 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
ret void
}
-declare double @llvm.fma.f64(double, double, double)
+; FUNC-LABEL: @fma_v2f64
+; SI: V_FMA_F64
+; SI: V_FMA_F64
+define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+ <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
+ %r0 = load <2 x double> addrspace(1)* %in1
+ %r1 = load <2 x double> addrspace(1)* %in2
+ %r2 = load <2 x double> addrspace(1)* %in3
+ %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
+ store <2 x double> %r3, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fma_v4f64
+; SI: V_FMA_F64
+; SI: V_FMA_F64
+; SI: V_FMA_F64
+; SI: V_FMA_F64
+define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+ <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
+ %r0 = load <4 x double> addrspace(1)* %in1
+ %r1 = load <4 x double> addrspace(1)* %in2
+ %r2 = load <4 x double> addrspace(1)* %in3
+ %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
+ store <4 x double> %r3, <4 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fnearbyint.ll b/test/CodeGen/R600/fnearbyint.ll
new file mode 100644
index 0000000..1c1d731
--- /dev/null
+++ b/test/CodeGen/R600/fnearbyint.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s
+
+; This should have exactly the same output as the test for rint,
+; so no need to check anything.
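+; (Hedged editor's note: nearbyint differs from rint only in that it never
+; raises FE_INEXACT, which this target does not model, so both are expected
+; to select the identical rounding sequence.)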
+
+declare float @llvm.nearbyint.f32(float) #0
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0
+declare double @llvm.nearbyint.f64(double) #0
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
+declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
+
+
+define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %0 = call float @llvm.nearbyint.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+entry:
+ %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_f64(double addrspace(1)* %out, double %in) {
+entry:
+ %0 = call double @llvm.nearbyint.f64(double %in)
+ store double %0, double addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+entry:
+ %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
+ store <2 x double> %0, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+entry:
+ %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
+ store <4 x double> %0, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/fp16_to_fp32.ll b/test/CodeGen/R600/fp16_to_fp32.ll
new file mode 100644
index 0000000..fa2e379
--- /dev/null
+++ b/test/CodeGen/R600/fp16_to_fp32.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+
+; SI-LABEL: @test_convert_fp32_to_fp16:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]]
+; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %val = load float addrspace(1)* %in, align 4
+ %cvt = call i16 @llvm.convert.to.fp16(float %val) nounwind readnone
+ store i16 %cvt, i16 addrspace(1)* %out, align 2
+ ret void
+}
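+
+; Editor's note (hedged): half is carried as i16 at this point, which is why
+; @llvm.convert.to.fp16 returns an i16 that gets stored with
+; BUFFER_STORE_SHORT; the inverse @llvm.convert.from.fp16 (fp32_to_fp16.ll)
+; takes the i16 back. A round-trip sketch:
+;   %h = call i16 @llvm.convert.to.fp16(float %val)
+;   %f = call float @llvm.convert.from.fp16(i16 %h)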
diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll
new file mode 100644
index 0000000..9997cd3
--- /dev/null
+++ b/test/CodeGen/R600/fp32_to_fp16.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.convert.from.fp16(i16) nounwind readnone
+
+; SI-LABEL: @test_convert_fp16_to_fp32:
+; SI: BUFFER_LOAD_USHORT [[VAL:v[0-9]+]]
+; SI: V_CVT_F32_F16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_DWORD [[RESULT]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16 addrspace(1)* %in, align 2
+ %cvt = call float @llvm.convert.from.fp16(i16 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint_i64.ll b/test/CodeGen/R600/fp_to_sint_i64.ll
new file mode 100644
index 0000000..ec3e198
--- /dev/null
+++ b/test/CodeGen/R600/fp_to_sint_i64.ll
@@ -0,0 +1,12 @@
+; FIXME: Merge into fp_to_sint.ll when EG/NI supports 64-bit types
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; SI-LABEL: @fp_to_sint_i64
+; Check that the compiler doesn't crash with a "cannot select" error
+; SI: S_ENDPGM
+define void @fp_to_sint_i64(i64 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fptosi float %in to i64
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index 1445a20..f5e5708 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-
-; CHECK: @fsub_f64
-; CHECK: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}, 0, 0, 0, 0, 2
+; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; SI-LABEL: @fsub_f64:
+; SI: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
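+
+; Editor's note (hedged): SI has no V_SUB_F64; fsub is expected to select as
+; V_ADD_F64 with the negate source modifier on the second operand
+; (a - b computed as a + (-b)), hence the "-v[...]" in the check above.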
define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
%r0 = load double addrspace(1)* %in1
diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll
index 6b235ff..0d7d467 100644
--- a/test/CodeGen/R600/ftrunc.ll
+++ b/test/CodeGen/R600/ftrunc.ll
@@ -1,84 +1,119 @@
-; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
-declare double @llvm.trunc.f64(double) nounwind readnone
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
-declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone
-declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
-declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone
-declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
+declare float @llvm.trunc.f32(float) nounwind readnone
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
-; CI-LABEL: @ftrunc_f64:
-; CI: V_TRUNC_F64_e32
-define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
- %y = call double @llvm.trunc.f64(double %x) nounwind readnone
- store double %y, double addrspace(1)* %out
+; FUNC-LABEL: @ftrunc_f32:
+; EG: TRUNC
+; SI: V_TRUNC_F32_e32
+define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+ %y = call float @llvm.trunc.f32(float %x) nounwind readnone
+ store float %y, float addrspace(1)* %out
ret void
}
-; CI-LABEL: @ftrunc_v2f64:
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
- %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
- store <2 x double> %y, <2 x double> addrspace(1)* %out
+; FUNC-LABEL: @ftrunc_v2f32:
+; EG: TRUNC
+; EG: TRUNC
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+ %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
+ store <2 x float> %y, <2 x float> addrspace(1)* %out
ret void
}
-; FIXME-CI-LABEL: @ftrunc_v3f64:
-; FIXME-CI: V_TRUNC_F64_e32
-; FIXME-CI: V_TRUNC_F64_e32
-; FIXME-CI: V_TRUNC_F64_e32
-; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
-; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
-; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; FIXME-FUNC-LABEL: @ftrunc_v3f32:
+; FIXME-EG: TRUNC
+; FIXME-EG: TRUNC
+; FIXME-EG: TRUNC
+; FIXME-SI: V_TRUNC_F32_e32
+; FIXME-SI: V_TRUNC_F32_e32
+; FIXME-SI: V_TRUNC_F32_e32
+; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
+; store <3 x float> %y, <3 x float> addrspace(1)* %out
; ret void
; }
-; CI-LABEL: @ftrunc_v4f64:
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
- %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
- store <4 x double> %y, <4 x double> addrspace(1)* %out
+; FUNC-LABEL: @ftrunc_v4f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+ %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
+ store <4 x float> %y, <4 x float> addrspace(1)* %out
ret void
}
-; CI-LABEL: @ftrunc_v8f64:
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
- %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
- store <8 x double> %y, <8 x double> addrspace(1)* %out
+; FUNC-LABEL: @ftrunc_v8f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+ %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
+ store <8 x float> %y, <8 x float> addrspace(1)* %out
ret void
}
-; CI-LABEL: @ftrunc_v16f64:
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-; CI: V_TRUNC_F64_e32
-define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
- %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
- store <16 x double> %y, <16 x double> addrspace(1)* %out
+; FUNC-LABEL: @ftrunc_v16f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+; SI: V_TRUNC_F32_e32
+define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+ %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
+ store <16 x float> %y, <16 x float> addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index 0176061..db64a6f 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -6,7 +6,7 @@
; XXX: Test on SI once 64-bit adds are supported.
-@float_gv = internal addrspace(2) unnamed_addr constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
+@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
; FUNC-LABEL: @float
@@ -25,7 +25,7 @@ entry:
ret void
}
-@i32_gv = internal addrspace(2) unnamed_addr constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
+@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
; FUNC-LABEL: @i32
@@ -47,7 +47,7 @@ entry:
%struct.foo = type { float, [5 x i32] }
-@struct_foo_gv = internal addrspace(2) unnamed_addr constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
+@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
; FUNC-LABEL: @struct_foo_gv_load
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index 4d1f734..b127b7e 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -3,10 +3,8 @@
declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
; SI-LABEL: @private_access_f64_alloca:
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
+; SI: DS_WRITE_B64
+; SI: DS_READ_B64
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load double addrspace(1)* %in, align 8
%array = alloca double, i32 16, align 8
@@ -19,14 +17,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
}
; SI-LABEL: @private_access_v2f64_alloca:
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
+; SI: DS_WRITE_B64
+; SI: DS_WRITE_B64
+; SI: DS_READ_B64
+; SI: DS_READ_B64
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load <2 x double> addrspace(1)* %in, align 16
%array = alloca <2 x double>, i32 16, align 16
@@ -39,10 +33,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
}
; SI-LABEL: @private_access_i64_alloca:
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
+; SI: DS_WRITE_B64
+; SI: DS_READ_B64
define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load i64 addrspace(1)* %in, align 8
%array = alloca i64, i32 16, align 8
@@ -55,14 +47,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
}
; SI-LABEL: @private_access_v2i64_alloca:
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELD_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
-; SI: V_MOVRELS_B32_e32
+; SI: DS_WRITE_B64
+; SI: DS_WRITE_B64
+; SI: DS_READ_B64
+; SI: DS_READ_B64
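+;
+; Editor's note (hedged): the updated checks expect indirectly indexed private
+; allocas to be lowered to 64-bit local-memory traffic (one DS_WRITE_B64 /
+; DS_READ_B64 pair per 64-bit element) rather than the older V_MOVREL
+; register-indexing sequence.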
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
%val = load <2 x i64> addrspace(1)* %in, align 16
%array = alloca <2 x i64>, i32 16, align 16
diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll
new file mode 100644
index 0000000..13bfbab
--- /dev/null
+++ b/test/CodeGen/R600/input-mods.ll
@@ -0,0 +1,26 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+
+;EG-CHECK-LABEL: @test
+;EG-CHECK: EXP_IEEE *
+;CM-CHECK-LABEL: @test
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
+;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = call float @llvm.fabs.f32(float %r0)
+ %r2 = fsub float -0.000000e+00, %r1
+ %r3 = call float @llvm.exp2.f32(float %r2)
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll
new file mode 100644
index 0000000..d8be6d4
--- /dev/null
+++ b/test/CodeGen/R600/large-alloca.ll
@@ -0,0 +1,14 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=SI < %s
+
+define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
+ %large = alloca [8192 x i32], align 4
+ %gep = getelementptr [8192 x i32]* %large, i32 0, i32 8191
+ store i32 %x, i32* %gep
+ %gep1 = getelementptr [8192 x i32]* %large, i32 0, i32 %y
+ %0 = load i32* %gep1
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
new file mode 100644
index 0000000..552cd05
--- /dev/null
+++ b/test/CodeGen/R600/large-constant-initializer.ll
@@ -0,0 +1,19 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=SI < %s
+
+@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
+
+define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
+ %val = load i32 addrspace(2)* getelementptr ([239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
+ %mul12 = mul nsw i32 %val, 7
+ br i1 undef, label %exit, label %bb
+
+bb:
+ %cmp = icmp slt i32 %x, 0
+ br label %exit
+
+exit:
+ ret void
+}
+
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
index af0db0d..d5dc061 100644
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -8,7 +8,7 @@
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4
+@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] [i32 1, i32 2], align 4
define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
entry:
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
index 2185180..9182e25 100644
--- a/test/CodeGen/R600/lds-size.ll
+++ b/test/CodeGen/R600/lds-size.ll
@@ -6,7 +6,7 @@
; CHECK-LABEL: @test
; CHECK: .long 166120
; CHECK-NEXT: .long 1
-@lds = internal addrspace(3) unnamed_addr global i32 zeroinitializer, align 4
+@lds = internal unnamed_addr addrspace(3) global i32 zeroinitializer, align 4
define void @test(i32 addrspace(1)* %out, i32 %cond) {
entry:
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
index 2d8930a..ad9ce25 100644
--- a/test/CodeGen/R600/lit.local.cfg
+++ b/test/CodeGen/R600/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'R600' in targets:
+if not 'R600' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
new file mode 100644
index 0000000..a0a47b7
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
+
+; Legacy name
+declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
+
+; FUNC-LABEL: @s_abs_i32
+; SI: S_SUB_I32
+; SI: S_MAX_I32
+; SI: S_ENDPGM
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
+ %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
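+
+; Editor's note (hedged sketch): the SUB/MAX pair is branchless abs,
+; max(x, 0 - x). In IR terms:
+;   %neg = sub i32 0, %src
+;   %cmp = icmp sgt i32 %src, %neg
+;   %abs = select i1 %cmp, i32 %src, i32 %neg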
+
+; FUNC-LABEL: @v_abs_i32
+; SI: V_SUB_I32_e32
+; SI: V_MAX_I32_e32
+; SI: S_ENDPGM
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+ %val = load i32 addrspace(1)* %src, align 4
+ %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @abs_i32_legacy_amdil
+; SI: V_SUB_I32_e32
+; SI: V_MAX_I32_e32
+; SI: S_ENDPGM
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+ %val = load i32 addrspace(1)* %src, align 4
+ %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
new file mode 100644
index 0000000..68a5ad0
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
+
+; FUNC-LABEL: @s_brev_i32:
+; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
+; SI: S_BREV_B32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI: S_ENDPGM
+define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %brev = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+ store i32 %brev, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_brev_i32:
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_BFREV_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32 addrspace(1)* %valptr, align 4
+ %brev = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+ store i32 %brev, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
new file mode 100644
index 0000000..d608953
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
+declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
+
+; FUNC-LABEL: @clamp_0_1_f32
+; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
+; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
+; SI: BUFFER_STORE_DWORD [[RESULT]]
+; SI: S_ENDPGM
+
+; EG: MOV_SAT
+define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+ %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
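+
+; Editor's note (hedged): the trailing "0, 1, 0" on V_ADD_F32_e64 are VOP3
+; modifier operands; the pattern presumably encodes "add 0.0 with the clamp
+; bit set", saturating the result to [0.0, 1.0] without an explicit
+; min/max pair.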
+
+; FUNC-LABEL: @clamp_0_1_amdil_legacy_f32
+; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
+; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
+; SI: BUFFER_STORE_DWORD [[RESULT]]
+define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
+ %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
new file mode 100644
index 0000000..6facb47
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
+
+; SI-LABEL: @test_unpack_byte0_to_float:
+; SI: V_CVT_F32_UBYTE0
+define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_unpack_byte1_to_float:
+; SI: V_CVT_F32_UBYTE1
+define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_unpack_byte2_to_float:
+; SI: V_CVT_F32_UBYTE2
+define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_unpack_byte3_to_float:
+; SI: V_CVT_F32_UBYTE3
+define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
new file mode 100644
index 0000000..c8c7357
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
+declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
+
+; SI-LABEL: @test_div_fixup_f32:
+; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+ %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_div_fixup_f64:
+; SI: V_DIV_FIXUP_F64
+define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+ %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
new file mode 100644
index 0000000..4f1e827
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
+declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
+
+; SI-LABEL: @test_div_fmas_f32:
+; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_div_fmas_f64:
+; SI: V_DIV_FMAS_F64
+define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+ %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
new file mode 100644
index 0000000..527c8da
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+
+; SI-LABEL: @test_div_scale_f32_1:
+; SI: V_DIV_SCALE_F32
+define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
+ %a = load float addrspace(1)* %aptr, align 4
+ %b = load float addrspace(1)* %bptr, align 4
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_div_scale_f32_2:
+; SI: V_DIV_SCALE_F32
+define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
+ %a = load float addrspace(1)* %aptr, align 4
+ %b = load float addrspace(1)* %bptr, align 4
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @test_div_scale_f64_1:
+; SI: V_DIV_SCALE_F64
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
+ %a = load double addrspace(1)* %aptr, align 8
+ %b = load double addrspace(1)* %bptr, align 8
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @test_div_scale_f64_2:
+; SI: V_DIV_SCALE_F64
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
+ %a = load double addrspace(1)* %aptr, align 8
+ %b = load double addrspace(1)* %bptr, align 8
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
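+
+; Editor's note (hedged): div_scale is one leg of the hardware-assisted FP
+; division expansion: it pre-scales numerator/denominator (the i1 result
+; records which operand was scaled), a Newton-Raphson refinement runs on the
+; scaled values, and div_fmas/div_fixup (tested separately) unscale the result
+; and patch up infinities, NaNs and denormals.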
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
new file mode 100644
index 0000000..72ec1c5
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
+
+; Legacy name
+declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
+
+; FUNC-LABEL: @fract_f32
+; SI: V_FRACT_F32
+; EG: FRACT
+define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float addrspace(1)* %src, align 4
+ %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fract_f32_legacy_amdil
+; SI: V_FRACT_F32
+; EG: FRACT
+define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float addrspace(1)* %src, align 4
+ %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
new file mode 100644
index 0000000..51964ee
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
+
+; FUNC-LABEL: @rsq_legacy_f32
+; SI: V_RSQ_LEGACY_F32_e32
+; EG: RECIPSQRT_IEEE
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
new file mode 100644
index 0000000..ca5260d
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; FUNC-LABEL: @rcp_f32
+; SI: V_RCP_F32_e32
+define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @rcp_f64
+; SI: V_RCP_F64_e32
+define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @rcp_pat_f32
+; SI: V_RCP_F32_e32
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @rcp_pat_f64
+; SI: V_RCP_F64_e32
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
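+
+; Editor's note (hedged): the two "pat" tests above intentionally use plain IR
+; rather than the intrinsic, checking that fdiv 1.0, x selects V_RCP directly;
+; the tests below likewise expect rcp(sqrt(x)) to fuse into a single V_RSQ.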
+
+; FUNC-LABEL: @rsq_rcp_pat_f32
+; SI: V_RSQ_F32_e32
+define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
+ %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @rsq_rcp_pat_f64
+; SI: V_RSQ_F64_e32
+define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+ %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
+ %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
new file mode 100644
index 0000000..100d6ff
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
+
+; FUNC-LABEL: @rsq_clamped_f64
+; SI: V_RSQ_CLAMP_F64_e32
+define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
+ store double %rsq_clamped, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
new file mode 100644
index 0000000..683df73
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
+
+; FUNC-LABEL: @rsq_clamped_f32
+; SI: V_RSQ_CLAMP_F32_e32
+; EG: RECIPSQRT_CLAMPED
+define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
+ store float %rsq_clamped, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
new file mode 100644
index 0000000..27cf6b2
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
+
+; FUNC-LABEL: @rsq_f32
+; SI: V_RSQ_F32_e32
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
new file mode 100644
index 0000000..1c736d4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
+
+; SI-LABEL: @test_trig_preop_f64:
+; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]]
+; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load double addrspace(1)* %aptr, align 8
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @test_trig_preop_f64_imm_segment:
+; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+ %a = load double addrspace(1)* %aptr, align 8
+ %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
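+
+; Editor's note (hedged): V_TRIG_PREOP_F64 returns a 53-bit segment of 2/pi
+; selected by the second operand, used for large-argument trigonometric range
+; reduction; the second test additionally checks that a constant segment (7)
+; is encoded as an inline immediate instead of being loaded into a register.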
diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll
new file mode 100644
index 0000000..8402faa
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.gather4.ll
@@ -0,0 +1,508 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
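+; Editor's note (hedged): the suffixes follow the GCN image opcodes: _CL
+; clamps the LOD, _L takes an explicit LOD, _B applies an LOD bias, _LZ forces
+; LOD zero, and _O adds a packed texel-offset operand; the _v2/_v8 variants
+; only change the width of the address vector.
+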
+;CHECK-LABEL: @gather4_v2
+;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4
+;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_cl
+;CHECK: IMAGE_GATHER4_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_l
+;CHECK: IMAGE_GATHER4_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b
+;CHECK: IMAGE_GATHER4_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b_cl
+;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b_cl_v8
+;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_lz_v2
+;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_lz
+;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: @gather4_o
+;CHECK: IMAGE_GATHER4_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_cl_o
+;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_cl_o_v8
+;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_l_o
+;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_l_o_v8
+;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b_o
+;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b_o_v8
+;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_b_cl_o
+;CHECK: IMAGE_GATHER4_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_lz_o
+;CHECK: IMAGE_GATHER4_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: @gather4_c
+;CHECK: IMAGE_GATHER4_C {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_cl
+;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_cl_v8
+;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_l
+;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_l_v8
+;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_b
+;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_b_v8
+;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_b_cl
+;CHECK: IMAGE_GATHER4_C_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_lz
+;CHECK: IMAGE_GATHER4_C_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: @gather4_c_o
+;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_o_v8
+;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_cl_o
+;CHECK: IMAGE_GATHER4_C_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_l_o
+;CHECK: IMAGE_GATHER4_C_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_b_o
+;CHECK: IMAGE_GATHER4_C_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_b_cl_o
+;CHECK: IMAGE_GATHER4_C_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_lz_o
+;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: @gather4_c_lz_o_v8
+;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll
new file mode 100644
index 0000000..a7a17ec
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.getlod.ll
@@ -0,0 +1,44 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: @getlod
+;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+;CHECK-LABEL: @getlod_v2
+;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+;CHECK-LABEL: @getlod_v4
+;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod_v4() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll
index 13bfbab..119d5ef 100644
--- a/test/CodeGen/R600/llvm.exp2.ll
+++ b/test/CodeGen/R600/llvm.exp2.ll
@@ -1,26 +1,79 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
-;EG-CHECK-LABEL: @test
-;EG-CHECK: EXP_IEEE *
-;CM-CHECK-LABEL: @test
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
-;CM-CHECK: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+;FUNC-LABEL: @test
+;EG-CHECK: EXP_IEEE
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_EXP_F32
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = call float @llvm.fabs.f32(float %r0)
- %r2 = fsub float -0.000000e+00, %r1
- %r3 = call float @llvm.exp2.f32(float %r2)
- %vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.exp2.f32(float %in)
+ store float %0, float addrspace(1)* %out
ret void
}
-declare float @llvm.exp2.f32(float) readnone
-declare float @llvm.fabs.f32(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+;FUNC-LABEL: @testv2
+;EG-CHECK: EXP_IEEE
+;EG-CHECK: EXP_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_EXP_F32
+;SI-CHECK: V_EXP_F32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
-attributes #0 = { "ShaderType"="0" }
+;FUNC-LABEL: @testv4
+;EG-CHECK: EXP_IEEE
+;EG-CHECK: EXP_IEEE
+;EG-CHECK: EXP_IEEE
+;EG-CHECK: EXP_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_EXP_F32
+;SI-CHECK: V_EXP_F32
+;SI-CHECK: V_EXP_F32
+;SI-CHECK: V_EXP_F32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll
new file mode 100644
index 0000000..4cba2d4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.log2.ll
@@ -0,0 +1,79 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+
+;FUNC-LABEL: @test
+;EG-CHECK: LOG_IEEE
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_LOG_F32
+
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.log2.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: @testv2
+;EG-CHECK: LOG_IEEE
+;EG-CHECK: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_LOG_F32
+;SI-CHECK: V_LOG_F32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: @testv4
+;EG-CHECK: LOG_IEEE
+;EG-CHECK: LOG_IEEE
+;EG-CHECK: LOG_IEEE
+;EG-CHECK: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI-CHECK: V_LOG_F32
+;SI-CHECK: V_LOG_F32
+;SI-CHECK: V_LOG_F32
+;SI-CHECK: V_LOG_F32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.log2.f32(float) readnone
+declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
index a7a909a..3e2884b 100644
--- a/test/CodeGen/R600/llvm.rint.f64.ll
+++ b/test/CodeGen/R600/llvm.rint.f64.ll
@@ -1,30 +1,38 @@
; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; FUNC-LABEL: @f64
+; FUNC-LABEL: @rint_f64
; CI: V_RNDNE_F64_e32
-define void @f64(double addrspace(1)* %out, double %in) {
+
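+; SI has no V_RNDNE_F64 (the instruction only appears on CI), so f64 rint is
+; expanded inline into the add/compare/select sequence checked below.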
+; SI-DAG: V_ADD_F64
+; SI-DAG: V_ADD_F64
+; SI-DAG: V_CMP_GT_F64_e64
+; SI: V_CNDMASK_B32
+; SI: V_CNDMASK_B32
+; SI: S_ENDPGM
+define void @rint_f64(double addrspace(1)* %out, double %in) {
entry:
%0 = call double @llvm.rint.f64(double %in)
store double %0, double addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v2f64
+; FUNC-LABEL: @rint_v2f64
; CI: V_RNDNE_F64_e32
; CI: V_RNDNE_F64_e32
-define void @v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
entry:
%0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
store <2 x double> %0, <2 x double> addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v4f64
+; FUNC-LABEL: @rint_v4f64
; CI: V_RNDNE_F64_e32
; CI: V_RNDNE_F64_e32
; CI: V_RNDNE_F64_e32
; CI: V_RNDNE_F64_e32
-define void @v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
entry:
%0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
index db8352f..209bb43 100644
--- a/test/CodeGen/R600/llvm.rint.ll
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -1,31 +1,31 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; FUNC-LABEL: @f32
+; FUNC-LABEL: @rint_f32
; R600: RNDNE
; SI: V_RNDNE_F32_e32
-define void @f32(float addrspace(1)* %out, float %in) {
+define void @rint_f32(float addrspace(1)* %out, float %in) {
entry:
- %0 = call float @llvm.rint.f32(float %in)
+ %0 = call float @llvm.rint.f32(float %in) #0
store float %0, float addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v2f32
+; FUNC-LABEL: @rint_v2f32
; R600: RNDNE
; R600: RNDNE
; SI: V_RNDNE_F32_e32
; SI: V_RNDNE_F32_e32
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
- %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in)
+ %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
store <2 x float> %0, <2 x float> addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v4f32
+; FUNC-LABEL: @rint_v4f32
; R600: RNDNE
; R600: RNDNE
; R600: RNDNE
@@ -35,15 +35,27 @@ entry:
; SI: V_RNDNE_F32_e32
; SI: V_RNDNE_F32_e32
; SI: V_RNDNE_F32_e32
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
- %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in)
+ %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
store <4 x float> %0, <4 x float> addrspace(1)* %out
ret void
}
+; FUNC-LABEL: @legacy_amdil_round_nearest_f32
+; R600: RNDNE
+
+; SI: V_RNDNE_F32_e32
+define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.AMDIL.round.nearest.f32(float) #0
declare float @llvm.rint.f32(float) #0
declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
-attributes #0 = { nounwind readonly }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index 1486c4d..a57df5c 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -696,8 +696,7 @@ entry:
; R600-CHECK: LDS_READ_RET
; R600-CHECK: LDS_READ_RET
; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B32
-; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B64
define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
entry:
%0 = load <2 x float> addrspace(3)* %in
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
new file mode 100644
index 0000000..5a44951
--- /dev/null
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -0,0 +1,254 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @lds_atomic_xchg_ret_i32:
+; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
+; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
+; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: DS_WRXCHG_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xchg_ret_i32_offset:
+; SI: DS_WRXCHG_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XXX - Is it really necessary to load 4 into VGPR?
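+; (DS data operands have to live in VGPRs, so the constant 4 is first
+; materialized with V_MOV_B32.)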
+; FUNC-LABEL: @lds_atomic_add_ret_i32:
+; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
+; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
+; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: DS_ADD_RTN_U32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_add_ret_i32_offset:
+; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_inc_ret_i32:
+; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
+; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
+; SI: S_ENDPGM
+define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_inc_ret_i32_offset:
+; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
+; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_sub_ret_i32:
+; SI: DS_SUB_RTN_U32
+; SI: S_ENDPGM
+define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_sub_ret_i32_offset:
+; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_dec_ret_i32:
+; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
+; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: DS_DEC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
+; SI: S_ENDPGM
+define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_dec_ret_i32_offset:
+; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
+; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: DS_DEC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_and_ret_i32:
+; SI: DS_AND_RTN_B32
+; SI: S_ENDPGM
+define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_and_ret_i32_offset:
+; SI: DS_AND_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_or_ret_i32:
+; SI: DS_OR_RTN_B32
+; SI: S_ENDPGM
+define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_or_ret_i32_offset:
+; SI: DS_OR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xor_ret_i32:
+; SI: DS_XOR_RTN_B32
+; SI: S_ENDPGM
+define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xor_ret_i32_offset:
+; SI: DS_XOR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: @lds_atomic_nand_ret_i32:
+; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; store i32 %result, i32 addrspace(1)* %out, align 4
+; ret void
+; }
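+;
+; (Illustrative sketch, not part of this patch: nand(x, y) = ~(x & y), so the
+; operation could be expanded into a cmpxchg retry loop along these lines,
+; assuming the old-style scalar-result cmpxchg syntax:)
+;
+; entry:
+;   %init = load i32 addrspace(3)* %ptr
+;   br label %loop
+; loop:
+;   %cur = phi i32 [ %init, %entry ], [ %seen, %loop ]
+;   %and = and i32 %cur, 4
+;   %nand = xor i32 %and, -1
+;   %seen = cmpxchg i32 addrspace(3)* %ptr, i32 %cur, i32 %nand seq_cst seq_cst
+;   %done = icmp eq i32 %seen, %cur
+;   br i1 %done, label %exit, label %loop
+; exit:
+;   store i32 %seen, i32 addrspace(1)* %out, align 4
+;   ret void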
+
+; FUNC-LABEL: @lds_atomic_min_ret_i32:
+; SI: DS_MIN_RTN_I32
+; SI: S_ENDPGM
+define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_min_ret_i32_offset:
+; SI: DS_MIN_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_max_ret_i32:
+; SI: DS_MAX_RTN_I32
+; SI: S_ENDPGM
+define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_max_ret_i32_offset:
+; SI: DS_MAX_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umin_ret_i32:
+; SI: DS_MIN_RTN_U32
+; SI: S_ENDPGM
+define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umin_ret_i32_offset:
+; SI: DS_MIN_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umax_ret_i32:
+; SI: DS_MAX_RTN_U32
+; SI: S_ENDPGM
+define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umax_ret_i32_offset:
+; SI: DS_MAX_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: S_ENDPGM
+define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
new file mode 100644
index 0000000..849b033
--- /dev/null
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -0,0 +1,251 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @lds_atomic_xchg_ret_i64:
+; SI: DS_WRXCHG_RTN_B64
+; SI: S_ENDPGM
+define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xchg_ret_i64_offset:
+; SI: DS_WRXCHG_RTN_B64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_add_ret_i64:
+; SI: DS_ADD_RTN_U64
+; SI: S_ENDPGM
+define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_add_ret_i64_offset:
+; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
+; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI: DS_ADD_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}, 0x20, [M0]
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_inc_ret_i64:
+; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: DS_INC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_inc_ret_i64_offset:
+; SI: DS_INC_RTN_U64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_sub_ret_i64:
+; SI: DS_SUB_RTN_U64
+; SI: S_ENDPGM
+define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_sub_ret_i64_offset:
+; SI: DS_SUB_RTN_U64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_dec_ret_i64:
+; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: DS_DEC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
+; SI: S_ENDPGM
+define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_dec_ret_i64_offset:
+; SI: DS_DEC_RTN_U64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_and_ret_i64:
+; SI: DS_AND_RTN_B64
+; SI: S_ENDPGM
+define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_and_ret_i64_offset:
+; SI: DS_AND_RTN_B64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_or_ret_i64:
+; SI: DS_OR_RTN_B64
+; SI: S_ENDPGM
+define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_or_ret_i64_offset:
+; SI: DS_OR_RTN_B64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xor_ret_i64:
+; SI: DS_XOR_RTN_B64
+; SI: S_ENDPGM
+define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_xor_ret_i64_offset:
+; SI: DS_XOR_RTN_B64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: @lds_atomic_nand_ret_i64:
+; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
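+;
+; (The same cmpxchg retry loop sketched in local-atomics.ll would expand this,
+; using i64 operands throughout.)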
+
+; FUNC-LABEL: @lds_atomic_min_ret_i64:
+; SI: DS_MIN_RTN_I64
+; SI: S_ENDPGM
+define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_min_ret_i64_offset:
+; SI: DS_MIN_RTN_I64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_max_ret_i64:
+; SI: DS_MAX_RTN_I64
+; SI: S_ENDPGM
+define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_max_ret_i64_offset:
+; SI: DS_MAX_RTN_I64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umin_ret_i64:
+; SI: DS_MIN_RTN_U64
+; SI: S_ENDPGM
+define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umin_ret_i64_offset:
+; SI: DS_MIN_RTN_U64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umax_ret_i64:
+; SI: DS_MAX_RTN_U64
+; SI: S_ENDPGM
+define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @lds_atomic_umax_ret_i64_offset:
+; SI: DS_MAX_RTN_U64 {{.*}} 0x20
+; SI: S_ENDPGM
+define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 1e42285..e29e4cc 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
-@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
; EG-CHECK: @local_memory_two_objects
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 6ebe41d..51af484 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=CI-CHECK %s
-@local_memory.local_mem = internal addrspace(3) unnamed_addr global [128 x i32] zeroinitializer, align 4
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] zeroinitializer, align 4
; EG-CHECK-LABEL: @local_memory
; SI-CHECK-LABEL: @local_memory
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 6ed754c..d231e92 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; mul24 and mad24 are affected
-;FUNC-LABEL: @test2
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @test2
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;FUNC-LABEL: @test4
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @test4
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +39,11 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
ret void
}
-; SI-CHECK-LABEL: @trunc_i64_mul_to_i32
-; SI-CHECK: S_LOAD_DWORD
-; SI-CHECK: S_LOAD_DWORD
-; SI-CHECK: V_MUL_LO_I32
-; SI-CHECK: BUFFER_STORE_DWORD
+; FUNC-LABEL: @trunc_i64_mul_to_i32
+; SI: S_LOAD_DWORD
+; SI: S_LOAD_DWORD
+; SI: V_MUL_LO_I32
+; SI: BUFFER_STORE_DWORD
define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%mul = mul i64 %b, %a
%trunc = trunc i64 %mul to i32
diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
new file mode 100644
index 0000000..ab82e7e
--- /dev/null
+++ b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=SI -o /dev/null %s
+; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
+
+@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
+
+; FUNC-LABEL: @load_extern_const_init
+define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
+ %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
+
+; FUNC-LABEL: @load_undef_const_init
+define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
+ %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 9878366..91a70b7 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -116,9 +116,9 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
}
; SI-LABEL: @trunc_i64_or_to_i32
-; SI: S_LOAD_DWORD [[SREG0:s[0-9]+]],
-; SI: S_LOAD_DWORD [[SREG1:s[0-9]+]],
-; SI: S_OR_B32 [[SRESULT:s[0-9]+]], [[SREG1]], [[SREG0]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
+; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: BUFFER_STORE_DWORD [[VRESULT]],
define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
index 4afaf68..8a269e0 100644
--- a/test/CodeGen/R600/parallelandifcollapse.ll
+++ b/test/CodeGen/R600/parallelandifcollapse.ll
@@ -7,6 +7,12 @@
; CHECK: AND_INT
; CHECK-NEXT: AND_INT
; CHECK-NEXT: OR_INT
+
+; FIXME: For some reason, having the allocas here allowed the flatten-cfg pass
+; to do its transformation; however, now that we are using local memory for
+; allocas, the transformation isn't happening.
+; XFAIL: *
+
define void @_Z9chk1D_512v() #0 {
entry:
%a0 = alloca i32, align 4
diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll
index b0db7cd..feca688 100644
--- a/test/CodeGen/R600/parallelorifcollapse.ll
+++ b/test/CodeGen/R600/parallelorifcollapse.ll
@@ -3,6 +3,11 @@
;
; CFG flattening should use parallel-or to generate branch conditions and
; then merge if-regions with the same bodies.
+
+; FIXME: For some reason, having the allocas here allowed the flatten-cfg pass
+; to do its transformation; however, now that we are using local memory for
+; allocas, the transformation isn't happening.
+; XFAIL: *
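+;
+; Illustrative sketch of the merge being tested (invented names; the two
+; if-regions share a body, so their conditions can be or'ed into one branch):
+;   %c0 = icmp ne i32 %a0, 0
+;   %c1 = icmp ne i32 %b0, 0
+;   %or = or i1 %c0, %c1
+;   br i1 %or, label %if.then, label %if.end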
;
; CHECK: OR_INT
; CHECK-NEXT: OR_INT
diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll
new file mode 100644
index 0000000..def4f9d
--- /dev/null
+++ b/test/CodeGen/R600/private-memory-atomics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s
+
+; This works because the promote-allocas pass replaces these with LDS atomics.
+
+; Private atomics have no real use, but we at least shouldn't crash on them.
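+;
+; Rough sketch of how the first test below is expected to be rewritten
+; (assuming the alloca is promoted into an LDS buffer; @promoted.buf and the
+; indexing are illustrative):
+;   %lds.gep = getelementptr [2 x i32] addrspace(3)* @promoted.buf, i32 0, i32 %in
+;   %old = atomicrmw add i32 addrspace(3)* %lds.gep, i32 7 acq_rel
+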
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel
+ store i32 %tmp4, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic
+ %val = extractvalue { i32, i1 } %tmp4, 0
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll
new file mode 100644
index 0000000..4086085
--- /dev/null
+++ b/test/CodeGen/R600/private-memory-broken.ll
@@ -0,0 +1,20 @@
+; RUN: not llc -verify-machineinstrs -march=r600 -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure the promote alloca pass doesn't crash.
+
+; CHECK: unsupported call
+
+declare i32 @foo(i32*) nounwind
+
+define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32]* %tmp, i32 0, i32 %in
+ %val = call i32 @foo(i32* %tmp3) nounwind
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index d3453f2..89122be 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,24 +1,19 @@
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
-; This test checks that uses and defs of the AR register happen in the same
-; instruction clause.
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: @mova_same_clause
-; R600-CHECK: MOVA_INT
-; R600-CHECK-NOT: ALU clause
-; R600-CHECK: 0 + AR.x
-; R600-CHECK: MOVA_INT
-; R600-CHECK-NOT: ALU clause
-; R600-CHECK: 0 + AR.x
-
-; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
-; SI-CHECK: V_MOVRELD
-; SI-CHECK: S_CBRANCH
-; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
-; SI-CHECK: V_MOVRELD
-; SI-CHECK: S_CBRANCH
+; R600-CHECK: LDS_WRITE
+; R600-CHECK: LDS_WRITE
+; R600-CHECK: LDS_READ
+; R600-CHECK: LDS_READ
+
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B32
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
@@ -114,12 +109,8 @@ for.end:
; FUNC-LABEL: @short_array
-; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
-; R600-CHECK: 65536
-; R600-CHECK: *
; R600-CHECK: MOVA_INT
-; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000
; SI-CHECK: V_MOVRELS_B32_e32
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
entry:
@@ -137,10 +128,7 @@ entry:
; FUNC-LABEL: @char_array
-; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
-; R600-CHECK: 256
-; R600-CHECK: *
-; R600-CHECK-NEXT: MOVA_INT
+; R600-CHECK: MOVA_INT
; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
; SI-CHECK: V_MOVRELS_B32_e32
@@ -185,7 +173,9 @@ entry:
; Test that two stack objects are not stored in the same register
; The second stack object should be in T3.X
; FUNC-LABEL: @no_overlap
-; R600-CHECK: MOV {{\** *}}T3.X
+; R600-CHECK: MOV
+; R600-CHECK: [[CHAN:[XYZW]]]+
+; R600-CHECK-NOT: [[CHAN]]+
; SI-CHECK: V_MOV_B32_e32 v3
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -211,6 +201,85 @@ entry:
ret void
}
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index f322bc7..55eb56d 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -103,7 +103,7 @@ main_body:
%95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
%96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
%97 = call float @fabs(float %96)
- %98 = call float @llvm.AMDGPU.rsq(float %97)
+ %98 = call float @llvm.AMDGPU.rsq.f32(float %97)
%99 = fmul float %4, %98
%100 = fmul float %5, %98
%101 = fmul float %6, %98
@@ -225,7 +225,7 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
declare float @fabs(float) #2
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #1
+declare float @llvm.AMDGPU.rsq.f32(float) #1
; Function Attrs: readnone
declare float @llvm.AMDIL.clamp.(float, float, float) #1
diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll
new file mode 100644
index 0000000..be2fcc6
--- /dev/null
+++ b/test/CodeGen/R600/reorder-stores.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @no_reorder_v2f64_global_load_store
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double> addrspace(1)* %x, align 16
+ %tmp4 = load <2 x double> addrspace(1)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
+; SI: DS_READ_B64
+; SI: DS_READ_B64
+; SI: DS_WRITE_B64
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double> addrspace(3)* %x, align 16
+ %tmp4 = load <2 x double> addrspace(3)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: @no_reorder_split_v8i32_global_load_store
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
+ %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
+ store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
+ store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
+ ret void
+}
+
+; SI-LABEL: @no_reorder_extload_64
+; SI: DS_READ_B64
+; SI: DS_READ_B64
+; SI: DS_WRITE_B64
+; SI-NOT: DS_READ
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
+ %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
+ %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
+ %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
+ %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
+ %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
+ store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
+ store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll
new file mode 100644
index 0000000..bda0b66
--- /dev/null
+++ b/test/CodeGen/R600/rotl.i64.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @s_rotl_i64:
+; SI: S_LSHL_B64
+; SI: S_SUB_I32
+; SI: S_LSHR_B64
+; SI: S_OR_B64
+define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+entry:
+ %0 = shl i64 %x, %y
+ %1 = sub i64 64, %y
+ %2 = lshr i64 %x, %1
+ %3 = or i64 %0, %2
+ store i64 %3, i64 addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @v_rotl_i64:
+; SI: V_LSHL_B64
+; SI: V_SUB_I32
+; SI: V_LSHR_B64
+; SI: V_OR_B32
+; SI: V_OR_B32
+define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+entry:
+ %x = load i64 addrspace(1)* %xptr, align 8
+ %y = load i64 addrspace(1)* %yptr, align 8
+ %tmp0 = shl i64 %x, %y
+ %tmp1 = sub i64 64, %y
+ %tmp2 = lshr i64 %x, %tmp1
+ %tmp3 = or i64 %tmp0, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll
new file mode 100644
index 0000000..83f657f
--- /dev/null
+++ b/test/CodeGen/R600/rotl.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @rotl_i32:
+; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
+; R600-NEXT: 32
+; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
+
+; SI: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
+define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+entry:
+ %0 = shl i32 %x, %y
+ %1 = sub i32 32, %y
+ %2 = lshr i32 %x, %1
+ %3 = or i32 %0, %2
+ store i32 %3, i32 addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @rotl_v2i32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+entry:
+ %0 = shl <2 x i32> %x, %y
+ %1 = sub <2 x i32> <i32 32, i32 32>, %y
+ %2 = lshr <2 x i32> %x, %1
+ %3 = or <2 x i32> %0, %2
+ store <2 x i32> %3, <2 x i32> addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @rotl_v4i32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+; SI: S_SUB_I32
+; SI: V_ALIGNBIT_B32
+define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+entry:
+ %0 = shl <4 x i32> %x, %y
+ %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
+ %2 = lshr <4 x i32> %x, %1
+ %3 = or <4 x i32> %0, %2
+ store <4 x i32> %3, <4 x i32> addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll
new file mode 100644
index 0000000..c264751
--- /dev/null
+++ b/test/CodeGen/R600/rotr.i64.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @s_rotr_i64
+; SI: S_LSHR_B64
+; SI: S_SUB_I32
+; SI: S_LSHL_B64
+; SI: S_OR_B64
+define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+entry:
+ %tmp0 = sub i64 64, %y
+ %tmp1 = shl i64 %x, %tmp0
+ %tmp2 = lshr i64 %x, %y
+ %tmp3 = or i64 %tmp1, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @v_rotr_i64
+; SI: V_LSHR_B64
+; SI: V_SUB_I32
+; SI: V_LSHL_B64
+; SI: V_OR_B32
+; SI: V_OR_B32
+define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+entry:
+ %x = load i64 addrspace(1)* %xptr, align 8
+ %y = load i64 addrspace(1)* %yptr, align 8
+ %tmp0 = sub i64 64, %y
+ %tmp1 = shl i64 %x, %tmp0
+ %tmp2 = lshr i64 %x, %y
+ %tmp3 = or i64 %tmp1, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @s_rotr_v2i64
+define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+entry:
+ %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
+ %tmp1 = shl <2 x i64> %x, %tmp0
+ %tmp2 = lshr <2 x i64> %x, %y
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @v_rotr_v2i64
+define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+entry:
+ %x = load <2 x i64> addrspace(1)* %xptr, align 8
+ %y = load <2 x i64> addrspace(1)* %yptr, align 8
+ %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
+ %tmp1 = shl <2 x i64> %x, %tmp0
+ %tmp2 = lshr <2 x i64> %x, %y
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index edf7aee..a5a4da4 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,37 +1,52 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; R600-CHECK-LABEL: @rotr:
-; R600-CHECK: BIT_ALIGN_INT
+; FUNC-LABEL: @rotr_i32:
+; R600: BIT_ALIGN_INT
-; SI-CHECK-LABEL: @rotr:
-; SI-CHECK: V_ALIGNBIT_B32
-define void @rotr(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+; SI: V_ALIGNBIT_B32
+define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
- %0 = sub i32 32, %y
- %1 = shl i32 %x, %0
- %2 = lshr i32 %x, %y
- %3 = or i32 %1, %2
- store i32 %3, i32 addrspace(1)* %in
+ %tmp0 = sub i32 32, %y
+ %tmp1 = shl i32 %x, %tmp0
+ %tmp2 = lshr i32 %x, %y
+ %tmp3 = or i32 %tmp1, %tmp2
+ store i32 %tmp3, i32 addrspace(1)* %in
ret void
}
-; R600-CHECK-LABEL: @rotl:
-; R600-CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
-; R600-CHECK-NEXT: 32
-; R600-CHECK: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
+; FUNC-LABEL: @rotr_v2i32:
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+; SI: V_ALIGNBIT_B32
+; SI: V_ALIGNBIT_B32
+define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+entry:
+ %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
+ %tmp1 = shl <2 x i32> %x, %tmp0
+ %tmp2 = lshr <2 x i32> %x, %y
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: @rotr_v4i32:
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
-; SI-CHECK-LABEL: @rotl:
-; SI-CHECK: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
-; SI-CHECK: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
-; SI-CHECK: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
-define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+; SI: V_ALIGNBIT_B32
+; SI: V_ALIGNBIT_B32
+; SI: V_ALIGNBIT_B32
+; SI: V_ALIGNBIT_B32
+define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
entry:
- %0 = shl i32 %x, %y
- %1 = sub i32 32, %y
- %2 = lshr i32 %x, %1
- %3 = or i32 %0, %2
- store i32 %3, i32 addrspace(1)* %in
+ %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
+ %tmp1 = shl <4 x i32> %x, %tmp0
+ %tmp2 = lshr <4 x i32> %x, %y
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in
ret void
}
diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
new file mode 100644
index 0000000..87c0570
--- /dev/null
+++ b/test/CodeGen/R600/rsq.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; SI-LABEL: @rsq_f32
+; SI: V_RSQ_F32_e32
+; SI: S_ENDPGM
+define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %val = load float addrspace(1)* %in, align 4
+ %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
+ %div = fdiv float 1.0, %sqrt
+ store float %div, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @rsq_f64
+; SI: V_RSQ_F64_e32
+; SI: S_ENDPGM
+define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+ %val = load double addrspace(1)* %in, align 4
+ %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
+ %div = fdiv double 1.0, %sqrt
+ store double %div, double addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll
new file mode 100644
index 0000000..c80480e
--- /dev/null
+++ b/test/CodeGen/R600/saddo.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: @saddo_i64_zext
+define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @s_saddo_i32
+define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %sadd, 0
+ %carry = extractvalue { i32, i1 } %sadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_saddo_i32
+define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %sadd, 0
+ %carry = extractvalue { i32, i1 } %sadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @s_saddo_i64
+define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_saddo_i64
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
+define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64 addrspace(1)* %aptr, align 4
+ %b = load i64 addrspace(1)* %bptr, align 4
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll
new file mode 100644
index 0000000..bcccb06
--- /dev/null
+++ b/test/CodeGen/R600/scalar_to_vector.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: @scalar_to_vector_v2i32
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: S_ENDPGM
+define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tmp1 = load i32 addrspace(1)* %in, align 4
+ %bc = bitcast i32 %tmp1 to <2 x i16>
+ %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @scalar_to_vector_v2f32
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: BUFFER_STORE_SHORT [[RESULT]]
+; SI: S_ENDPGM
+define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tmp1 = load float addrspace(1)* %in, align 4
+ %bc = bitcast float %tmp1 to <2 x i16>
+ %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
+; to produce one, but for some reason it never made it to selection.
+
+
+; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; %tmp1 = load i32 addrspace(1)* %in, align 4
+; %bc = bitcast i32 %tmp1 to <4 x i8>
+
+; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
+; ret void
+; }
+
+; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
+; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
+; %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
+; %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
+; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
+; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
+; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
index 3dd10c8..e922d5c 100644
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by sdiv is long and complex and may frequently change.
; The goal of this test is to make sure the ISel doesn't fail.
@@ -9,9 +10,9 @@
; This was fixed by adding an additional pattern in R600Instructions.td to
; match this pattern with a CNDGE_INT.
-; CHECK: CF_END
-
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+; FUNC-LABEL: @sdiv_i32
+; EG: CF_END
+define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
%num = load i32 addrspace(1) * %in
%den = load i32 addrspace(1) * %den_ptr
@@ -19,3 +20,84 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @sdiv_i32_4
+define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Divide by a weird constant so the magic-number multiply lowering is
+; used; this makes sure setIntDivIsCheap is working.
+
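+; Roughly, the expansion checked below computes (the shift amount s and the
+; exact operand order are assumptions from the standard magic-number
+; algorithm, not guaranteed by the test):
+;   %hi = mulhi_s32(%num, 0x98a1930b)     ; V_MUL_HI_I32
+;   %t = %hi + %num                       ; V_ADD_I32
+;   %result = (%t ashr s) + (%t lshr 31)  ; V_ASHRREV_I32, V_LSHRREV_B32, V_ADD_I32
+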
+; FUNC-LABEL: @slow_sdiv_i32_3435
+; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
+; SI: V_MOV_B32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: V_MUL_HI_I32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
+; SI: V_ADD_I32
+; SI: V_LSHRREV_B32
+; SI: V_ASHRREV_I32
+; SI: V_ADD_I32
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 3435
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32> addrspace(1) * %in
+ %result = sdiv <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32> addrspace(1) * %in
+ %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Tests for 64-bit divide bypass.
+; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %result = sdiv i64 %a, %b
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
+
+; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %result = srem i64 %a, %b
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
+
+; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %resultdiv = sdiv i64 %a, %b
+; %resultrem = srem i64 %a, %b
+; %result = add i64 %resultdiv, %resultrem
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll
index 4c50aa3..f796748 100644
--- a/test/CodeGen/R600/setcc-equivalent.ll
+++ b/test/CodeGen/R600/setcc-equivalent.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+; XFAIL: *
; EG-LABEL: @and_setcc_setcc_i32
; EG: AND_INT
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index c581d86..c7d5bf9 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -70,7 +70,7 @@ main_body:
%55 = fadd float %54, %53
%56 = fmul float %45, %45
%57 = fadd float %55, %56
- %58 = call float @llvm.AMDGPU.rsq(float %57)
+ %58 = call float @llvm.AMDGPU.rsq.f32(float %57)
%59 = fmul float %43, %58
%60 = fmul float %44, %58
%61 = fmul float %45, %58
@@ -212,7 +212,7 @@ declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #3
+declare float @llvm.AMDGPU.rsq.f32(float) #3
; Function Attrs: readnone
declare float @llvm.AMDIL.exp.(float) #3
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index 4a6aab4..43fab2a 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -39,5 +39,118 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
ret void
}
-; XXX: Add SI test for i64 shl once i64 stores and i64 function arguments are
-; supported.
+;EG-CHECK: @shl_i64
+;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG-CHECK: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG-CHECK: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-CHECK-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-CHECK-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
+;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
+
+;SI-CHECK: @shl_i64
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
+ %a = load i64 addrspace(1) * %in
+ %b = load i64 addrspace(1) * %b_ptr
+ %result = shl i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
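+
+; In outline, the 32-bit expansion matched by the EG checks above is (assuming
+; the literal subtracted from the shift amount is 31, with the extra shift by
+; one keeping the lo >> (32 - s) term defined at s == 0):
+;   lo' = lo << s
+;   hi' = (hi << s) | ((lo >> (31 - s)) >> 1)
+;   if (s > 31) { hi' = lo << (s - 32); lo' = 0 }   ; SETGT_UINT + CNDE_INT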
+
+;EG-CHECK: @shl_v2i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK: @shl_v2i64
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64> addrspace(1) * %in
+ %b = load <2 x i64> addrspace(1) * %b_ptr
+ %result = shl <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+;EG-CHECK: @shl_v4i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHC]]
+;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHD]]
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, 1
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHC]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHD]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHC]]
+;EG-CHECK-DAG: LSHL {{.*}}, [[SHD]]
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: LSHL
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK: @shl_v4i64
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64> addrspace(1) * %in
+ %b = load <4 x i64> addrspace(1) * %b_ptr
+ %result = shl <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index b34a757..53a0965 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -203,7 +203,7 @@ main_body:
%198 = fadd float %197, %196
%199 = fmul float %97, %97
%200 = fadd float %198, %199
- %201 = call float @llvm.AMDGPU.rsq(float %200)
+ %201 = call float @llvm.AMDGPU.rsq.f32(float %200)
%202 = fmul float %95, %201
%203 = fmul float %96, %201
%204 = fmul float %202, %29
@@ -384,7 +384,7 @@ IF67: ; preds = %LOOP65
%355 = fadd float %354, %353
%356 = fmul float %352, %352
%357 = fadd float %355, %356
- %358 = call float @llvm.AMDGPU.rsq(float %357)
+ %358 = call float @llvm.AMDGPU.rsq.f32(float %357)
%359 = fmul float %350, %358
%360 = fmul float %351, %358
%361 = fmul float %352, %358
@@ -512,7 +512,7 @@ IF67: ; preds = %LOOP65
%483 = fadd float %482, %481
%484 = fmul float %109, %109
%485 = fadd float %483, %484
- %486 = call float @llvm.AMDGPU.rsq(float %485)
+ %486 = call float @llvm.AMDGPU.rsq.f32(float %485)
%487 = fmul float %107, %486
%488 = fmul float %108, %486
%489 = fmul float %109, %486
@@ -541,7 +541,7 @@ IF67: ; preds = %LOOP65
%512 = fadd float %511, %510
%513 = fmul float %97, %97
%514 = fadd float %512, %513
- %515 = call float @llvm.AMDGPU.rsq(float %514)
+ %515 = call float @llvm.AMDGPU.rsq.f32(float %514)
%516 = fmul float %95, %515
%517 = fmul float %96, %515
%518 = fmul float %97, %515
@@ -658,7 +658,7 @@ declare i32 @llvm.SI.tid() #2
declare float @ceil(float) #3
; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #2
+declare float @llvm.AMDGPU.rsq.f32(float) #2
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
@@ -887,7 +887,7 @@ main_body:
%212 = fadd float %211, %210
%213 = fmul float %209, %209
%214 = fadd float %212, %213
- %215 = call float @llvm.AMDGPU.rsq(float %214)
+ %215 = call float @llvm.AMDGPU.rsq.f32(float %214)
%216 = fmul float %205, %215
%217 = fmul float %207, %215
%218 = fmul float %209, %215
@@ -1123,7 +1123,7 @@ IF189: ; preds = %LOOP
%434 = fsub float -0.000000e+00, %433
%435 = fadd float 0x3FF00068E0000000, %434
%436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
- %437 = call float @llvm.AMDGPU.rsq(float %436)
+ %437 = call float @llvm.AMDGPU.rsq.f32(float %436)
%438 = fmul float %437, %436
%439 = fsub float -0.000000e+00, %436
%440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
@@ -1147,7 +1147,7 @@ IF189: ; preds = %LOOP
%458 = fadd float %457, %456
%459 = fmul float %455, %455
%460 = fadd float %458, %459
- %461 = call float @llvm.AMDGPU.rsq(float %460)
+ %461 = call float @llvm.AMDGPU.rsq.f32(float %460)
%462 = fmul float %451, %461
%463 = fmul float %453, %461
%464 = fmul float %455, %461
@@ -1257,7 +1257,7 @@ ENDIF197: ; preds = %IF189, %IF198
%559 = fadd float %558, %557
%560 = fmul float %556, %556
%561 = fadd float %559, %560
- %562 = call float @llvm.AMDGPU.rsq(float %561)
+ %562 = call float @llvm.AMDGPU.rsq.f32(float %561)
%563 = fmul float %562, %561
%564 = fsub float -0.000000e+00, %561
%565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index 1212cee..e3bee50 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -1,12 +1,61 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; SI-LABEL: @s_sext_i1_to_i32:
+; SI: V_CNDMASK_B32_e64
+; SI: S_ENDPGM
+define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %sext = sext i1 %cmp to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
-; CHECK: V_ASHR
-define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+; SI-LABEL: @test:
+; SI: V_ASHR
+; SI: S_ENDPGM
+define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
entry:
- %0 = mul i32 %a, %b
- %1 = add i32 %0, %c
- %2 = sext i32 %1 to i64
- store i64 %2, i64 addrspace(1)* %out
+ %mul = mul i32 %a, %b
+ %add = add i32 %mul, %c
+ %sext = sext i32 %add to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @s_sext_i1_to_i64:
+; SI: V_CNDMASK_B32_e64
+; SI: V_CNDMASK_B32_e64
+; SI: S_ENDPGM
+define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %sext = sext i1 %cmp to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @s_sext_i32_to_i64:
+; SI: S_ASHR_I32
+; SI: S_ENDPGM
+define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+ %sext = sext i32 %a to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @v_sext_i32_to_i64:
+; SI: V_ASHR
+; SI: S_ENDPGM
+define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32 addrspace(1)* %in, align 4
+ %sext = sext i32 %val to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @s_sext_i16_to_i64:
+; SI: S_ENDPGM
+define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+ %sext = sext i16 %a to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
index d9f60ea..dee4326 100644
--- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; XFAIL: *
+
; 64-bit select was originally lowered with a build_pair, and this
; could be simplified to 1 cndmask instead of 2, but that broken when
; it started being implemented with a v2i32 build_vector and
@@ -12,9 +14,10 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
ret void
}
+; FIXME: Fix truncating store for local memory
; SI-LABEL: @trunc_load_alloca_i64:
-; SI: V_MOVRELS_B32
-; SI-NOT: V_MOVRELS_B32
+; SI: DS_READ_B32
+; SI-NOT: DS_READ_B64
; SI: S_ENDPGM
define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
%idx = add i32 %a, %b
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 9241799..b27dfda 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -29,3 +29,25 @@ define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @sint_to_fp_i1_f32:
+; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00, [[CMP]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = sitofp i1 %cmp to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @sint_to_fp_i1_f32_load:
+; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
+ %fp = sitofp i1 %in to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/sint_to_fp64.ll b/test/CodeGen/R600/sint_to_fp64.ll
index 5abc9d1..12b8cf5 100644
--- a/test/CodeGen/R600/sint_to_fp64.ll
+++ b/test/CodeGen/R600/sint_to_fp64.ll
@@ -1,9 +1,35 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; CHECK: @sint_to_fp64
-; CHECK: V_CVT_F64_I32_e32
+; SI: @sint_to_fp64
+; SI: V_CVT_F64_I32_e32
define void @sint_to_fp64(double addrspace(1)* %out, i32 %in) {
%result = sitofp i32 %in to double
store double %result, double addrspace(1)* %out
ret void
}
+
+; SI-LABEL: @sint_to_fp_i1_f64:
+; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; FIXME: The VGPR sources for V_CNDMASK are copied from SGPRs;
+; we should be able to fold the SGPRs into the V_CNDMASK instructions.
+; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = sitofp i1 %cmp to double
+ store double %fp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @sint_to_fp_i1_f64_load:
+; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, -1
+; SI-NEXT: V_CVT_F64_I32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
+; SI: S_ENDPGM
+define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+ %fp = sitofp i1 %in to double
+ store double %fp, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index fe9df10..9eb3dc5 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -52,3 +52,133 @@ entry:
ret void
}
+;EG-CHECK-LABEL: @ashr_i64_2
+;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-CHECK-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-CHECK-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-CHECK-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-CHECK-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+
+;SI-CHECK-LABEL: @ashr_i64_2
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
+ %a = load i64 addrspace(1) * %in
+ %b = load i64 addrspace(1) * %b_ptr
+ %result = ashr i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;EG-CHECK-LABEL: @ashr_v2i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK-LABEL: @ashr_v2i64
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64> addrspace(1) * %in
+ %b = load <2 x i64> addrspace(1) * %b_ptr
+ %result = ashr <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+;EG-CHECK-LABEL: @ashr_v4i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHC]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHD]]
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHC]]
+;EG-CHECK-DAG: ASHR {{.*}}, [[SHD]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: ASHR {{.*}}, literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK-LABEL: @ashr_v4i64
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64> addrspace(1) * %in
+ %b = load <4 x i64> addrspace(1) * %b_ptr
+ %result = ashr <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/R600/srem.ll b/test/CodeGen/R600/srem.ll
new file mode 100644
index 0000000..65e3395
--- /dev/null
+++ b/test/CodeGen/R600/srem.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=r600 -mcpu=redwood < %s
+
+define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %num = load i32 addrspace(1) * %in
+ %den = load i32 addrspace(1) * %den_ptr
+ %result = srem i32 %num, %den
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32 addrspace(1) * %in
+ %result = srem i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32> addrspace(1) * %den_ptr
+ %result = srem <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32> addrspace(1) * %in
+ %result = srem <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32> addrspace(1) * %den_ptr
+ %result = srem <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32> addrspace(1) * %in
+ %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 7637355..44ad73f 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -39,3 +39,129 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+;EG-CHECK: @lshr_i64
+;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-CHECK-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-CHECK-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-CHECK-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-CHECK-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
+
+;SI-CHECK: @lshr_i64
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
+ %a = load i64 addrspace(1) * %in
+ %b = load i64 addrspace(1) * %b_ptr
+ %result = lshr i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;EG-CHECK: @lshr_v2i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK: @lshr_v2i64
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64> addrspace(1) * %in
+ %b = load <2 x i64> addrspace(1) * %b_ptr
+ %result = lshr <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+
+;EG-CHECK: @lshr_v4i64
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHC]]
+;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHD]]
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHL {{.*}}, 1
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHC]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHD]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHA]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHB]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHC]]
+;EG-CHECK-DAG: LSHR {{.*}}, [[SHD]]
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: OR_INT
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: LSHR
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-CHECK-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT {{.*}}, 0.0
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+;EG-CHECK-DAG: CNDE_INT
+
+;SI-CHECK: @lshr_v4i64
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64> addrspace(1) * %in
+ %b = load <4 x i64> addrspace(1) * %b_ptr
+ %result = lshr <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll
new file mode 100644
index 0000000..b330276
--- /dev/null
+++ b/test/CodeGen/R600/ssubo.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: @ssubo_i64_zext
+define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @s_ssubo_i32
+define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %ssub, 0
+ %carry = extractvalue { i32, i1 } %ssub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_ssubo_i32
+define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %ssub, 0
+ %carry = extractvalue { i32, i1 } %ssub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @s_ssubo_i64
+; SI: S_SUB_I32
+; SI: S_SUBB_U32
+define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_ssubo_i64
+; SI: V_SUB_I32_e32
+; SI: V_SUBB_U32_e32
+define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64 addrspace(1)* %aptr, align 4
+ %b = load i64 addrspace(1)* %bptr, align 4
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index c0c8ccc..dd27533 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -263,8 +263,7 @@ entry:
; CM-CHECK: LDS_WRITE
; CM-CHECK: LDS_WRITE
; SI-CHECK-LABEL: @store_local_v2i32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B64
define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
entry:
store <2 x i32> %in, <2 x i32> addrspace(3)* %out
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index e321ed6..8e64148 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
;FUNC-LABEL: @test2
;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -37,23 +39,37 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
ret void
}
-;FUNC_LABEL: @test5
+; FUNC-LABEL: @s_sub_i64:
+; SI: S_SUB_I32
+; SI: S_SUBB_U32
-;EG-DAG: SETGE_UINT
-;EG-DAG: CNDE_INT
-;EG-DAG: SUB_INT
-;EG-DAG: SUB_INT
-;EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: CNDE_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+ %result = sub i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
-;SI: S_XOR_B64
-;SI-DAG: S_ADD_I32
-;SI-DAG: S_ADDC_U32
-;SI-DAG: S_ADD_I32
-;SI-DAG: S_ADDC_U32
+; FUNC-LABEL: @v_sub_i64:
+; SI: V_SUB_I32_e32
+; SI: V_SUBB_U32_e32
-define void @test5(i64 addrspace(1)* %out, i64 %a, i64 %b) {
-entry:
- %0 = sub i64 %a, %b
- store i64 %0, i64 addrspace(1)* %out
+; EG-DAG: SETGE_UINT
+; EG-DAG: CNDE_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
+ %a = load i64 addrspace(1)* %a_ptr
+ %b = load i64 addrspace(1)* %b_ptr
+ %result = sub i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out, align 8
ret void
}
diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
index 3b69687..a80e502 100644
--- a/test/CodeGen/R600/uaddo.ll
+++ b/test/CodeGen/R600/uaddo.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-; SI-LABEL: @uaddo_i64_zext
+; FUNC-LABEL: @uaddo_i64_zext
; SI: ADD
; SI: ADDC
; SI: ADDC
@@ -15,3 +17,53 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
store i64 %add2, i64 addrspace(1)* %out, align 8
ret void
}
+
+; FUNC-LABEL: @s_uaddo_i32
+; SI: S_ADD_I32
+define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_uaddo_i32
+; SI: V_ADD_I32
+define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @s_uaddo_i64
+; SI: S_ADD_I32
+; SI: S_ADDC_U32
+define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_uaddo_i64
+; SI: V_ADD_I32
+; SI: V_ADDC_U32
+define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64 addrspace(1)* %aptr, align 4
+ %b = load i64 addrspace(1)* %bptr, align 4
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
new file mode 100644
index 0000000..5f5753a
--- /dev/null
+++ b/test/CodeGen/R600/udivrem.ll
@@ -0,0 +1,358 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+; FUNC-LABEL: @test_udivrem
+; EG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG: CNDE_INT
+; EG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG: CNDE_INT
+; EG: MULHI
+; EG: MULLO_INT
+; EG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: V_RCP_IFLAG_F32_e32 [[RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[RCP_HI:v[0-9]+]], [[RCP]]
+; SI-DAG: V_MUL_LO_I32 [[RCP_LO:v[0-9]+]], [[RCP]]
+; SI-DAG: V_SUB_I32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
+; SI: V_CNDMASK_B32_e64
+; SI: V_MUL_HI_U32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
+; SI-DAG: V_ADD_I32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
+; SI: V_CNDMASK_B32_e64
+; SI: V_MUL_HI_U32 [[Quotient:v[0-9]+]]
+; SI: V_MUL_LO_I32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI: V_AND_B32_e32 [[Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI: S_ENDPGM
+define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+ %result0 = udiv i32 %x, %y
+ store i32 %result0, i32 addrspace(1)* %out
+ %result1 = urem i32 %x, %y
+ store i32 %result1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @test_udivrem_v2
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI: S_ENDPGM
+define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+ %result0 = udiv <2 x i32> %x, %y
+ store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
+ %result1 = urem <2 x i32> %x, %y
+ store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: @test_udivrem_v4
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[THIRD_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[THIRD_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[THIRD_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[THIRD_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_RCP_IFLAG_F32_e32 [[FOURTH_RCP:v[0-9]+]]
+; SI-DAG: V_MUL_HI_U32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: V_MUL_LO_I32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: V_SUB_I32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
+; SI-DAG: V_ADD_I32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
+; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_MUL_HI_U32 [[FOURTH_Quotient:v[0-9]+]]
+; SI-DAG: V_MUL_LO_I32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: V_SUB_I32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_AND_B32_e32 [[FOURTH_Tmp1:v[0-9]+]]
+; SI-DAG: V_ADD_I32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
+; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_ADD_I32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: V_CNDMASK_B32_e64
+; SI-DAG: V_CNDMASK_B32_e64
+; SI: S_ENDPGM
+define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+ %result0 = udiv <4 x i32> %x, %y
+ store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
+ %result1 = urem <4 x i32> %x, %y
+ store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index 75150c2..9a41796 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -2,8 +2,35 @@
; SI-LABEL: @uint_to_fp_f64_i32
; SI: V_CVT_F64_U32_e32
+; SI: S_ENDPGM
define void @uint_to_fp_f64_i32(double addrspace(1)* %out, i32 %in) {
%cast = uitofp i32 %in to double
store double %cast, double addrspace(1)* %out, align 8
ret void
}
+
+; SI-LABEL: @uint_to_fp_i1_f64:
+; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; FIXME: Since the VGPR sources for V_CNDMASK are copied from SGPRs,
+; we should be able to fold the SGPRs into the V_CNDMASK instructions.
+; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @uint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = uitofp i1 %cmp to double
+ store double %fp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @uint_to_fp_i1_f64_load:
+; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, 1
+; SI-NEXT: V_CVT_F64_U32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
+; SI: S_ENDPGM
+define void @uint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+ %fp = uitofp i1 %in to double
+ store double %fp, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index a5ac355..8f5d42d 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,28 +1,30 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; R600-CHECK-LABEL: @uint_to_fp_v2i32
-; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-; SI-CHECK-LABEL: @uint_to_fp_v2i32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_CVT_F32_U32_e32
+; FUNC-LABEL: @uint_to_fp_v2i32
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+
+; SI: V_CVT_F32_U32_e32
+; SI: V_CVT_F32_U32_e32
+; SI: S_ENDPGM
define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
%result = uitofp <2 x i32> %in to <2 x float>
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
-; R600-CHECK-LABEL: @uint_to_fp_v4i32
-; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK-LABEL: @uint_to_fp_v4i32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_CVT_F32_U32_e32
+; FUNC-LABEL: @uint_to_fp_v4i32
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: V_CVT_F32_U32_e32
+; SI: V_CVT_F32_U32_e32
+; SI: V_CVT_F32_U32_e32
+; SI: V_CVT_F32_U32_e32
+; SI: S_ENDPGM
define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%value = load <4 x i32> addrspace(1) * %in
%result = uitofp <4 x i32> %value to <4 x float>
@@ -30,17 +32,39 @@ define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
ret void
}
-; R600-CHECK-LABEL: @uint_to_fp_i64_f32
-; R600-CHECK: UINT_TO_FLT
-; R600-CHECK: UINT_TO_FLT
-; R600-CHECK: MULADD_IEEE
-; SI-CHECK-LABEL: @uint_to_fp_i64_f32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_CVT_F32_U32_e32
-; SI-CHECK: V_MAD_F32
+; FUNC-LABEL: @uint_to_fp_i64_f32
+; R600: UINT_TO_FLT
+; R600: UINT_TO_FLT
+; R600: MULADD_IEEE
+; SI: V_CVT_F32_U32_e32
+; SI: V_CVT_F32_U32_e32
+; SI: V_MAD_F32
+; SI: S_ENDPGM
define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
entry:
%0 = uitofp i64 %in to float
store float %0, float addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @uint_to_fp_i1_f32:
+; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00, [[CMP]]
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @uint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = uitofp i1 %cmp to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @uint_to_fp_i1_f32_load:
+; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00
+; SI: BUFFER_STORE_DWORD [[RESULT]],
+; SI: S_ENDPGM
+define void @uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
+ %fp = uitofp i1 %in to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
new file mode 100644
index 0000000..d57a2c7
--- /dev/null
+++ b/test/CodeGen/R600/usubo.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: @usubo_i64_zext
+define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @s_usubo_i32
+; SI: S_SUB_I32
+define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_usubo_i32
+; SI: V_SUBREV_I32_e32
+define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32 addrspace(1)* %aptr, align 4
+ %b = load i32 addrspace(1)* %bptr, align 4
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @s_usubo_i64
+; SI: S_SUB_I32
+; SI: S_SUBB_U32
+define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: @v_usubo_i64
+; SI: V_SUB_I32
+; SI: V_SUBB_U32
+define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64 addrspace(1)* %aptr, align 4
+ %b = load i64 addrspace(1)* %bptr, align 4
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
new file mode 100644
index 0000000..6543f6d
--- /dev/null
+++ b/test/CodeGen/R600/vector-alloca.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: @vector_read
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 1, i32* %y
+ store i32 2, i32* %z
+ store i32 3, i32* %w
+ %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
+ %2 = load i32* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @vector_write
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+; EG: MOVA_INT
+define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index
+ store i32 1, i32* %1
+ %2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index
+ %3 = load i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test should be optimized to:
+; store i32 0, i32 addrspace(1)* %out
+; FUNC-LABEL: @bitcast_gep
+; EG: STORE_RAW
+define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32]* %0, i32 0, i32 1
+ %2 = bitcast i32* %1 to [4 x i32]*
+ %3 = getelementptr [4 x i32]* %2, i32 0, i32 0
+ %4 = load i32* %3
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index 5a5c86d..ab618cf 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -90,3 +90,69 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; SI-CHECK-LABEL: @vector_xor_i64
+; SI-CHECK: V_XOR_B32_e32
+; SI-CHECK: V_XOR_B32_e32
+; SI-CHECK: S_ENDPGM
+define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+ %a = load i64 addrspace(1)* %in0
+ %b = load i64 addrspace(1)* %in1
+ %result = xor i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @scalar_xor_i64
+; SI-CHECK: S_XOR_B64
+; SI-CHECK: S_ENDPGM
+define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = xor i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @scalar_not_i64
+; SI-CHECK: S_NOT_B64
+define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = xor i64 %a, -1
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-CHECK-LABEL: @vector_not_i64
+; SI-CHECK: V_NOT_B32
+; SI-CHECK: V_NOT_B32
+define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+ %a = load i64 addrspace(1)* %in0
+ %b = load i64 addrspace(1)* %in1
+ %result = xor i64 %a, -1
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Test that we have a pattern to match xor inside a branch.
+; Note that in the future the backend may be smart enough to
+; use an SALU instruction for this.
+
+; SI-CHECK-LABEL: @xor_cf
+; SI-CHECK: V_XOR
+; SI-CHECK: V_XOR
+define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = xor i64 %a, %b
+ br label %endif
+
+else:
+ %2 = load i64 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/SPARC/atomics.ll b/test/CodeGen/SPARC/atomics.ll
index 5e41300..ee6c1f8 100644
--- a/test/CodeGen/SPARC/atomics.ll
+++ b/test/CodeGen/SPARC/atomics.ll
@@ -38,7 +38,8 @@ entry:
define i32 @test_cmpxchg_i32(i32 %a, i32* %ptr) {
entry:
- %b = cmpxchg i32* %ptr, i32 %a, i32 123 monotonic monotonic
+ %pair = cmpxchg i32* %ptr, i32 %a, i32 123 monotonic monotonic
+ %b = extractvalue { i32, i1 } %pair, 0
ret i32 %b
}
@@ -48,7 +49,8 @@ entry:
define i64 @test_cmpxchg_i64(i64 %a, i64* %ptr) {
entry:
- %b = cmpxchg i64* %ptr, i64 %a, i64 123 monotonic monotonic
+ %pair = cmpxchg i64* %ptr, i64 %a, i64 123 monotonic monotonic
+ %b = extractvalue { i64, i1 } %pair, 0
ret i64 %b
}
diff --git a/test/CodeGen/SPARC/lit.local.cfg b/test/CodeGen/SPARC/lit.local.cfg
index 4d344fa..fa6a54e 100644
--- a/test/CodeGen/SPARC/lit.local.cfg
+++ b/test/CodeGen/SPARC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Sparc' in targets:
+if not 'Sparc' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/SystemZ/Large/lit.local.cfg b/test/CodeGen/SystemZ/Large/lit.local.cfg
index 9a02f84..4f22a97 100644
--- a/test/CodeGen/SystemZ/Large/lit.local.cfg
+++ b/test/CodeGen/SystemZ/Large/lit.local.cfg
@@ -5,6 +5,5 @@ config.suffixes = ['.py']
if config.root.host_arch not in ['SystemZ']:
config.unsupported = True
-targets = set(config.root.targets_to_build.split())
-if not 'SystemZ' in targets:
+if not 'SystemZ' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/SystemZ/cmpxchg-01.ll b/test/CodeGen/SystemZ/cmpxchg-01.ll
index bb0b18a..5118aad 100644
--- a/test/CodeGen/SystemZ/cmpxchg-01.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-01.ll
@@ -32,7 +32,8 @@ define i8 @f1(i8 %dummy, i8 *%src, i8 %cmp, i8 %swap) {
; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -8([[NEGSHIFT]])
- %res = cmpxchg i8 *%src, i8 %cmp, i8 %swap seq_cst seq_cst
+ %pair = cmpxchg i8 *%src, i8 %cmp, i8 %swap seq_cst seq_cst
+ %res = extractvalue { i8, i1 } %pair, 0
ret i8 %res
}
@@ -50,6 +51,7 @@ define i8 @f2(i8 *%src) {
; CHECK-SHIFT: risbg
; CHECK-SHIFT: risbg [[SWAP]], {{%r[0-9]+}}, 32, 55, 0
; CHECK-SHIFT: br %r14
- %res = cmpxchg i8 *%src, i8 42, i8 88 seq_cst seq_cst
+ %pair = cmpxchg i8 *%src, i8 42, i8 88 seq_cst seq_cst
+ %res = extractvalue { i8, i1 } %pair, 0
ret i8 %res
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-02.ll b/test/CodeGen/SystemZ/cmpxchg-02.ll
index 8d46a8c..9eb0628 100644
--- a/test/CodeGen/SystemZ/cmpxchg-02.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-02.ll
@@ -32,7 +32,8 @@ define i16 @f1(i16 %dummy, i16 *%src, i16 %cmp, i16 %swap) {
; CHECK-SHIFT: lcr [[NEGSHIFT:%r[1-9]+]], [[SHIFT]]
; CHECK-SHIFT: rll
; CHECK-SHIFT: rll {{%r[0-9]+}}, %r5, -16([[NEGSHIFT]])
- %res = cmpxchg i16 *%src, i16 %cmp, i16 %swap seq_cst seq_cst
+ %pair = cmpxchg i16 *%src, i16 %cmp, i16 %swap seq_cst seq_cst
+ %res = extractvalue { i16, i1 } %pair, 0
ret i16 %res
}
@@ -50,6 +51,7 @@ define i16 @f2(i16 *%src) {
; CHECK-SHIFT: risbg
; CHECK-SHIFT: risbg [[SWAP]], {{%r[0-9]+}}, 32, 47, 0
; CHECK-SHIFT: br %r14
- %res = cmpxchg i16 *%src, i16 42, i16 88 seq_cst seq_cst
+ %pair = cmpxchg i16 *%src, i16 42, i16 88 seq_cst seq_cst
+ %res = extractvalue { i16, i1 } %pair, 0
ret i16 %res
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-03.ll b/test/CodeGen/SystemZ/cmpxchg-03.ll
index f6a2ad0..c5fab4d 100644
--- a/test/CodeGen/SystemZ/cmpxchg-03.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-03.ll
@@ -7,7 +7,8 @@ define i32 @f1(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK-LABEL: f1:
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%src, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -17,7 +18,8 @@ define i32 @f2(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 4092(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 1023
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -27,7 +29,8 @@ define i32 @f3(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, 4096(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 1024
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -37,7 +40,8 @@ define i32 @f4(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, 524284(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 131071
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -49,7 +53,8 @@ define i32 @f5(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 131072
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -59,7 +64,8 @@ define i32 @f6(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, -4(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -1
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -69,7 +75,8 @@ define i32 @f7(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: csy %r2, %r3, -524288(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -131072
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -81,7 +88,8 @@ define i32 @f8(i32 %cmp, i32 %swap, i32 *%src) {
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i32 *%src, i64 -131073
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -93,7 +101,8 @@ define i32 @f9(i32 %cmp, i32 %swap, i64 %src, i64 %index) {
; CHECK: br %r14
%add1 = add i64 %src, %index
%ptr = inttoptr i64 %add1 to i32 *
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -106,7 +115,8 @@ define i32 @f10(i32 %cmp, i32 %swap, i64 %src, i64 %index) {
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to i32 *
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -116,7 +126,8 @@ define i32 @f11(i32 %dummy, i32 %swap, i32 *%ptr) {
; CHECK: lhi %r2, 1001
; CHECK: cs %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i32 *%ptr, i32 1001, i32 %swap seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 1001, i32 %swap seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
@@ -126,6 +137,7 @@ define i32 @f12(i32 %cmp, i32 *%ptr) {
; CHECK: lhi [[SWAP:%r[0-9]+]], 1002
; CHECK: cs %r2, [[SWAP]], 0(%r3)
; CHECK: br %r14
- %val = cmpxchg i32 *%ptr, i32 %cmp, i32 1002 seq_cst seq_cst
+ %pair = cmpxchg i32 *%ptr, i32 %cmp, i32 1002 seq_cst seq_cst
+ %val = extractvalue { i32, i1 } %pair, 0
ret i32 %val
}
diff --git a/test/CodeGen/SystemZ/cmpxchg-04.ll b/test/CodeGen/SystemZ/cmpxchg-04.ll
index 069bad6..ba1493e 100644
--- a/test/CodeGen/SystemZ/cmpxchg-04.ll
+++ b/test/CodeGen/SystemZ/cmpxchg-04.ll
@@ -7,7 +7,8 @@ define i64 @f1(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK-LABEL: f1:
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i64 *%src, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%src, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -17,7 +18,8 @@ define i64 @f2(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 524280(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 65535
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -29,7 +31,8 @@ define i64 @f3(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 65536
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -39,7 +42,8 @@ define i64 @f4(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, -8(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -1
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -49,7 +53,8 @@ define i64 @f5(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, -524288(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -65536
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -61,7 +66,8 @@ define i64 @f6(i64 %cmp, i64 %swap, i64 *%src) {
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
%ptr = getelementptr i64 *%src, i64 -65537
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -73,7 +79,8 @@ define i64 @f7(i64 %cmp, i64 %swap, i64 %src, i64 %index) {
; CHECK: br %r14
%add1 = add i64 %src, %index
%ptr = inttoptr i64 %add1 to i64 *
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -83,7 +90,8 @@ define i64 @f8(i64 %dummy, i64 %swap, i64 *%ptr) {
; CHECK: lghi %r2, 1001
; CHECK: csg %r2, %r3, 0(%r4)
; CHECK: br %r14
- %val = cmpxchg i64 *%ptr, i64 1001, i64 %swap seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 1001, i64 %swap seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
@@ -93,6 +101,7 @@ define i64 @f9(i64 %cmp, i64 *%ptr) {
; CHECK: lghi [[SWAP:%r[0-9]+]], 1002
; CHECK: csg %r2, [[SWAP]], 0(%r3)
; CHECK: br %r14
- %val = cmpxchg i64 *%ptr, i64 %cmp, i64 1002 seq_cst seq_cst
+ %pairval = cmpxchg i64 *%ptr, i64 %cmp, i64 1002 seq_cst seq_cst
+ %val = extractvalue { i64, i1 } %pairval, 0
ret i64 %val
}
diff --git a/test/CodeGen/SystemZ/lit.local.cfg b/test/CodeGen/SystemZ/lit.local.cfg
index b12af09..5c02dd3 100644
--- a/test/CodeGen/SystemZ/lit.local.cfg
+++ b/test/CodeGen/SystemZ/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'SystemZ' in targets:
+if not 'SystemZ' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
new file mode 100644
index 0000000..ae66369
--- /dev/null
+++ b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
+; XFAIL: *
+
+define void @foo(i32* %A) #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: push {r7, lr}
+; CHECK: ldm [[REG0:r[0-9]]]!,
+; CHECK-NEXT: subs [[REG0]]
+; CHECK-NEXT: bl
+ %0 = load i32* %A, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i32 1
+ %1 = load i32* %arrayidx1, align 4
+ tail call void @bar(i32* %A, i32 %0, i32 %1) #2
+ ret void
+}
+
+declare void @bar(i32*, i32, i32) #1
diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll
index 6bc39af..6c6de55 100644
--- a/test/CodeGen/Thumb/dyn-stackalloc.ll
+++ b/test/CodeGen/Thumb/dyn-stackalloc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
@@ -45,8 +45,7 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) {
; CHECK: sub sp, #
; CHECK: mov r[[R0:[0-9]+]], sp
; CHECK: str r{{[0-9+]}}, [r[[R0]]
-; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]]
-; RA_BASIC: stm r[[R0]]!
+; CHECK: str r{{[0-9+]}}, [r[[R0]]
; CHECK-NOT: ldr r0, [sp
; CHECK: mov r[[R1:[0-9]+]], sp
; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}}
diff --git a/test/CodeGen/Thumb/fastcc.ll b/test/CodeGen/Thumb/fastcc.ll
new file mode 100644
index 0000000..98ff684
--- /dev/null
+++ b/test/CodeGen/Thumb/fastcc.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mcpu=arm926ej-s -mattr=+vfp2
+
+; This is a regression test ensuring that fastcc functions are correctly
+; handled when compiling for a processor whose floating-point unit is not
+; accessible from the selected instruction set.
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv5e-none-linux-gnueabi"
+
+; Function Attrs: optsize
+define fastcc void @_foo(float %walpha) #0 {
+entry:
+ br label %for.body13
+
+for.body13: ; preds = %for.body13, %entry
+ br i1 undef, label %for.end182.critedge, label %for.body13
+
+for.end182.critedge: ; preds = %for.body13
+ %conv183 = fpext float %walpha to double
+ %mul184 = fmul double %conv183, 8.200000e-01
+ %conv185 = fptrunc double %mul184 to float
+ %conv188 = fpext float %conv185 to double
+ %mul189 = fmul double %conv188, 6.000000e-01
+ %conv190 = fptrunc double %mul189 to float
+ br label %for.body193
+
+for.body193: ; preds = %for.body193, %for.end182.critedge
+ %mul195 = fmul float %conv190, undef
+ br label %for.body193
+}
+
+attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 "}
diff --git a/test/CodeGen/Thumb/lit.local.cfg b/test/CodeGen/Thumb/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/CodeGen/Thumb/lit.local.cfg
+++ b/test/CodeGen/Thumb/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Thumb/thumb-ldm.ll b/test/CodeGen/Thumb/thumb-ldm.ll
index dd98e6f..95f3edc 100644
--- a/test/CodeGen/Thumb/thumb-ldm.ll
+++ b/test/CodeGen/Thumb/thumb-ldm.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
+; XFAIL: *
@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
index 06cfd9b..dedc82b 100644
--- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
+++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
+; XFAIL: *
@d = external global [64 x i32]
@s = external global [64 x i32]
diff --git a/test/CodeGen/Thumb2/2009-08-01-WrongLDRBOpc.ll b/test/CodeGen/Thumb2/2009-08-01-WrongLDRBOpc.ll
index e014453..09e0ed1 100644
--- a/test/CodeGen/Thumb2/2009-08-01-WrongLDRBOpc.ll
+++ b/test/CodeGen/Thumb2/2009-08-01-WrongLDRBOpc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -arm-atomic-cfg-tidy=0 | FileCheck %s
@csize = external global [100 x [20 x [4 x i8]]] ; <[100 x [20 x [4 x i8]]]*> [#uses=1]
@vsize = external global [100 x [20 x [4 x i8]]] ; <[100 x [20 x [4 x i8]]]*> [#uses=1]
diff --git a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
index 940cfd1..c8eac8d 100644
--- a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
+++ b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4659
; PR4682
diff --git a/test/CodeGen/Thumb2/2009-09-28-ITBlockBug.ll b/test/CodeGen/Thumb2/2009-09-28-ITBlockBug.ll
index 52066d3..a9a2478 100644
--- a/test/CodeGen/Thumb2/2009-09-28-ITBlockBug.ll
+++ b/test/CodeGen/Thumb2/2009-09-28-ITBlockBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -disable-cgp-branch-opts | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -disable-cgp-branch-opts -arm-atomic-cfg-tidy=0 | FileCheck %s
%struct.pix_pos = type { i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
index 1b8bdb1..8beb5b1 100644
--- a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
+++ b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s
; rdar://7493908
; Make sure the result of the first dynamic_alloc isn't copied back to sp more
diff --git a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
index 810bfb7..f3046e1 100644
--- a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
+++ b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O3 -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O3 -relocation-model=pic -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://8115404
; Tail merging must not split an IT block.
diff --git a/test/CodeGen/Thumb2/2010-11-22-EpilogueBug.ll b/test/CodeGen/Thumb2/2010-11-22-EpilogueBug.ll
index 75f5439..3d89390 100644
--- a/test/CodeGen/Thumb2/2010-11-22-EpilogueBug.ll
+++ b/test/CodeGen/Thumb2/2010-11-22-EpilogueBug.ll
@@ -1,5 +1,5 @@
; rdar://8465407
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
%struct.buf = type opaque
diff --git a/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
index b1ce3bb..240df83 100644
--- a/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
+++ b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-apple-darwin10 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s
%struct.op = type { %struct.op*, %struct.op*, %struct.op* ()*, i32, i16, i16, i8, i8 }
diff --git a/test/CodeGen/Thumb2/buildvector-crash.ll b/test/CodeGen/Thumb2/buildvector-crash.ll
index 8a3c895..16e2298 100644
--- a/test/CodeGen/Thumb2/buildvector-crash.ll
+++ b/test/CodeGen/Thumb2/buildvector-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 | FileCheck %s
; Formerly crashed, 3573915.
define void @RotateStarsFP_Vec() nounwind {
diff --git a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
index a9f948c..88c7f0f 100644
--- a/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
+++ b/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin9 -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
define void @fht(float* nocapture %fz, i16 signext %n) nounwind {
; CHECK-LABEL: fht:
diff --git a/test/CodeGen/Thumb2/ldr-str-imm12.ll b/test/CodeGen/Thumb2/ldr-str-imm12.ll
index 36544d1..d20eef0 100644
--- a/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
; rdar://7352504
; Make sure we use "str r9, [sp, #+28]" instead of "sub.w r4, r7, #256" followed by "str r9, [r4, #-32]".
diff --git a/test/CodeGen/Thumb2/lit.local.cfg b/test/CodeGen/Thumb2/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/CodeGen/Thumb2/lit.local.cfg
+++ b/test/CodeGen/Thumb2/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/Thumb2/thumb2-branch.ll b/test/CodeGen/Thumb2/thumb2-branch.ll
index a00b22d..332ed50 100644
--- a/test/CodeGen/Thumb2/thumb2-branch.ll
+++ b/test/CodeGen/Thumb2/thumb2-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 -arm-atomic-cfg-tidy=0 | FileCheck %s
; If-conversion defeats the purpose of this test, which is to check
; conditional branch generation, so a call is used to make sure it doesn't
; happen and we get actual branches.
diff --git a/test/CodeGen/Thumb2/thumb2-cbnz.ll b/test/CodeGen/Thumb2/thumb2-cbnz.ll
index 893bd0f..f0f7916 100644
--- a/test/CodeGen/Thumb2/thumb2-cbnz.ll
+++ b/test/CodeGen/Thumb2/thumb2-cbnz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 | FileCheck %s
; rdar://7354379
declare double @foo(double) nounwind readnone
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
index 403cd48..a861912 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-default-it | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-no-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-atomic-cfg-tidy=0 -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-atomic-cfg-tidy=0 -arm-no-restrict-it | FileCheck %s
define void @foo(i32 %X, i32 %Y) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
index a71aa3f..79667d4 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-no-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-atomic-cfg-tidy=0 -arm-no-restrict-it | FileCheck %s
; There shouldn't be an unconditional branch at the end of bb52.
; rdar://7184787
diff --git a/test/CodeGen/Thumb2/thumb2-spill-q.ll b/test/CodeGen/Thumb2/thumb2-spill-q.ll
index 52c1063..94f4725 100644
--- a/test/CodeGen/Thumb2/thumb2-spill-q.ll
+++ b/test/CodeGen/Thumb2/thumb2-spill-q.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-elf -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-elf -mattr=+neon -arm-atomic-cfg-tidy=0 | FileCheck %s
; PR4789
%bar = type { float, float, float }
diff --git a/test/CodeGen/Thumb2/tpsoft.ll b/test/CodeGen/Thumb2/tpsoft.ll
new file mode 100644
index 0000000..6ab8bf0
--- /dev/null
+++ b/test/CodeGen/Thumb2/tpsoft.ll
@@ -0,0 +1,54 @@
+; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -o - | \
+; RUN: FileCheck -check-prefix=ELFASM %s
+; RUN: llc %s -mtriple=thumbebv7-linux-gnueabi -o - | \
+; RUN: FileCheck -check-prefix=ELFASM %s
+; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -filetype=obj -o - | \
+; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=ELFOBJ -check-prefix=ELFOBJ-LE %s
+; RUN: llc %s -mtriple=thumbebv7-linux-gnueabi -filetype=obj -o - | \
+; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=ELFOBJ -check-prefix=ELFOBJ-BE %s
+
+;; Make sure that bl __aeabi_read_tp is materialized and fixed up correctly
+;; in the obj case.
+
+@i = external thread_local global i32
+@a = external global i8
+@b = external global [10 x i8]
+
+define arm_aapcs_vfpcc i32 @main() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ switch i32 %0, label %bb2 [
+ i32 12, label %bb
+ i32 13, label %bb1
+ ]
+
+bb: ; preds = %entry
+ %1 = tail call arm_aapcs_vfpcc i32 @foo(i8* @a) nounwind
+ ret i32 %1
+; ELFASM: bl __aeabi_read_tp
+
+
+; ELFOBJ: Sections [
+; ELFOBJ: Section {
+; ELFOBJ: Name: .text
+; ELFOBJ-LE: SectionData (
+;;; BL __aeabi_read_tp is ---------+
+;;; V
+; ELFOBJ-LE-NEXT: 0000: 2DE90048 0E487844 0168FFF7 FEFF4058
+; ELFOBJ-BE: SectionData (
+;;; BL __aeabi_read_tp is ---------+
+;;; V
+; ELFOBJ-BE-NEXT: 0000: E92D4800 480E4478 6801F7FF FFFE5840
+
+
+bb1: ; preds = %entry
+ %2 = tail call arm_aapcs_vfpcc i32 @bar(i32* bitcast ([10 x i8]* @b to i32*)) nounwind
+ ret i32 %2
+
+bb2: ; preds = %entry
+ ret i32 -1
+}
+
+declare arm_aapcs_vfpcc i32 @foo(i8*)
+
+declare arm_aapcs_vfpcc i32 @bar(i32*)
diff --git a/test/CodeGen/Thumb2/v8_IT_3.ll b/test/CodeGen/Thumb2/v8_IT_3.ll
index 4dca246..a028dee 100644
--- a/test/CodeGen/Thumb2/v8_IT_3.ll
+++ b/test/CodeGen/Thumb2/v8_IT_3.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv8 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
-; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
index 2f352d6..2da75ad 100644
--- a/test/CodeGen/Thumb2/v8_IT_5.ll
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -arm-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-atomic-cfg-tidy=0 -arm-restrict-it | FileCheck %s
; CHECK: it ne
; CHECK-NEXT: cmpne
; CHECK-NEXT: bne [[JUMPTARGET:.LBB[0-9]+_[0-9]+]]
diff --git a/test/CodeGen/X86/2007-05-05-Personality.ll b/test/CodeGen/X86/2007-05-05-Personality.ll
index 5b8fe72..b99c58c 100644
--- a/test/CodeGen/X86/2007-05-05-Personality.ll
+++ b/test/CodeGen/X86/2007-05-05-Personality.ll
@@ -1,12 +1,14 @@
; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s --check-prefix=LIN
-; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=LIN
; RUN: llc < %s -mtriple=i386-pc-mingw32 -o - | FileCheck %s --check-prefix=WIN
; RUN: llc < %s -mtriple=i686-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN64
; LIN: .cfi_personality 0, __gnat_eh_personality
; LIN: .cfi_lsda 0, .Lexception0
; WIN: .cfi_personality 0, ___gnat_eh_personality
; WIN: .cfi_lsda 0, Lexception0
+; WIN64: .seh_handler __gnat_eh_personality
+; WIN64: .seh_handlerdata
@error = external global i8
@@ -15,7 +17,7 @@ entry:
invoke void @raise()
to label %eh_then unwind label %unwind
-unwind: ; preds = %entry
+unwind: ; preds = %entry
%eh_ptr = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*)
catch i8* @error
%eh_select = extractvalue { i8*, i32 } %eh_ptr, 1
diff --git a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll b/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll
deleted file mode 100644
index 0ae1897..0000000
--- a/test/CodeGen/X86/2007-09-18-ShuffleXformBug.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -- -86
-
-define i16 @f(<4 x float>* %tmp116117.i1061.i) nounwind {
-entry:
- alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:0 [#uses=167]
- alloca [4 x <4 x float>] ; <[4 x <4 x float>]*>:1 [#uses=170]
- alloca [4 x <4 x i32>] ; <[4 x <4 x i32>]*>:2 [#uses=12]
- %.sub6235.i = getelementptr [4 x <4 x float>]* %0, i32 0, i32 0 ; <<4 x float>*> [#uses=76]
- %.sub.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0 ; <<4 x float>*> [#uses=59]
-
- %tmp124.i1062.i = getelementptr <4 x float>* %tmp116117.i1061.i, i32 63 ; <<4 x float>*> [#uses=1]
- %tmp125.i1063.i = load <4 x float>* %tmp124.i1062.i ; <<4 x float>> [#uses=5]
- %tmp828.i1077.i = shufflevector <4 x float> %tmp125.i1063.i, <4 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ; <<4 x float>> [#uses=4]
- %tmp704.i1085.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1]
- %tmp712.i1086.i = call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %tmp704.i1085.i, <4 x float> %tmp828.i1077.i ) ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp712.i1086.i, <4 x float>* %.sub.i
-
- %tmp2587.i1145.gep.i = getelementptr [4 x <4 x float>]* %1, i32 0, i32 0, i32 2 ; <float*> [#uses=1]
- %tmp5334.i = load float* %tmp2587.i1145.gep.i ; <float> [#uses=5]
- %tmp2723.i1170.i = insertelement <4 x float> undef, float %tmp5334.i, i32 2 ; <<4 x float>> [#uses=5]
- store <4 x float> %tmp2723.i1170.i, <4 x float>* %.sub6235.i
-
- %tmp1406.i1367.i = shufflevector <4 x float> %tmp2723.i1170.i, <4 x float> undef, <4 x i32> < i32 2, i32 2, i32 2, i32 2 > ; <<4 x float>> [#uses=1]
- %tmp84.i1413.i = load <4 x float>* %.sub6235.i ; <<4 x float>> [#uses=1]
- %tmp89.i1415.i = fmul <4 x float> %tmp84.i1413.i, %tmp1406.i1367.i ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp89.i1415.i, <4 x float>* %.sub.i
- ret i16 0
-}
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
diff --git a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
index e64375a..a0106d7 100644
--- a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
+++ b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
@@ -8,7 +8,7 @@ target triple = "i386-pc-linux-gnu"
@__resp = thread_local global %struct.__res_state* @_res ; <%struct.__res_state**> [#uses=1]
@_res = global %struct.__res_state zeroinitializer, section ".bss" ; <%struct.__res_state*> [#uses=1]
-@__libc_resp = hidden alias %struct.__res_state** @__resp ; <%struct.__res_state**> [#uses=2]
+@__libc_resp = hidden thread_local alias %struct.__res_state** @__resp ; <%struct.__res_state**> [#uses=2]
define i32 @foo() {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
index 1259cf4..dfb98bb 100644
--- a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
+++ b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
@@ -1,7 +1,7 @@
; RUN: llc -mcpu=generic -mtriple=x86_64-mingw32 < %s | FileCheck %s
; CHECK: subq $40, %rsp
-; CHECK: movaps %xmm8, (%rsp)
-; CHECK: movaps %xmm7, 16(%rsp)
+; CHECK: movaps %xmm8, 16(%rsp)
+; CHECK: movaps %xmm7, (%rsp)
define i32 @a() nounwind {
entry:
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index f9bf310..850f678 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -11,9 +11,9 @@ entry:
; CHECK: movl 4([[REG]]), %edx
; CHECK: LBB0_1:
; CHECK: movl %eax, %ebx
-; CHECK: addl {{%[a-z]+}}, %ebx
+; CHECK: addl $1, %ebx
; CHECK: movl %edx, %ecx
-; CHECK: adcl {{%[a-z]+}}, %ecx
+; CHECK: adcl $0, %ecx
; CHECK: lock
; CHECK-NEXT: cmpxchg8b ([[REG]])
; CHECK-NEXT: jne
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index b45ac22..4181c26 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -24,7 +24,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
diff --git a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
index f69cedc..ebf51a5 100644
--- a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
+++ b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
@@ -18,7 +18,8 @@ entry:
loop:
; CHECK: lock
; CHECK-NEXT: cmpxchg8b
- %r = cmpxchg i64* %ptr, i64 0, i64 1 monotonic monotonic
+ %pair = cmpxchg i64* %ptr, i64 0, i64 1 monotonic monotonic
+ %r = extractvalue { i64, i1 } %pair, 0
%stored1 = icmp eq i64 %r, 0
br i1 %stored1, label %loop, label %continue
continue:
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index f016528..625a351 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -8,11 +8,11 @@ target triple = "x86_64-apple-darwin10.0.0"
; CHECK: DW_TAG_subprogram
; CHECK: DW_TAG_variable
; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_location
; CHECK-NEXT: DW_AT_name {{.*}} "z_s"
; CHECK-NEXT: DW_AT_decl_file
; CHECK-NEXT: DW_AT_decl_line
; CHECK-NEXT: DW_AT_type{{.*}}{[[TYPE:.*]]}
-; CHECK-NEXT: DW_AT_location
; CHECK: [[TYPE]]:
; CHECK-NEXT: DW_AT_name {{.*}} "int"
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 650839a..36667de 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -69,15 +69,15 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!1 = metadata !{metadata !2}
!2 = metadata !{}
!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
-!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!7 = metadata !{i32 786443, metadata !8, i32 807, i32 0, metadata !14, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!8 = metadata !{i32 786443, metadata !9, i32 440, i32 0, metadata !14, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!9 = metadata !{i32 786443, metadata !10, i32 435, i32 0, metadata !14, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !2, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
diff --git a/test/CodeGen/X86/2014-05-29-factorial.ll b/test/CodeGen/X86/2014-05-29-factorial.ll
new file mode 100644
index 0000000..987a21d
--- /dev/null
+++ b/test/CodeGen/X86/2014-05-29-factorial.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; CHECK: decq [[X:%rdi|%rcx]]
+; CHECK-NOT: testq [[X]], [[X]]
+
+define i64 @fact2(i64 %x) {
+entry:
+ br label %while.body
+
+while.body:
+ %result.06 = phi i64 [ %mul, %while.body ], [ 1, %entry ]
+ %x.addr.05 = phi i64 [ %dec, %while.body ], [ %x, %entry ]
+ %mul = mul nsw i64 %result.06, %x.addr.05
+ %dec = add nsw i64 %x.addr.05, -1
+ %cmp = icmp sgt i64 %dec, 0
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:
+ %mul.lcssa = phi i64 [ %mul, %while.body ]
+ br label %while.end
+
+while.end:
+ %result.0.lcssa = phi i64 [ %mul.lcssa, %while.end.loopexit ]
+ ret i64 %result.0.lcssa
+}
diff --git a/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll b/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll
new file mode 100644
index 0000000..4580795
--- /dev/null
+++ b/test/CodeGen/X86/2014-05-30-CombineAddNSW.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; CHECK: addl
+
+; The two additions are the same, but have different flags.
+; In theory this code should never be generated by the frontend, but this
+; test checks that two otherwise identical instructions with different flags
+; actually generate two different nodes.
+;
+; Normally the combiner would see this pattern without the flags
+; and optimize the result of the sub into a register clear
+; (the final result would be 0). With the different flags, though, the
+; combiner needs to keep the add and sub nodes: the two adds are distinct
+; nodes, so it cannot assume that subtracting one from the other yields 0.
+define i32 @foo(i32 %a, i32 %b) {
+ %1 = add i32 %a, %b
+ %2 = add nsw i32 %a, %b
+ %3 = sub i32 %1, %2
+ ret i32 %3
+}
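
For contrast, a hedged sketch of the folding the comment above alludes to: if the second addition carried no nsw flag either, the two adds would be a single node and the sub would fold to zero (illustrative IR, not part of the test):

    define i32 @foo_same_flags(i32 %a, i32 %b) {
      %1 = add i32 %a, %b
      %2 = add i32 %a, %b   ; identical to %1, so CSE'd into a single node
      %3 = sub i32 %1, %2   ; combiner proves %1 == %2 and folds this to 0
      ret i32 %3
    }
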
diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll
index c274688..c392e94 100644
--- a/test/CodeGen/X86/Atomics-64.ll
+++ b/test/CodeGen/X86/Atomics-64.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86-64 > %t.x86-64
-; RUN: llc < %s -march=x86 > %t.x86
+; RUN: llc < %s -march=x86 -mattr=cx16 > %t.x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
@@ -704,7 +704,8 @@ entry:
%3 = zext i8 %2 to i32
%4 = trunc i32 %3 to i8
%5 = trunc i32 %1 to i8
- %6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic monotonic
+ %pair6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic monotonic
+ %6 = extractvalue { i8, i1 } %pair6, 0
store i8 %6, i8* @sc, align 1
%7 = load i8* @sc, align 1
%8 = zext i8 %7 to i32
@@ -712,7 +713,8 @@ entry:
%10 = zext i8 %9 to i32
%11 = trunc i32 %10 to i8
%12 = trunc i32 %8 to i8
- %13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic monotonic
+ %pair13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic monotonic
+ %13 = extractvalue { i8, i1 } %pair13, 0
store i8 %13, i8* @uc, align 1
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
@@ -722,7 +724,8 @@ entry:
%19 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
%20 = trunc i32 %18 to i16
%21 = trunc i32 %16 to i16
- %22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic monotonic
+ %pair22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic monotonic
+ %22 = extractvalue { i16, i1 } %pair22, 0
store i16 %22, i16* @ss, align 2
%23 = load i8* @sc, align 1
%24 = sext i8 %23 to i16
@@ -732,49 +735,56 @@ entry:
%28 = bitcast i8* bitcast (i16* @us to i8*) to i16*
%29 = trunc i32 %27 to i16
%30 = trunc i32 %25 to i16
- %31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic monotonic
+ %pair31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic monotonic
+ %31 = extractvalue { i16, i1 } %pair31, 0
store i16 %31, i16* @us, align 2
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i32
%34 = load i8* @uc, align 1
%35 = zext i8 %34 to i32
%36 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic monotonic
+ %pair37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic monotonic
+ %37 = extractvalue { i32, i1 } %pair37, 0
store i32 %37, i32* @si, align 4
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i32
%40 = load i8* @uc, align 1
%41 = zext i8 %40 to i32
%42 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic monotonic
+ %pair43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic monotonic
+ %43 = extractvalue { i32, i1 } %pair43, 0
store i32 %43, i32* @ui, align 4
%44 = load i8* @sc, align 1
%45 = sext i8 %44 to i64
%46 = load i8* @uc, align 1
%47 = zext i8 %46 to i64
%48 = bitcast i8* bitcast (i64* @sl to i8*) to i64*
- %49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic monotonic
+ %pair49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic monotonic
+ %49 = extractvalue { i64, i1 } %pair49, 0
store i64 %49, i64* @sl, align 8
%50 = load i8* @sc, align 1
%51 = sext i8 %50 to i64
%52 = load i8* @uc, align 1
%53 = zext i8 %52 to i64
%54 = bitcast i8* bitcast (i64* @ul to i8*) to i64*
- %55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic monotonic
+ %pair55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic monotonic
+ %55 = extractvalue { i64, i1 } %pair55, 0
store i64 %55, i64* @ul, align 8
%56 = load i8* @sc, align 1
%57 = sext i8 %56 to i64
%58 = load i8* @uc, align 1
%59 = zext i8 %58 to i64
%60 = bitcast i8* bitcast (i64* @sll to i8*) to i64*
- %61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic monotonic
+ %pair61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic monotonic
+ %61 = extractvalue { i64, i1 } %pair61, 0
store i64 %61, i64* @sll, align 8
%62 = load i8* @sc, align 1
%63 = sext i8 %62 to i64
%64 = load i8* @uc, align 1
%65 = zext i8 %64 to i64
%66 = bitcast i8* bitcast (i64* @ull to i8*) to i64*
- %67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic monotonic
+ %pair67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic monotonic
+ %67 = extractvalue { i64, i1 } %pair67, 0
store i64 %67, i64* @ull, align 8
%68 = load i8* @sc, align 1
%69 = zext i8 %68 to i32
@@ -782,7 +792,8 @@ entry:
%71 = zext i8 %70 to i32
%72 = trunc i32 %71 to i8
%73 = trunc i32 %69 to i8
- %74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic monotonic
+ %pair74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic monotonic
+ %74 = extractvalue { i8, i1 } %pair74, 0
%75 = icmp eq i8 %74, %72
%76 = zext i1 %75 to i8
%77 = zext i8 %76 to i32
@@ -793,7 +804,8 @@ entry:
%81 = zext i8 %80 to i32
%82 = trunc i32 %81 to i8
%83 = trunc i32 %79 to i8
- %84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic monotonic
+ %pair84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic monotonic
+ %84 = extractvalue { i8, i1 } %pair84, 0
%85 = icmp eq i8 %84, %82
%86 = zext i1 %85 to i8
%87 = zext i8 %86 to i32
@@ -805,7 +817,8 @@ entry:
%92 = zext i8 %91 to i32
%93 = trunc i32 %92 to i8
%94 = trunc i32 %90 to i8
- %95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic monotonic
+ %pair95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic monotonic
+ %95 = extractvalue { i8, i1 } %pair95, 0
%96 = icmp eq i8 %95, %93
%97 = zext i1 %96 to i8
%98 = zext i8 %97 to i32
@@ -817,7 +830,8 @@ entry:
%103 = zext i8 %102 to i32
%104 = trunc i32 %103 to i8
%105 = trunc i32 %101 to i8
- %106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic monotonic
+ %pair106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic monotonic
+ %106 = extractvalue { i8, i1 } %pair106, 0
%107 = icmp eq i8 %106, %104
%108 = zext i1 %107 to i8
%109 = zext i8 %108 to i32
@@ -828,7 +842,8 @@ entry:
%113 = zext i8 %112 to i32
%114 = trunc i32 %113 to i8
%115 = trunc i32 %111 to i8
- %116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic monotonic
+ %pair116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic monotonic
+ %116 = extractvalue { i8, i1 } %pair116, 0
%117 = icmp eq i8 %116, %114
%118 = zext i1 %117 to i8
%119 = zext i8 %118 to i32
@@ -839,7 +854,8 @@ entry:
%123 = zext i8 %122 to i32
%124 = trunc i32 %123 to i8
%125 = trunc i32 %121 to i8
- %126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic monotonic
+ %pair126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic monotonic
+ %126 = extractvalue { i8, i1 } %pair126, 0
%127 = icmp eq i8 %126, %124
%128 = zext i1 %127 to i8
%129 = zext i8 %128 to i32
@@ -850,7 +866,8 @@ entry:
%133 = zext i8 %132 to i64
%134 = trunc i64 %133 to i8
%135 = trunc i64 %131 to i8
- %136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic monotonic
+ %pair136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic monotonic
+ %136 = extractvalue { i8, i1 } %pair136, 0
%137 = icmp eq i8 %136, %134
%138 = zext i1 %137 to i8
%139 = zext i8 %138 to i32
@@ -861,7 +878,8 @@ entry:
%143 = zext i8 %142 to i64
%144 = trunc i64 %143 to i8
%145 = trunc i64 %141 to i8
- %146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic monotonic
+ %pair146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic monotonic
+ %146 = extractvalue { i8, i1 } %pair146, 0
%147 = icmp eq i8 %146, %144
%148 = zext i1 %147 to i8
%149 = zext i8 %148 to i32
@@ -872,7 +890,8 @@ entry:
%153 = zext i8 %152 to i64
%154 = trunc i64 %153 to i8
%155 = trunc i64 %151 to i8
- %156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic monotonic
+ %pair156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic monotonic
+ %156 = extractvalue { i8, i1 } %pair156, 0
%157 = icmp eq i8 %156, %154
%158 = zext i1 %157 to i8
%159 = zext i8 %158 to i32
@@ -883,7 +902,8 @@ entry:
%163 = zext i8 %162 to i64
%164 = trunc i64 %163 to i8
%165 = trunc i64 %161 to i8
- %166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic monotonic
+ %pair166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic monotonic
+ %166 = extractvalue { i8, i1 } %pair166, 0
%167 = icmp eq i8 %166, %164
%168 = zext i1 %167 to i8
%169 = zext i8 %168 to i32
diff --git a/test/CodeGen/X86/GC/lit.local.cfg b/test/CodeGen/X86/GC/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/CodeGen/X86/GC/lit.local.cfg
+++ b/test/CodeGen/X86/GC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index 8487c60..bf55644 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -1,4 +1,20 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false \
+; RUN: -relocation-model=pic | FileCheck %s
+
+@thread_var = thread_local global i32 42, align 4
+@thread_alias = thread_local(localdynamic) alias i32* @thread_var
+
+; CHECK-LABEL: get_thread_var
+define i32* @get_thread_var() {
+; CHECK: leal thread_var@TLSGD
+ ret i32* @thread_var
+}
+
+; CHECK-LABEL: get_thread_alias
+define i32* @get_thread_alias() {
+; CHECK: leal thread_alias@TLSLD
+ ret i32* @thread_alias
+}
@bar = global i32 42
@@ -22,7 +38,7 @@ define i32 @foo_f() {
@bar_i = alias internal i32* @bar
; CHECK-DAG: .globl A
-@A = alias i64, i32* @bar
+@A = alias bitcast (i32* @bar to i64*)
; CHECK-DAG: .globl bar_h
; CHECK-DAG: .hidden bar_h
@@ -32,6 +48,19 @@ define i32 @foo_f() {
; CHECK-DAG: .protected bar_p
@bar_p = protected alias i32* @bar
+; CHECK-DAG: test2 = bar+4
+@test2 = alias getelementptr(i32 *@bar, i32 1)
+
+; CHECK-DAG: test3 = 42
+@test3 = alias inttoptr(i32 42 to i32*)
+
+; CHECK-DAG: test4 = bar
+@test4 = alias inttoptr(i64 ptrtoint (i32* @bar to i64) to i32*)
+
+; CHECK-DAG: test5 = test2-bar
+@test5 = alias inttoptr(i32 sub (i32 ptrtoint (i32* @test2 to i32),
+ i32 ptrtoint (i32* @bar to i32)) to i32*)
+
; CHECK-DAG: .globl test
define i32 @test() {
entry:
diff --git a/test/CodeGen/X86/atom-fixup-lea4.ll b/test/CodeGen/X86/atom-fixup-lea4.ll
new file mode 100644
index 0000000..668574b
--- /dev/null
+++ b/test/CodeGen/X86/atom-fixup-lea4.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mcpu=atom -mtriple=x86_64-linux
+
+%struct.ValueWrapper = type { double }
+%struct.ValueWrapper.6 = type { %struct.ValueWrapper.7 }
+%struct.ValueWrapper.7 = type { %struct.ValueWrapper.8 }
+%struct.ValueWrapper.8 = type { %struct.ValueWrapper }
+
+; Function Attrs: uwtable
+define linkonce_odr void @_ZN12ValueWrapperIS_IS_IS_IdEEEEC2Ev(%struct.ValueWrapper.6* %this) unnamed_addr #0 align 2 {
+entry:
+ %this.addr = alloca %struct.ValueWrapper.6*, align 8
+ store %struct.ValueWrapper.6* %this, %struct.ValueWrapper.6** %this.addr, align 8
+ %this1 = load %struct.ValueWrapper.6** %this.addr
+ %value = getelementptr inbounds %struct.ValueWrapper.6* %this1, i32 0, i32 0
+ call void @_ZN12ValueWrapperIS_IS_IdEEEC2Ev(%struct.ValueWrapper.7* %value)
+ ret void
+}
+
+; Function Attrs: uwtable
+declare void @_ZN12ValueWrapperIS_IS_IdEEEC2Ev(%struct.ValueWrapper.7*) unnamed_addr #0 align 2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll
index 17e04f0..7352d5a 100644
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 -march=x86 -verify-machineinstrs | FileCheck %s
; 64-bit load/store on x86-32
; FIXME: The generated code can be substantially improved.
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll
index 1cfbc49..ffb7a3f 100644
--- a/test/CodeGen/X86/atomic-minmax-i6432.ll
+++ b/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -1,6 +1,5 @@
-; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX
-; RUN: llc -march=x86 -mattr=-cmov -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=NOCMOV
-; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC
+; RUN: llc -march=x86 -mattr=+cmov,cx16 -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX
+; RUN: llc -march=x86 -mattr=cx16 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC
@sc64 = external global i64
@@ -9,87 +8,39 @@ define void @atomic_maxmin_i6432() {
%1 = atomicrmw max i64* @sc64, i64 5 acquire
; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
; LINUX: cmpl
-; LINUX: setl
-; LINUX: cmpl
-; LINUX: setl
+; LINUX: seta
; LINUX: cmovne
; LINUX: cmovne
; LINUX: lock
; LINUX-NEXT: cmpxchg8b
; LINUX: jne [[LABEL]]
-; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; NOCMOV: cmpl
-; NOCMOV: setl
-; NOCMOV: cmpl
-; NOCMOV: setl
-; NOCMOV: jne
-; NOCMOV: jne
-; NOCMOV: lock
-; NOCMOV-NEXT: cmpxchg8b
-; NOCMOV: jne [[LABEL]]
%2 = atomicrmw min i64* @sc64, i64 6 acquire
; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
; LINUX: cmpl
-; LINUX: setg
-; LINUX: cmpl
-; LINUX: setg
+; LINUX: setb
; LINUX: cmovne
; LINUX: cmovne
; LINUX: lock
; LINUX-NEXT: cmpxchg8b
; LINUX: jne [[LABEL]]
-; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; NOCMOV: cmpl
-; NOCMOV: setg
-; NOCMOV: cmpl
-; NOCMOV: setg
-; NOCMOV: jne
-; NOCMOV: jne
-; NOCMOV: lock
-; NOCMOV-NEXT: cmpxchg8b
-; NOCMOV: jne [[LABEL]]
%3 = atomicrmw umax i64* @sc64, i64 7 acquire
; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
; LINUX: cmpl
-; LINUX: setb
-; LINUX: cmpl
-; LINUX: setb
+; LINUX: seta
; LINUX: cmovne
; LINUX: cmovne
; LINUX: lock
; LINUX-NEXT: cmpxchg8b
; LINUX: jne [[LABEL]]
-; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; NOCMOV: cmpl
-; NOCMOV: setb
-; NOCMOV: cmpl
-; NOCMOV: setb
-; NOCMOV: jne
-; NOCMOV: jne
-; NOCMOV: lock
-; NOCMOV-NEXT: cmpxchg8b
-; NOCMOV: jne [[LABEL]]
%4 = atomicrmw umin i64* @sc64, i64 8 acquire
; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
; LINUX: cmpl
-; LINUX: seta
-; LINUX: cmpl
-; LINUX: seta
+; LINUX: setb
; LINUX: cmovne
; LINUX: cmovne
; LINUX: lock
; LINUX-NEXT: cmpxchg8b
; LINUX: jne [[LABEL]]
-; NOCMOV: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; NOCMOV: cmpl
-; NOCMOV: seta
-; NOCMOV: cmpl
-; NOCMOV: seta
-; NOCMOV: jne
-; NOCMOV: jne
-; NOCMOV: lock
-; NOCMOV-NEXT: cmpxchg8b
-; NOCMOV: jne [[LABEL]]
ret void
}
@@ -98,8 +49,8 @@ define void @atomic_maxmin_i6432() {
define void @tf_bug(i8* %ptr) nounwind {
; PIC-LABEL: tf_bug:
-; PIC: movl _id-L1$pb(
-; PIC: movl (_id-L1$pb)+4(
+; PIC-DAG: movl _id-L1$pb(
+; PIC-DAG: movl (_id-L1$pb)+4(
%tmp1 = atomicrmw add i64* @id, i64 1 seq_cst
%tmp2 = add i64 %tmp1, 1
%tmp3 = bitcast i8* %ptr to i64*
diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll
new file mode 100644
index 0000000..18749b9
--- /dev/null
+++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+
+define i64 @test_add(i64* %addr, i64 %inc) {
+; CHECK-LABEL: test_add:
+; CHECK: calll __sync_fetch_and_add_8
+ %old = atomicrmw add i64* %addr, i64 %inc seq_cst
+ ret i64 %old
+}
+
+define i64 @test_sub(i64* %addr, i64 %inc) {
+; CHECK-LABEL: test_sub:
+; CHECK: calll __sync_fetch_and_sub_8
+ %old = atomicrmw sub i64* %addr, i64 %inc seq_cst
+ ret i64 %old
+}
+
+define i64 @test_and(i64* %andr, i64 %inc) {
+; CHECK-LABEL: test_and:
+; CHECK: calll __sync_fetch_and_and_8
+ %old = atomicrmw and i64* %andr, i64 %inc seq_cst
+ ret i64 %old
+}
+
+define i64 @test_or(i64* %orr, i64 %inc) {
+; CHECK-LABEL: test_or:
+; CHECK: calll __sync_fetch_and_or_8
+ %old = atomicrmw or i64* %orr, i64 %inc seq_cst
+ ret i64 %old
+}
+
+define i64 @test_xor(i64* %xorr, i64 %inc) {
+; CHECK-LABEL: test_xor:
+; CHECK: calll __sync_fetch_and_xor_8
+ %old = atomicrmw xor i64* %xorr, i64 %inc seq_cst
+ ret i64 %old
+}
+
+define i64 @test_nand(i64* %nandr, i64 %inc) {
+; CHECK-LABEL: test_nand:
+; CHECK: calll __sync_fetch_and_nand_8
+ %old = atomicrmw nand i64* %nandr, i64 %inc seq_cst
+ ret i64 %old
+}
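
The six tests above pin the lowering for plain i386, which lacks a 64-bit compare-and-exchange: each atomicrmw becomes a call to the corresponding GCC __sync libcall. A hedged IR-level view of what one such expansion amounts to (the declaration follows the conventional __sync ABI, returning the old value; this is an assumption, as the real call is emitted during legalization rather than written in IR):

    declare i64 @__sync_fetch_and_add_8(i64*, i64)

    define i64 @expanded_add_sketch(i64* %addr, i64 %inc) {
      ; returns the value %addr held before the addition
      %old = call i64 @__sync_fetch_and_add_8(i64* %addr, i64 %inc)
      ret i64 %old
    }
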
diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll
new file mode 100644
index 0000000..741d290
--- /dev/null
+++ b/test/CodeGen/X86/atomic128.ll
@@ -0,0 +1,316 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s
+
+@var = global i128 0
+
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: movq %rsi, %rax
+; CHECK: movq %rcx, %rbx
+; CHECK: movq %r8, %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+
+ %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ %val = extractvalue { i128, i1 } %pair, 0
+ ret i128 %val
+}
+
+define void @fetch_and_nand(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: movq %rdx, %rcx
+; CHECK: andq [[INCHI]], %rcx
+; CHECK: movq %rax, %rbx
+ ; The INCLO equivalent comes in via %rsi, so it makes sense for it to stay there.
+; CHECK: andq %rsi, %rbx
+; CHECK: notq %rbx
+; CHECK: notq %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+ %val = atomicrmw nand i128* %p, i128 %bits release
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_or(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: movq %rax, %rbx
+ ; The INCLO equivalent comes in via %rsi, so it makes sense for it to stay there.
+; CHECK: orq %rsi, %rbx
+; CHECK: movq %rdx, %rcx
+; CHECK: orq [[INCHI]], %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw or i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_add(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_add:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: movq %rax, %rbx
+ ; The INCLO equivalent comes in via %rsi, so it makes sense for it to stay there.
+; CHECK: addq %rsi, %rbx
+; CHECK: movq %rdx, %rcx
+; CHECK: adcq [[INCHI]], %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw add i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_sub(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_sub:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: movq %rax, %rbx
+ ; The INCLO equivalent comes in via %rsi, so it makes sense for it to stay there.
+; CHECK: subq %rsi, %rbx
+; CHECK: movq %rdx, %rcx
+; CHECK: sbbq [[INCHI]], %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw sub i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_min(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_min:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpq %rsi, %rax
+; CHECK: setbe [[CMP:%[a-z0-9]+]]
+; CHECK: cmpq [[INCHI]], %rdx
+; CHECK: setle [[HICMP:%[a-z0-9]+]]
+; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: movb [[HICMP]], [[CMP]]
+; CHECK: [[USE_LO]]:
+; CHECK: testb [[CMP]], [[CMP]]
+; CHECK: movq %rsi, %rbx
+; CHECK: cmovneq %rax, %rbx
+; CHECK: movq [[INCHI]], %rcx
+; CHECK: cmovneq %rdx, %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw min i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_max(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_max:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpq %rsi, %rax
+; CHECK: setae [[CMP:%[a-z0-9]+]]
+; CHECK: cmpq [[INCHI]], %rdx
+; CHECK: setge [[HICMP:%[a-z0-9]+]]
+; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: movb [[HICMP]], [[CMP]]
+; CHECK: [[USE_LO]]:
+; CHECK: testb [[CMP]], [[CMP]]
+; CHECK: movq %rsi, %rbx
+; CHECK: cmovneq %rax, %rbx
+; CHECK: movq [[INCHI]], %rcx
+; CHECK: cmovneq %rdx, %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw max i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umin(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umin:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpq %rsi, %rax
+; CHECK: setbe [[CMP:%[a-z0-9]+]]
+; CHECK: cmpq [[INCHI]], %rdx
+; CHECK: setbe [[HICMP:%[a-z0-9]+]]
+; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: movb [[HICMP]], [[CMP]]
+; CHECK: [[USE_LO]]:
+; CHECK: testb [[CMP]], [[CMP]]
+; CHECK: movq %rsi, %rbx
+; CHECK: cmovneq %rax, %rbx
+; CHECK: movq [[INCHI]], %rcx
+; CHECK: cmovneq %rdx, %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK-DAG: movq %rdx, [[INCHI:%[a-z0-9]+]]
+; CHECK-DAG: movq (%rdi), %rax
+; CHECK-DAG: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpq %rax, %rsi
+; CHECK: setb [[CMP:%[a-z0-9]+]]
+; CHECK: cmpq [[INCHI]], %rdx
+; CHECK: seta [[HICMP:%[a-z0-9]+]]
+; CHECK: je [[USE_LO:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: movb [[HICMP]], [[CMP]]
+; CHECK: [[USE_LO]]:
+; CHECK: testb [[CMP]], [[CMP]]
+; CHECK: movq %rsi, %rbx
+; CHECK: cmovneq %rax, %rbx
+; CHECK: movq [[INCHI]], %rcx
+; CHECK: cmovneq %rdx, %rcx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+; CHECK: movq %rax, _var
+; CHECK: movq %rdx, _var+8
+
+ %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define i128 @atomic_load_seq_cst(i128* %p) {
+; CHECK-LABEL: atomic_load_seq_cst:
+; CHECK: xorl %eax, %eax
+; CHECK: xorl %edx, %edx
+; CHECK: xorl %ebx, %ebx
+; CHECK: xorl %ecx, %ecx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+
+ %r = load atomic i128* %p seq_cst, align 16
+ ret i128 %r
+}
+
+define i128 @atomic_load_relaxed(i128* %p) {
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK: xorl %eax, %eax
+; CHECK: xorl %edx, %edx
+; CHECK: xorl %ebx, %ebx
+; CHECK: xorl %ecx, %ecx
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+
+ %r = load atomic i128* %p monotonic, align 16
+ ret i128 %r
+}
+
+define void @atomic_store_seq_cst(i128* %p, i128 %in) {
+; CHECK-LABEL: atomic_store_seq_cst:
+; CHECK: movq %rdx, %rcx
+; CHECK: movq %rsi, %rbx
+; CHECK: movq (%rdi), %rax
+; CHECK: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+; CHECK-NOT: callq ___sync_lock_test_and_set_16
+
+ store atomic i128 %in, i128* %p seq_cst, align 16
+ ret void
+}
+
+define void @atomic_store_release(i128* %p, i128 %in) {
+; CHECK-LABEL: atomic_store_release:
+; CHECK: movq %rdx, %rcx
+; CHECK: movq %rsi, %rbx
+; CHECK: movq (%rdi), %rax
+; CHECK: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+ store atomic i128 %in, i128* %p release, align 16
+ ret void
+}
+
+define void @atomic_store_relaxed(i128* %p, i128 %in) {
+; CHECK-LABEL: atomic_store_relaxed:
+; CHECK: movq %rdx, %rcx
+; CHECK: movq %rsi, %rbx
+; CHECK: movq (%rdi), %rax
+; CHECK: movq 8(%rdi), %rdx
+
+; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: lock
+; CHECK: cmpxchg16b (%rdi)
+; CHECK: jne [[LOOP]]
+
+ store atomic i128 %in, i128* %p unordered, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index 45d3ff4..faaa4c4 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -4,8 +4,8 @@
@sc16 = external global i16
define void @atomic_fetch_add16() nounwind {
-; X64: atomic_fetch_add16
-; X32: atomic_fetch_add16
+; X64-LABEL: atomic_fetch_add16
+; X32-LABEL: atomic_fetch_add16
entry:
; 32-bit
%t1 = atomicrmw add i16* @sc16, i16 1 acquire
@@ -34,8 +34,8 @@ entry:
}
define void @atomic_fetch_sub16() nounwind {
-; X64: atomic_fetch_sub16
-; X32: atomic_fetch_sub16
+; X64-LABEL: atomic_fetch_sub16
+; X32-LABEL: atomic_fetch_sub16
%t1 = atomicrmw sub i16* @sc16, i16 1 acquire
; X64: lock
; X64: decw
@@ -62,18 +62,18 @@ define void @atomic_fetch_sub16() nounwind {
}
define void @atomic_fetch_and16() nounwind {
-; X64: atomic_fetch_and16
-; X32: atomic_fetch_and16
+; X64-LABEL: atomic_fetch_and16
+; X32-LABEL: atomic_fetch_and16
%t1 = atomicrmw and i16* @sc16, i16 3 acquire
; X64: lock
; X64: andw $3, {{.*}} # encoding: [0xf0,0x66
; X32: lock
; X32: andw $3
%t2 = atomicrmw and i16* @sc16, i16 5 acquire
-; X64: andw
+; X64: andl
; X64: lock
; X64: cmpxchgw
-; X32: andw
+; X32: andl
; X32: lock
; X32: cmpxchgw
%t3 = atomicrmw and i16* @sc16, i16 %t2 acquire
@@ -87,18 +87,18 @@ define void @atomic_fetch_and16() nounwind {
}
define void @atomic_fetch_or16() nounwind {
-; X64: atomic_fetch_or16
-; X32: atomic_fetch_or16
+; X64-LABEL: atomic_fetch_or16
+; X32-LABEL: atomic_fetch_or16
%t1 = atomicrmw or i16* @sc16, i16 3 acquire
; X64: lock
; X64: orw $3, {{.*}} # encoding: [0xf0,0x66
; X32: lock
; X32: orw $3
%t2 = atomicrmw or i16* @sc16, i16 5 acquire
-; X64: orw
+; X64: orl
; X64: lock
; X64: cmpxchgw
-; X32: orw
+; X32: orl
; X32: lock
; X32: cmpxchgw
%t3 = atomicrmw or i16* @sc16, i16 %t2 acquire
@@ -112,18 +112,18 @@ define void @atomic_fetch_or16() nounwind {
}
define void @atomic_fetch_xor16() nounwind {
-; X64: atomic_fetch_xor16
-; X32: atomic_fetch_xor16
+; X64-LABEL: atomic_fetch_xor16
+; X32-LABEL: atomic_fetch_xor16
%t1 = atomicrmw xor i16* @sc16, i16 3 acquire
; X64: lock
; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66
; X32: lock
; X32: xorw $3
%t2 = atomicrmw xor i16* @sc16, i16 5 acquire
-; X64: xorw
+; X64: xorl
; X64: lock
; X64: cmpxchgw
-; X32: xorw
+; X32: xorl
; X32: lock
; X32: cmpxchgw
%t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire
@@ -137,15 +137,15 @@ define void @atomic_fetch_xor16() nounwind {
}
define void @atomic_fetch_nand16(i16 %x) nounwind {
-; X64: atomic_fetch_nand16
-; X32: atomic_fetch_nand16
+; X64-LABEL: atomic_fetch_nand16
+; X32-LABEL: atomic_fetch_nand16
%t1 = atomicrmw nand i16* @sc16, i16 %x acquire
-; X64: andw
-; X64: notw
+; X64: andl
+; X64: notl
; X64: lock
; X64: cmpxchgw
-; X32: andw
-; X32: notw
+; X32: andl
+; X32: notl
; X32: lock
; X32: cmpxchgw
ret void
@@ -155,12 +155,16 @@ define void @atomic_fetch_nand16(i16 %x) nounwind {
define void @atomic_fetch_max16(i16 %x) nounwind {
%t1 = atomicrmw max i16* @sc16, i16 %x acquire
-; X64: cmpw
+; X64: movswl
+; X64: movswl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: cmpw
+; X32: movswl
+; X32: movswl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -171,12 +175,16 @@ define void @atomic_fetch_max16(i16 %x) nounwind {
define void @atomic_fetch_min16(i16 %x) nounwind {
%t1 = atomicrmw min i16* @sc16, i16 %x acquire
-; X64: cmpw
+; X64: movswl
+; X64: movswl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: cmpw
+; X32: movswl
+; X32: movswl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -187,12 +195,16 @@ define void @atomic_fetch_min16(i16 %x) nounwind {
define void @atomic_fetch_umax16(i16 %x) nounwind {
%t1 = atomicrmw umax i16* @sc16, i16 %x acquire
-; X64: cmpw
+; X64: movzwl
+; X64: movzwl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: cmpw
+; X32: movzwl
+; X32: movzwl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -203,11 +215,16 @@ define void @atomic_fetch_umax16(i16 %x) nounwind {
define void @atomic_fetch_umin16(i16 %x) nounwind {
%t1 = atomicrmw umin i16* @sc16, i16 %x acquire
-; X64: cmpw
+; X64: movzwl
+; X64: movzwl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: cmpw
+
+; X32: movzwl
+; X32: movzwl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgw
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 474c0e6..4f2cbe0 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -5,8 +5,8 @@
@sc32 = external global i32
define void @atomic_fetch_add32() nounwind {
-; X64: atomic_fetch_add32
-; X32: atomic_fetch_add32
+; X64-LABEL: atomic_fetch_add32:
+; X32-LABEL: atomic_fetch_add32:
entry:
; 32-bit
%t1 = atomicrmw add i32* @sc32, i32 1 acquire
@@ -35,8 +35,8 @@ entry:
}
define void @atomic_fetch_sub32() nounwind {
-; X64: atomic_fetch_sub32
-; X32: atomic_fetch_sub32
+; X64-LABEL: atomic_fetch_sub32:
+; X32-LABEL: atomic_fetch_sub32:
%t1 = atomicrmw sub i32* @sc32, i32 1 acquire
; X64: lock
; X64: decl
@@ -63,8 +63,8 @@ define void @atomic_fetch_sub32() nounwind {
}
define void @atomic_fetch_and32() nounwind {
-; X64: atomic_fetch_and32
-; X32: atomic_fetch_and32
+; X64-LABEL: atomic_fetch_and32:
+; X32-LABEL: atomic_fetch_and32:
%t1 = atomicrmw and i32* @sc32, i32 3 acquire
; X64: lock
; X64: andl $3
@@ -88,8 +88,8 @@ define void @atomic_fetch_and32() nounwind {
}
define void @atomic_fetch_or32() nounwind {
-; X64: atomic_fetch_or32
-; X32: atomic_fetch_or32
+; X64-LABEL: atomic_fetch_or32:
+; X32-LABEL: atomic_fetch_or32:
%t1 = atomicrmw or i32* @sc32, i32 3 acquire
; X64: lock
; X64: orl $3
@@ -113,8 +113,8 @@ define void @atomic_fetch_or32() nounwind {
}
define void @atomic_fetch_xor32() nounwind {
-; X64: atomic_fetch_xor32
-; X32: atomic_fetch_xor32
+; X64-LABEL: atomic_fetch_xor32:
+; X32-LABEL: atomic_fetch_xor32:
%t1 = atomicrmw xor i32* @sc32, i32 3 acquire
; X64: lock
; X64: xorl $3
@@ -138,8 +138,8 @@ define void @atomic_fetch_xor32() nounwind {
}
define void @atomic_fetch_nand32(i32 %x) nounwind {
-; X64: atomic_fetch_nand32
-; X32: atomic_fetch_nand32
+; X64-LABEL: atomic_fetch_nand32:
+; X32-LABEL: atomic_fetch_nand32:
%t1 = atomicrmw nand i32* @sc32, i32 %x acquire
; X64: andl
; X64: notl
@@ -155,19 +155,22 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
}
define void @atomic_fetch_max32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_max32:
+; X32-LABEL: atomic_fetch_max32:
+; NOCMOV-LABEL: atomic_fetch_max32:
+
%t1 = atomicrmw max i32* @sc32, i32 %x acquire
-; X64: cmpl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgl
-; X32: cmpl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgl
-; NOCMOV: cmpl
-; NOCMOV: jl
+; NOCMOV: subl
+; NOCMOV: jge
; NOCMOV: lock
; NOCMOV: cmpxchgl
ret void
@@ -177,19 +180,23 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
}
define void @atomic_fetch_min32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_min32:
+; X32-LABEL: atomic_fetch_min32:
+; NOCMOV-LABEL: atomic_fetch_min32:
+
%t1 = atomicrmw min i32* @sc32, i32 %x acquire
-; X64: cmpl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgl
-; X32: cmpl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgl
-; NOCMOV: cmpl
-; NOCMOV: jg
+; NOCMOV: subl
+; NOCMOV: jle
; NOCMOV: lock
; NOCMOV: cmpxchgl
ret void
@@ -199,19 +206,23 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
}
define void @atomic_fetch_umax32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax32:
+; X32-LABEL: atomic_fetch_umax32:
+; NOCMOV-LABEL: atomic_fetch_umax32:
+
%t1 = atomicrmw umax i32* @sc32, i32 %x acquire
-; X64: cmpl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgl
-; X32: cmpl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgl
-; NOCMOV: cmpl
-; NOCMOV: jb
+; NOCMOV: subl
+; NOCMOV: ja
; NOCMOV: lock
; NOCMOV: cmpxchgl
ret void
@@ -221,19 +232,23 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
}
define void @atomic_fetch_umin32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin32:
+; X32-LABEL: atomic_fetch_umin32:
+; NOCMOV-LABEL: atomic_fetch_umin32:
+
%t1 = atomicrmw umin i32* @sc32, i32 %x acquire
-; X64: cmpl
+; X64: subl
; X64: cmov
; X64: lock
; X64: cmpxchgl
-; X32: cmpl
+; X32: subl
; X32: cmov
; X32: lock
; X32: cmpxchgl
-; NOCMOV: cmpl
-; NOCMOV: ja
+; NOCMOV: subl
+; NOCMOV: jb
; NOCMOV: lock
; NOCMOV: cmpxchgl
ret void
@@ -243,6 +258,9 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
}
define void @atomic_fetch_cmpxchg32() nounwind {
+; X64-LABEL: atomic_fetch_cmpxchg32:
+; X32-LABEL: atomic_fetch_cmpxchg32:
+
%t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire acquire
; X64: lock
; X64: cmpxchgl
@@ -254,6 +272,9 @@ define void @atomic_fetch_cmpxchg32() nounwind {
}
define void @atomic_fetch_store32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_store32:
+; X32-LABEL: atomic_fetch_store32:
+
store atomic i32 %x, i32* @sc32 release, align 4
; X64-NOT: lock
; X64: movl
@@ -265,6 +286,9 @@ define void @atomic_fetch_store32(i32 %x) nounwind {
}
define void @atomic_fetch_swap32(i32 %x) nounwind {
+; X64-LABEL: atomic_fetch_swap32:
+; X32-LABEL: atomic_fetch_swap32:
+
%t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
; X64-NOT: lock
; X64: xchgl
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index 4f55edc..11b4e68 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -3,7 +3,8 @@
@sc64 = external global i64
define void @atomic_fetch_add64() nounwind {
-; X64: atomic_fetch_add64
+; X64-LABEL: atomic_fetch_add64:
+; X32-LABEL: atomic_fetch_add64:
entry:
%t1 = atomicrmw add i64* @sc64, i64 1 acquire
; X64: lock
@@ -22,7 +23,8 @@ entry:
}
define void @atomic_fetch_sub64() nounwind {
-; X64: atomic_fetch_sub64
+; X64-LABEL: atomic_fetch_sub64:
+; X32-LABEL: atomic_fetch_sub64:
%t1 = atomicrmw sub i64* @sc64, i64 1 acquire
; X64: lock
; X64: decq
@@ -40,7 +42,8 @@ define void @atomic_fetch_sub64() nounwind {
}
define void @atomic_fetch_and64() nounwind {
-; X64: atomic_fetch_and64
+; X64-LABEL: atomic_fetch_and64:
+; X32-LABEL: atomic_fetch_and64:
%t1 = atomicrmw and i64* @sc64, i64 3 acquire
; X64: lock
; X64: andq $3
@@ -56,7 +59,8 @@ define void @atomic_fetch_and64() nounwind {
}
define void @atomic_fetch_or64() nounwind {
-; X64: atomic_fetch_or64
+; X64-LABEL: atomic_fetch_or64:
+; X32-LABEL: atomic_fetch_or64:
%t1 = atomicrmw or i64* @sc64, i64 3 acquire
; X64: lock
; X64: orq $3
@@ -72,7 +76,8 @@ define void @atomic_fetch_or64() nounwind {
}
define void @atomic_fetch_xor64() nounwind {
-; X64: atomic_fetch_xor64
+; X64-LABEL: atomic_fetch_xor64:
+; X32-LABEL: atomic_fetch_xor64:
%t1 = atomicrmw xor i64* @sc64, i64 3 acquire
; X64: lock
; X64: xorq $3
@@ -88,8 +93,8 @@ define void @atomic_fetch_xor64() nounwind {
}
define void @atomic_fetch_nand64(i64 %x) nounwind {
-; X64: atomic_fetch_nand64
-; X32: atomic_fetch_nand64
+; X64-LABEL: atomic_fetch_nand64:
+; X32-LABEL: atomic_fetch_nand64:
%t1 = atomicrmw nand i64* @sc64, i64 %x acquire
; X64: andq
; X64: notq
@@ -107,8 +112,10 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
}
define void @atomic_fetch_max64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_max64:
+; X32-LABEL: atomic_fetch_max64:
%t1 = atomicrmw max i64* @sc64, i64 %x acquire
-; X64: cmpq
+; X64: subq
; X64: cmov
; X64: lock
; X64: cmpxchgq
@@ -126,8 +133,10 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
}
define void @atomic_fetch_min64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_min64:
+; X32-LABEL: atomic_fetch_min64:
%t1 = atomicrmw min i64* @sc64, i64 %x acquire
-; X64: cmpq
+; X64: subq
; X64: cmov
; X64: lock
; X64: cmpxchgq
@@ -145,8 +154,10 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
}
define void @atomic_fetch_umax64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax64:
+; X32-LABEL: atomic_fetch_umax64:
%t1 = atomicrmw umax i64* @sc64, i64 %x acquire
-; X64: cmpq
+; X64: subq
; X64: cmov
; X64: lock
; X64: cmpxchgq
@@ -164,8 +175,10 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
}
define void @atomic_fetch_umin64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin64:
+; X32-LABEL: atomic_fetch_umin64:
%t1 = atomicrmw umin i64* @sc64, i64 %x acquire
-; X64: cmpq
+; X64: subq
; X64: cmov
; X64: lock
; X64: cmpxchgq
@@ -183,6 +196,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
+; X64-LABEL: atomic_fetch_cmpxchg64:
+; X32-LABEL: atomic_fetch_cmpxchg64:
%t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X64: lock
; X64: cmpxchgq
@@ -194,6 +209,8 @@ define void @atomic_fetch_cmpxchg64() nounwind {
}
define void @atomic_fetch_store64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_store64:
+; X32-LABEL: atomic_fetch_store64:
store atomic i64 %x, i64* @sc64 release, align 8
; X64-NOT: lock
; X64: movq
@@ -205,6 +222,8 @@ define void @atomic_fetch_store64(i64 %x) nounwind {
}
define void @atomic_fetch_swap64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_swap64:
+; X32-LABEL: atomic_fetch_swap64:
%t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
; X64-NOT: lock
; X64: xchgq
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
index c0f7267..1c4b0f4 100644
--- a/test/CodeGen/X86/atomic6432.ll
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -3,7 +3,8 @@
@sc64 = external global i64
define void @atomic_fetch_add64() nounwind {
-; X32: atomic_fetch_add64
+; X64-LABEL: atomic_fetch_add64:
+; X32-LABEL: atomic_fetch_add64:
entry:
%t1 = atomicrmw add i64* @sc64, i64 1 acquire
; X32: addl
@@ -30,20 +31,21 @@ entry:
}
define void @atomic_fetch_sub64() nounwind {
-; X32: atomic_fetch_sub64
+; X64-LABEL: atomic_fetch_sub64:
+; X32-LABEL: atomic_fetch_sub64:
%t1 = atomicrmw sub i64* @sc64, i64 1 acquire
-; X32: subl
-; X32: sbbl
+; X32: addl $-1
+; X32: adcl $-1
; X32: lock
; X32: cmpxchg8b
%t2 = atomicrmw sub i64* @sc64, i64 3 acquire
-; X32: subl
-; X32: sbbl
+; X32: addl $-3
+; X32: adcl $-1
; X32: lock
; X32: cmpxchg8b
%t3 = atomicrmw sub i64* @sc64, i64 5 acquire
-; X32: subl
-; X32: sbbl
+; X32: addl $-5
+; X32: adcl $-1
; X32: lock
; X32: cmpxchg8b
%t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
@@ -56,15 +58,16 @@ define void @atomic_fetch_sub64() nounwind {
}
define void @atomic_fetch_and64() nounwind {
-; X32: atomic_fetch_and64
+; X64-LABEL: atomic_fetch_and64:
+; X32-LABEL: atomic_fetch_and64:
%t1 = atomicrmw and i64* @sc64, i64 3 acquire
-; X32: andl
-; X32: andl
+; X32: andl $3
+; X32-NOT: andl
; X32: lock
; X32: cmpxchg8b
- %t2 = atomicrmw and i64* @sc64, i64 5 acquire
-; X32: andl
-; X32: andl
+ %t2 = atomicrmw and i64* @sc64, i64 4294967297 acquire
+; X32: andl $1
+; X32: andl $1
; X32: lock
; X32: cmpxchg8b
%t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
@@ -77,15 +80,16 @@ define void @atomic_fetch_and64() nounwind {
}
define void @atomic_fetch_or64() nounwind {
-; X32: atomic_fetch_or64
+; X64-LABEL: atomic_fetch_or64:
+; X32-LABEL: atomic_fetch_or64:
%t1 = atomicrmw or i64* @sc64, i64 3 acquire
-; X32: orl
-; X32: orl
+; X32: orl $3
+; X32-NOT: orl
; X32: lock
; X32: cmpxchg8b
- %t2 = atomicrmw or i64* @sc64, i64 5 acquire
-; X32: orl
-; X32: orl
+ %t2 = atomicrmw or i64* @sc64, i64 4294967297 acquire
+; X32: orl $1
+; X32: orl $1
; X32: lock
; X32: cmpxchg8b
%t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
@@ -98,15 +102,16 @@ define void @atomic_fetch_or64() nounwind {
}
define void @atomic_fetch_xor64() nounwind {
-; X32: atomic_fetch_xor64
+; X64-LABEL: atomic_fetch_xor64:
+; X32-LABEL: atomic_fetch_xor64:
%t1 = atomicrmw xor i64* @sc64, i64 3 acquire
; X32: xorl
-; X32: xorl
+; X32-NOT: xorl
; X32: lock
; X32: cmpxchg8b
- %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
-; X32: xorl
-; X32: xorl
+ %t2 = atomicrmw xor i64* @sc64, i64 4294967297 acquire
+; X32: xorl $1
+; X32: xorl $1
; X32: lock
; X32: cmpxchg8b
%t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
@@ -119,7 +124,8 @@ define void @atomic_fetch_xor64() nounwind {
}
define void @atomic_fetch_nand64(i64 %x) nounwind {
-; X32: atomic_fetch_nand64
+; X64-LABEL: atomic_fetch_nand64:
+; X32-LABEL: atomic_fetch_nand64:
%t1 = atomicrmw nand i64* @sc64, i64 %x acquire
; X32: andl
; X32: andl
@@ -132,10 +138,11 @@ define void @atomic_fetch_nand64(i64 %x) nounwind {
}
define void @atomic_fetch_max64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_max64:
+; X32-LABEL: atomic_fetch_max64:
%t1 = atomicrmw max i64* @sc64, i64 %x acquire
-; X32: cmpl
-; X32: cmpl
-; X32: cmov
+; X32: subl
+; X32: subl
; X32: cmov
; X32: cmov
; X32: lock
@@ -145,10 +152,11 @@ define void @atomic_fetch_max64(i64 %x) nounwind {
}
define void @atomic_fetch_min64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_min64:
+; X32-LABEL: atomic_fetch_min64:
%t1 = atomicrmw min i64* @sc64, i64 %x acquire
-; X32: cmpl
-; X32: cmpl
-; X32: cmov
+; X32: subl
+; X32: subl
; X32: cmov
; X32: cmov
; X32: lock
@@ -158,10 +166,11 @@ define void @atomic_fetch_min64(i64 %x) nounwind {
}
define void @atomic_fetch_umax64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax64:
+; X32-LABEL: atomic_fetch_umax64:
%t1 = atomicrmw umax i64* @sc64, i64 %x acquire
-; X32: cmpl
-; X32: cmpl
-; X32: cmov
+; X32: subl
+; X32: subl
; X32: cmov
; X32: cmov
; X32: lock
@@ -171,10 +180,11 @@ define void @atomic_fetch_umax64(i64 %x) nounwind {
}
define void @atomic_fetch_umin64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin64:
+; X32-LABEL: atomic_fetch_umin64:
%t1 = atomicrmw umin i64* @sc64, i64 %x acquire
-; X32: cmpl
-; X32: cmpl
-; X32: cmov
+; X32: subl
+; X32: subl
; X32: cmov
; X32: cmov
; X32: lock
@@ -184,6 +194,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
+; X64-LABEL: atomic_fetch_cmpxchg64:
+; X32-LABEL: atomic_fetch_cmpxchg64:
%t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X32: lock
; X32: cmpxchg8b
@@ -192,6 +204,8 @@ define void @atomic_fetch_cmpxchg64() nounwind {
}
define void @atomic_fetch_store64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_store64:
+; X32-LABEL: atomic_fetch_store64:
store atomic i64 %x, i64* @sc64 release, align 8
; X32: lock
; X32: cmpxchg8b
@@ -200,6 +214,8 @@ define void @atomic_fetch_store64(i64 %x) nounwind {
}
define void @atomic_fetch_swap64(i64 %x) nounwind {
+; X64-LABEL: atomic_fetch_swap64:
+; X32-LABEL: atomic_fetch_swap64:
%t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
; X32: lock
; X32: xchg8b
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 203b26f..5eef9b2 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -4,8 +4,8 @@
@sc8 = external global i8
define void @atomic_fetch_add8() nounwind {
-; X64: atomic_fetch_add8
-; X32: atomic_fetch_add8
+; X64-LABEL: atomic_fetch_add8:
+; X32-LABEL: atomic_fetch_add8:
entry:
; 32-bit
%t1 = atomicrmw add i8* @sc8, i8 1 acquire
@@ -34,8 +34,8 @@ entry:
}
define void @atomic_fetch_sub8() nounwind {
-; X64: atomic_fetch_sub8
-; X32: atomic_fetch_sub8
+; X64-LABEL: atomic_fetch_sub8:
+; X32-LABEL: atomic_fetch_sub8:
%t1 = atomicrmw sub i8* @sc8, i8 1 acquire
; X64: lock
; X64: decb
@@ -62,8 +62,8 @@ define void @atomic_fetch_sub8() nounwind {
}
define void @atomic_fetch_and8() nounwind {
-; X64: atomic_fetch_and8
-; X32: atomic_fetch_and8
+; X64-LABEL: atomic_fetch_and8:
+; X32-LABEL: atomic_fetch_and8:
%t1 = atomicrmw and i8* @sc8, i8 3 acquire
; X64: lock
; X64: andb $3
@@ -87,8 +87,8 @@ define void @atomic_fetch_and8() nounwind {
}
define void @atomic_fetch_or8() nounwind {
-; X64: atomic_fetch_or8
-; X32: atomic_fetch_or8
+; X64-LABEL: atomic_fetch_or8:
+; X32-LABEL: atomic_fetch_or8:
%t1 = atomicrmw or i8* @sc8, i8 3 acquire
; X64: lock
; X64: orb $3
@@ -112,8 +112,8 @@ define void @atomic_fetch_or8() nounwind {
}
define void @atomic_fetch_xor8() nounwind {
-; X64: atomic_fetch_xor8
-; X32: atomic_fetch_xor8
+; X64-LABEL: atomic_fetch_xor8:
+; X32-LABEL: atomic_fetch_xor8:
%t1 = atomicrmw xor i8* @sc8, i8 3 acquire
; X64: lock
; X64: xorb $3
@@ -137,8 +137,8 @@ define void @atomic_fetch_xor8() nounwind {
}
define void @atomic_fetch_nand8(i8 %x) nounwind {
-; X64: atomic_fetch_nand8
-; X32: atomic_fetch_nand8
+; X64-LABEL: atomic_fetch_nand8:
+; X32-LABEL: atomic_fetch_nand8:
%t1 = atomicrmw nand i8* @sc8, i8 %x acquire
; X64: andb
; X64: notb
@@ -154,14 +154,18 @@ define void @atomic_fetch_nand8(i8 %x) nounwind {
}
define void @atomic_fetch_max8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_max8:
+; X32-LABEL: atomic_fetch_max8:
%t1 = atomicrmw max i8* @sc8, i8 %x acquire
-; X64: cmpb
-; X64: cmov
+; X64: movsbl
+; X64: movsbl
+; X64: subl
; X64: lock
; X64: cmpxchgb
-; X32: cmpb
-; X32: cmov
+; X32: movsbl
+; X32: movsbl
+; X32: subl
; X32: lock
; X32: cmpxchgb
ret void
@@ -170,14 +174,18 @@ define void @atomic_fetch_max8(i8 %x) nounwind {
}
define void @atomic_fetch_min8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_min8:
+; X32-LABEL: atomic_fetch_min8:
%t1 = atomicrmw min i8* @sc8, i8 %x acquire
-; X64: cmpb
-; X64: cmov
+; X64: movsbl
+; X64: movsbl
+; X64: subl
; X64: lock
; X64: cmpxchgb
-; X32: cmpb
-; X32: cmov
+; X32: movsbl
+; X32: movsbl
+; X32: subl
; X32: lock
; X32: cmpxchgb
ret void
@@ -186,14 +194,18 @@ define void @atomic_fetch_min8(i8 %x) nounwind {
}
define void @atomic_fetch_umax8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax8:
+; X32-LABEL: atomic_fetch_umax8:
%t1 = atomicrmw umax i8* @sc8, i8 %x acquire
-; X64: cmpb
-; X64: cmov
+; X64: movzbl
+; X64: movzbl
+; X64: subl
; X64: lock
; X64: cmpxchgb
-; X32: cmpb
-; X32: cmov
+; X32: movzbl
+; X32: movzbl
+; X32: subl
; X32: lock
; X32: cmpxchgb
ret void
@@ -202,13 +214,18 @@ define void @atomic_fetch_umax8(i8 %x) nounwind {
}
define void @atomic_fetch_umin8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin8:
+; X32-LABEL: atomic_fetch_umin8:
%t1 = atomicrmw umin i8* @sc8, i8 %x acquire
-; X64: cmpb
-; X64: cmov
+; X64: movzbl
+; X64: movzbl
+; X64: subl
; X64: lock
; X64: cmpxchgb
-; X32: cmpb
-; X32: cmov
+
+; X32: movzbl
+; X32: movzbl
+; X32: subl
; X32: lock
; X32: cmpxchgb
ret void
@@ -217,6 +234,8 @@ define void @atomic_fetch_umin8(i8 %x) nounwind {
}
define void @atomic_fetch_cmpxchg8() nounwind {
+; X64-LABEL: atomic_fetch_cmpxchg8:
+; X32-LABEL: atomic_fetch_cmpxchg8:
%t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire acquire
; X64: lock
; X64: cmpxchgb
@@ -228,6 +247,8 @@ define void @atomic_fetch_cmpxchg8() nounwind {
}
define void @atomic_fetch_store8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_store8:
+; X32-LABEL: atomic_fetch_store8:
store atomic i8 %x, i8* @sc8 release, align 4
; X64-NOT: lock
; X64: movb
@@ -239,6 +260,8 @@ define void @atomic_fetch_store8(i8 %x) nounwind {
}
define void @atomic_fetch_swap8(i8 %x) nounwind {
+; X64-LABEL: atomic_fetch_swap8:
+; X32-LABEL: atomic_fetch_swap8:
%t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
; X64-NOT: lock
; X64: xchgb
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index b3045ed..d0ab28a 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov,cx16 -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -101,26 +101,28 @@ entry:
%neg1 = sub i32 0, 10 ; <i32> [#uses=1]
; CHECK: lock
; CHECK: cmpxchgl
- %16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic monotonic
+ %pair16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic monotonic
+ %16 = extractvalue { i32, i1 } %pair16, 0
store i32 %16, i32* %old
; CHECK: lock
; CHECK: cmpxchgl
- %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic monotonic
+ %pair17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic monotonic
+ %17 = extractvalue { i32, i1 } %pair17, 0
store i32 %17, i32* %old
; CHECK: movl [[R17atomic:.*]], %eax
- ; CHECK: movl $1401, %[[R17mask:[a-z]*]]
- ; CHECK: andl %eax, %[[R17mask]]
- ; CHECK: notl %[[R17mask]]
+ ; CHECK: movl %eax, %[[R17mask:[a-z]*]]
+ ; CHECK: notl %[[R17mask]]
+ ; CHECK: orl $-1402, %[[R17mask]]
; CHECK: lock
; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]]
; CHECK: jne
; CHECK: movl %eax,
%18 = atomicrmw nand i32* %val2, i32 1401 monotonic
store i32 %18, i32* %old
- ; CHECK: andl
- ; CHECK: andl
; CHECK: notl
; CHECK: notl
+ ; CHECK: orl $252645135
+ ; CHECK: orl $252645135
; CHECK: lock
; CHECK: cmpxchg8b
%19 = atomicrmw nand i64* %temp64, i64 17361641481138401520 monotonic
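 ; The nand expansion uses De Morgan's law: ~(x & C) == ~x | ~C. The i32 case
 ; above therefore checks notl followed by orl $-1402 (~1401 == -1402), and the
 ; i64 case checks two notl/orl $252645135 pairs, one per 32-bit half of the
 ; complement of 0xF0F0F0F0F0F0F0F0 (17361641481138401520).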
@@ -133,6 +135,7 @@ entry:
; CHECK: lock
; CHECK: cmpxchgl %{{.*}}, %gs:(%{{.*}})
- %0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic monotonic
+ %pair0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic monotonic
+ %0 = extractvalue { i32, i1 } %pair0, 0
ret void
}
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index e21c7a0..d2a22d7 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -110,7 +110,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
;CHECK-LABEL: vsel_double4:
;CHECK-NOT: vinsertf128
-;CHECK: vshufpd $10
+;CHECK: vblendpd $10
;CHECK-NEXT: ret
define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -158,3 +158,45 @@ define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd)
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+;; 4 tests for shufflevectors that optimize to blend + immediate
+; CHECK-LABEL: @blend_shufflevector_4xfloat
+define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
+; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>.
+; Written with element 0 in the lowest bit, that is 0101 = 5, where '1' means
+; take the first argument and '0' means take the second argument.
+; This is the opposite of the Intel encoding, so we expect the
+; inverted mask: 1010 = 10.
+; According to the ABI:
+; a is in xmm0 => first argument is xmm0.
+; b is in xmm1 => second argument is xmm1.
+; Result is in xmm0 => destination argument.
+; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
+; CHECK: ret
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %1
+}
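+
+; The immediates in the remaining three tests follow the same inverted-mask
+; rule: blend_shufflevector_8xfloat takes lanes 1-5 and 7 from %b, giving
+; 0b10111110 = 190; blend_shufflevector_4xdouble takes only lane 1 from %b,
+; giving 0b0010 = 2; blend_shufflevector_4xi64 takes lanes 0, 2 and 3 from %b,
+; giving 0b1101 = 13.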
+
+; CHECK-LABEL: @blend_shufflevector_8xfloat
+define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
+; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0
+; CHECK: ret
+ %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
+ ret <8 x float> %1
+}
+
+; CHECK-LABEL: @blend_shufflevector_4xdouble
+define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
+; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0
+; CHECK: ret
+ %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x double> %1
+}
+
+; CHECK-LABEL: @blend_shufflevector_4xi64
+define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0
+; CHECK: ret
+ %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x i64> %1
+}
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 7337815..3e051bf 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -7,21 +7,21 @@ declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)
-; WIN64: testf16_inp
+; WIN64-LABEL: testf16_inp
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret
-; X32: testf16_inp
+; X32-LABEL: testf16_inp
; X32: movl %eax, (%esp)
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: call
; X32: ret
-; X64: testf16_inp
+; X64-LABEL: testf16_inp
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: vaddps {{.*}}, {{%ymm[0-1]}}
; X64: leaq {{.*}}(%rsp), %rdi
@@ -41,14 +41,14 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
;test calling conventions - preserved registers
; preserved ymm6-ymm15
-; WIN64: testf16_regs
+; WIN64-LABEL: testf16_regs
; WIN64: call
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: ret
; preserved ymm8-ymm15
-; X64: testf16_regs
+; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
@@ -65,28 +65,30 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
}
; test calling conventions - prolog and epilog
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
-; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64-LABEL: test_prolog_epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill
; WIN64: call
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
-
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+
+; X64-LABEL: test_prolog_epilog
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill
@@ -111,12 +113,14 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; test functions with integer parameters
; pass parameters on stack for 32-bit platform
+; X32-LABEL: test_int
; X32: movl {{.*}}, 4(%esp)
; X32: movl {{.*}}, (%esp)
; X32: call
; X32: addl {{.*}}, %eax
; pass parameters in registers for 64-bit platform
+; X64-LABEL: test_int
; X64: leal {{.*}}, %edi
; X64: movl {{.*}}, %esi
; X64: call
@@ -128,21 +132,21 @@ define i32 @test_int(i32 %a, i32 %b) nounwind {
ret i32 %c
}
-; WIN64: test_float4
+; WIN64-LABEL: test_float4
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64-NOT: vzeroupper
; WIN64: call
; WIN64: ret
-; X64: test_float4
+; X64-LABEL: test_float4
; X64-NOT: vzeroupper
; X64: call
; X64-NOT: vzeroupper
; X64: call
; X64: ret
-; X32: test_float4
+; X32-LABEL: test_float4
; X32: vzeroupper
; X32: call
; X32: vzeroupper
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 0be83f6..ce31161 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2219,14 +2219,6 @@ define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
-define <4 x double> @test_x86_avx_vbroadcast_sd_256(i8* %a0) {
- ; CHECK: vbroadcastsd
- %res = call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*) nounwind readonly
-
-
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; CHECK: vbroadcastf128
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
@@ -2243,22 +2235,6 @@ define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
-define <4 x float> @test_x86_avx_vbroadcast_ss(i8* %a0) {
- ; CHECK: vbroadcastss
- %res = call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %a0) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*) nounwind readonly
-
-
-define <8 x float> @test_x86_avx_vbroadcast_ss_256(i8* %a0) {
- ; CHECK: vbroadcastss
- %res = call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %a0) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*) nounwind readonly
-
-
define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) {
; CHECK: vextractf128
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index f407ba4..4a996d7 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -5,8 +5,10 @@ define <4 x float> @test1(<4 x float> %a) nounwind {
%b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
ret <4 x float> %b
; CHECK-LABEL: test1:
-; CHECK: vshufps
-; CHECK: vpshufd
+;; TODO: This test could be improved by removing the xor instruction and
+;; having vinsertps zero out the needed elements.
+; CHECK: vxorps
+; CHECK: vinsertps
}
; rdar://10538417
@@ -23,7 +25,7 @@ define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
ret <4 x i64> %c
; CHECK-LABEL: test3:
-; CHECK: vperm2f128
+; CHECK: vblendpd
; CHECK: ret
}
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 5d07815..b1b2f8b 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -43,13 +43,10 @@ entry:
ret <4 x double> %vecinit6.i
}
-; Test this simple opt:
+; Test that this turns into a broadcast:
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
-; To:
-; shuffle (vload ptr)), undef, <1, 1, 1, 1>
-; CHECK: vmovdqa
-; CHECK-NEXT: vpshufd $-1
-; CHECK-NEXT: vinsertf128 $1
+;
+; CHECK: vbroadcastss
define <8 x float> @funcE() nounwind {
allocas:
%udx495 = alloca [18 x [18 x float]], align 32
diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll
index caa21e5..c20775b 100644
--- a/test/CodeGen/X86/avx-vperm2f128.ll
+++ b/test/CodeGen/X86/avx-vperm2f128.ll
@@ -9,7 +9,7 @@ entry:
}
; CHECK: _B
-; CHECK: vperm2f128 $48
+; CHECK: vblendps $240
define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll
index 45883b7..ad3dbc1 100644
--- a/test/CodeGen/X86/avx-vshufp.ll
+++ b/test/CodeGen/X86/avx-vshufp.ll
@@ -32,14 +32,14 @@ entry:
ret <8 x i32> %shuffle
}
-; CHECK: vshufpd $10, %ymm
+; CHECK: vblendpd $10, %ymm
define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
ret <4 x double> %shuffle
}
-; CHECK: vshufpd $10, (%{{.*}}), %ymm
+; CHECK: vblendpd $10, (%{{.*}}), %ymm
define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp {
entry:
%a2 = load <4 x double>* %a
@@ -48,14 +48,14 @@ entry:
ret <4 x double> %shuffle
}
-; CHECK: vshufpd $10, %ymm
+; CHECK: vblendpd $10, %ymm
define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
ret <4 x i64> %shuffle
}
-; CHECK: vshufpd $10, (%{{.*}}), %ymm
+; CHECK: vblendpd $10, (%{{.*}}), %ymm
define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp {
entry:
%a2 = load <4 x i64>* %a
@@ -71,7 +71,7 @@ entry:
ret <8 x float> %shuffle
}
-; CHECK: vshufpd $2, %ymm
+; CHECK: vblendpd $2, %ymm
define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index 0e6dd29..185b989 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -60,6 +60,24 @@ define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline
ret <4 x i64> %t
}
+;; 2 tests for shufflevectors that optimize to blend + immediate
+; CHECK-LABEL: @blend_test5
+; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0
+; CHECK: ret
+define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) {
+ %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x i32> %1
+}
+
+; CHECK-LABEL: @blend_test6
+; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0
+; CHECK: ret
+define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) {
+ %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 23,
+ i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31>
+ ret <16 x i16> %1
+}
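+
+; The vpblendw immediate is replicated across each 128-bit lane, so a single
+; 8-bit mask covers all 16 words: in both halves of blend_test6 the words at
+; positions 1, 2 and 7 come from %b, giving 0b10000110 = 134. Likewise
+; blend_test5 takes lanes 1 and 3 from %b, giving 0b1010 = 10 for vpblendd.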
+
; CHECK: vpshufhw $27, %ymm
define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
entry:
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 2476ea1..f5cda96 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -192,6 +192,14 @@ define <16 x double> @uitof64(<16 x i32> %a) nounwind {
ret <16 x double> %b
}
+; CHECK-LABEL: uitof64_256
+; CHECK: vcvtudq2pd
+; CHECK: ret
+define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
+ %b = uitofp <4 x i32> %a to <4 x double>
+ ret <4 x double> %b
+}
+
; CHECK-LABEL: uitof32
; CHECK: vcvtudq2ps
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512-inc-dec.ll b/test/CodeGen/X86/avx512-inc-dec.ll
new file mode 100644
index 0000000..f04ca87
--- /dev/null
+++ b/test/CodeGen/X86/avx512-inc-dec.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK-LABEL: test
+;CHECK-NOT: dec
+;CHECK-NOT: inc
+;CHECK: ret
+define i32 @test(i32 %a, i32 %b) {
+ %a1 = add i32 %a, -1
+ %b1 = add i32 %b, 1
+ %res = mul i32 %a1, %b1
+ ret i32 %res
+}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index e19841a..18cfcfe 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -311,7 +311,6 @@ define <8 x i64> @test_conflict_q(<8 x i64> %a) {
declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
-
define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
; CHECK: vpconflictd
%res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
@@ -324,6 +323,57 @@ define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
ret <8 x i64> %res
}
+define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
+ ; CHECK: movw $-1, %ax
+ ; CHECK: vpxor
+ ; CHECK: vplzcntd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
+ ; CHECK: movb $-1, %al
+ ; CHECK: vpxor
+ ; CHECK: vplzcntq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+
+define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+ ; CHECK: vplzcntd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+ ; CHECK: vplzcntq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_ctlz_d(<16 x i32> %a) {
+ ; CHECK-LABEL: test_ctlz_d
+ ; CHECK: vplzcntd
+ %res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) nounwind readonly
+
+define <8 x i64> @test_ctlz_q(<8 x i64> %a) {
+ ; CHECK-LABEL: test_ctlz_q
+ ; CHECK: vplzcntq
+ %res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) nounwind readonly
+
define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK: vblendmps
%res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
@@ -544,4 +594,20 @@ define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%p
ret <16 x float> %res
}
+define <16 x float> @test_vpermt2ps_mask(<16 x float>%x, <16 x float>%y, <16 x i32>%perm, i16 %mask) {
+; CHECK-LABEL: test_vpermt2ps_mask:
+; CHECK: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x49,0x7f,0xc1]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 %mask)
+ ret <16 x float> %res
+}
+
declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+define <8 x i64> @test_vmovntdqa(i8 *%x) {
+; CHECK-LABEL: test_vmovntdqa:
+; CHECK: vmovntdqa (%rdi), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x2a,0x07]
+ %res = call <8 x i64> @llvm.x86.avx512.movntdqa(i8* %x)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
new file mode 100644
index 0000000..ef50cdb
--- /dev/null
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx512f | FileCheck %s
+
+define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, i32 %D, <8 x i64> %E, <8 x i64> %EE) {
+; CHECK: vmovntps %z
+ %cast = bitcast i8* %B to <16 x float>*
+ %A2 = fadd <16 x float> %A, %AA
+ store <16 x float> %A2, <16 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast1 = bitcast i8* %B to <8 x i64>*
+ %E2 = add <8 x i64> %E, %EE
+ store <8 x i64> %E2, <8 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %z
+ %cast2 = bitcast i8* %B to <8 x double>*
+ %C2 = fadd <8 x double> %C, %CC
+ store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
+ ret void
+}
+
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 23ddc3a..b99e89a 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -56,6 +56,16 @@ define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
ret <8 x double> %c
}
+; The reg variant of vpermt2 with a writemask
+; CHECK-LABEL: test5m:
+; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
+define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+ %m = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
; CHECK-LABEL: test6:
; CHECK: vpermq $30
; CHECK: ret
@@ -72,6 +82,27 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
ret <8 x i64> %c
}
+; The reg variant of vpermt2 with a writemask
+; CHECK-LABEL: test7m:
+; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
+define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
+ %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+ %m = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+; The mem variant of vpermt2 with a writemask
+; CHECK-LABEL: test7mm:
+; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
+define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
+ %b = load <8 x i64>* %pb
+ %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+ %m = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
; CHECK-LABEL: test8:
; CHECK: vpermt2d
; CHECK: ret
@@ -80,6 +111,27 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
ret <16 x i32> %c
}
+; The reg variant of vpermt2 with a writemask
+; CHECK-LABEL: test8m:
+; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
+define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ %m = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+; The mem variant of vpermt2 with a writemask
+; CHECK-LABEL: test8mm:
+; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
+define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
+ %b = load <16 x i32> * %pb
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ %m = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
; CHECK-LABEL: test9:
; CHECK: vpermt2ps
; CHECK: ret
@@ -88,6 +140,16 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
ret <16 x float> %c
}
+; The reg variant of vpermt2 with a writemask
+; CHECK-LABEL: test9m:
+; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
+define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
+ %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ %m = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
; CHECK-LABEL: test10:
; CHECK: vpermt2ps (
; CHECK: ret
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 3c931db..9dc960d 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
+; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s -check-prefix=CHECK-WIDE-AVX2
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -31,6 +32,10 @@ entry:
; CHECK-AVX2-LABEL: @test1
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test1
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
define <4 x i32> @test2(<4 x i32> %v) #0 {
@@ -52,6 +57,10 @@ entry:
; CHECK-AVX2-LABEL: @test2
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test2
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
define <2 x i64> @test3(<2 x i64> %v) #0 {
@@ -71,6 +80,10 @@ entry:
; CHECK-AVX2-LABEL: @test3
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test3
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
@@ -90,6 +103,10 @@ entry:
; CHECK-AVX2-LABEL: @test4
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test4
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
define <8 x i32> @test5(<8 x i32> %v) #0 {
@@ -105,6 +122,10 @@ entry:
; CHECK-AVX2-LABEL: @test5
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test5
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
define <4 x i64> @test6(<4 x i64> %v) #0 {
@@ -120,6 +141,10 @@ entry:
; CHECK-AVX2-LABEL: @test6
; CHECK-AVX2: vpshufb
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test6
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
@@ -138,6 +163,10 @@ entry:
; CHECK-AVX2: vpshufb
; CHECK-AVX2: vpsrld $16
; CHECK-AVX2-NEXT: retq
+
+; CHECK-WIDE-AVX2-LABEL: @test7
+; CHECK-WIDE-AVX2: vpshufb
+; CHECK-WIDE-AVX2-NEXT: retq
}
attributes #0 = { nounwind uwtable }
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index cdcdc96..149d537 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -198,3 +198,16 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) #0 {
; CHECK: shrl $7, %edi
; CHECK-NEXT: cmovnsl %edx, %esi
}
+
+; PR19964
+define zeroext i1 @test15(i32 %bf.load, i32 %n) {
+ %bf.lshr = lshr i32 %bf.load, 16
+ %cmp2 = icmp eq i32 %bf.lshr, 0
+ %cmp5 = icmp uge i32 %bf.lshr, %n
+ %.cmp5 = or i1 %cmp2, %cmp5
+ ret i1 %.cmp5
+
+; CHECK-LABEL: test15:
+; CHECK: shrl $16, %edi
+; CHECK: cmpl %esi, %edi
+}
diff --git a/test/CodeGen/X86/cmpxchg-i1.ll b/test/CodeGen/X86/cmpxchg-i1.ll
new file mode 100644
index 0000000..a21ab59
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-i1.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=x86_64 -o - %s | FileCheck %s
+
+define i1 @try_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: try_cmpxchg:
+; CHECK: cmpxchgl
+; CHECK-NOT: cmp
+; CHECK: sete %al
+; CHECK: retq
+ %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ ret i1 %success
+}
+
+define void @cmpxchg_flow(i64* %addr, i64 %desired, i64 %new) {
+; CHECK-LABEL: cmpxchg_flow:
+; CHECK: cmpxchgq
+; CHECK-NOT: cmp
+; CHECK-NOT: set
+; CHECK: {{jne|jeq}}
+ %pair = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst seq_cst
+ %success = extractvalue { i64, i1 } %pair, 1
+ br i1 %success, label %true, label %false
+
+true:
+ call void @foo()
+ ret void
+
+false:
+ call void @bar()
+ ret void
+}
+
+define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: cmpxchg_sext:
+; CHECK-DAG: cmpxchgl
+; CHECK-NOT: cmpl
+; CHECK: sete %al
+; CHECK: retq
+ %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ %mask = sext i1 %success to i64
+ ret i64 %mask
+}
+
+define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: cmpxchg_zext:
+; CHECK: cmpxchgl
+; CHECK-NOT: cmp
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
+ %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %success = extractvalue { i32, i1 } %pair, 1
+ %mask = zext i1 %success to i32
+ ret i32 %mask
+}
+
+
+define i32 @cmpxchg_use_eflags_and_val(i32* %addr, i32 %offset) {
+; CHECK-LABEL: cmpxchg_use_eflags_and_val:
+; CHECK: movl (%rdi), %e[[OLDVAL:[a-z0-9]+]]
+
+; CHECK: [[LOOPBB:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: leal (%r[[OLDVAL]],%rsi), [[NEW:%[a-z0-9]+]]
+; CHECK: cmpxchgl [[NEW]], (%rdi)
+; CHECK-NOT: cmpl
+; CHECK: jne [[LOOPBB]]
+
+ ; Result already in %eax
+; CHECK: retq
+entry:
+ %init = load atomic i32* %addr seq_cst, align 4
+ br label %loop
+
+loop:
+ %old = phi i32 [%init, %entry], [%oldval, %loop]
+ %new = add i32 %old, %offset
+ %pair = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst seq_cst
+ %oldval = extractvalue { i32, i1 } %pair, 0
+ %success = extractvalue { i32, i1 } %pair, 1
+ br i1 %success, label %done, label %loop
+
+done:
+ ret i32 %oldval
+}
+
+declare void @foo()
+declare void @bar()
diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll
new file mode 100644
index 0000000..4dd3001
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mcpu=core-avx2 -mtriple=x86_64 -o - %s | FileCheck %s
+
+define i1 @try_cmpxchg(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: try_cmpxchg:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmp
+; CHECK: sete %al
+; CHECK: retq
+ %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+ %success = extractvalue { i128, i1 } %pair, 1
+ ret i1 %success
+}
+
+define void @cmpxchg_flow(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_flow:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmp
+; CHECK-NOT: set
+; CHECK: {{jne|jeq}}
+ %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+ %success = extractvalue { i128, i1 } %pair, 1
+ br i1 %success, label %true, label %false
+
+true:
+ call void @foo()
+ ret void
+
+false:
+ call void @bar()
+ ret void
+}
+
+; Can't use the flags here because cmpxchg16b only sets ZF.
+define i1 @cmpxchg_arithcmp(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_arithcmp:
+; CHECK: cmpxchg16b
+; CHECK: cmpq
+; CHECK: retq
+ %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+ %oldval = extractvalue { i128, i1 } %pair, 0
+ %success = icmp sge i128 %oldval, %desired
+ ret i1 %success
+}
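+
+; An equality test on the old value (as in cmpxchg_zext below) can reuse ZF
+; via sete, but the signed comparison above needs the sign and overflow flags,
+; hence the separate cmpq.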
+
+define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_zext:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmpq
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
+ %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+ %success = extractvalue { i128, i1 } %pair, 1
+ %mask = zext i1 %success to i128
+ ret i128 %mask
+}
+
+
+define i128 @cmpxchg_use_eflags_and_val(i128* %addr, i128 %offset) {
+; CHECK-LABEL: cmpxchg_use_eflags_and_val:
+
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmpq
+; CHECK: jne
+entry:
+ %init = load atomic i128* %addr seq_cst, align 16
+ br label %loop
+
+loop:
+ %old = phi i128 [%init, %entry], [%oldval, %loop]
+ %new = add i128 %old, %offset
+
+ %pair = cmpxchg i128* %addr, i128 %old, i128 %new seq_cst seq_cst
+ %oldval = extractvalue { i128, i1 } %pair, 0
+ %success = extractvalue { i128, i1 } %pair, 1
+
+ br i1 %success, label %done, label %loop
+
+done:
+ ret i128 %old
+}
+
+declare void @foo()
+declare void @bar()
diff --git a/test/CodeGen/X86/coalescer-remat.ll b/test/CodeGen/X86/coalescer-remat.ll
index 468b70b..bb08a0e 100644
--- a/test/CodeGen/X86/coalescer-remat.ll
+++ b/test/CodeGen/X86/coalescer-remat.ll
@@ -5,7 +5,8 @@
define i32 @main() nounwind {
entry:
- %0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
+ %t0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
+ %0 = extractvalue { i64, i1 } %t0, 0
%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
ret i32 0
}
diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll
new file mode 100644
index 0000000..bf27b2f
--- /dev/null
+++ b/test/CodeGen/X86/coff-comdat.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
+
+$f1 = comdat any
+@v1 = global i32 0, comdat $f1
+define void @f1() comdat $f1 {
+ ret void
+}
+
+$f2 = comdat exactmatch
+@v2 = global i32 0, comdat $f2
+define void @f2() comdat $f2 {
+ ret void
+}
+
+$f3 = comdat largest
+@v3 = global i32 0, comdat $f3
+define void @f3() comdat $f3 {
+ ret void
+}
+
+$f4 = comdat noduplicates
+@v4 = global i32 0, comdat $f4
+define void @f4() comdat $f4 {
+ ret void
+}
+
+$f5 = comdat samesize
+@v5 = global i32 0, comdat $f5
+define void @f5() comdat $f5 {
+ ret void
+}
+
+$f6 = comdat samesize
+@v6 = global i32 0, comdat $f6
+@f6 = global i32 0, comdat $f6
+
+$"\01@f7@0" = comdat any
+define x86_fastcallcc void @"\01@v7@0"() comdat $"\01@f7@0" {
+ ret void
+}
+define x86_fastcallcc void @"\01@f7@0"() comdat $"\01@f7@0" {
+ ret void
+}
+
+$f8 = comdat any
+define x86_fastcallcc void @v8() comdat $f8 {
+ ret void
+}
+define x86_fastcallcc void @f8() comdat $f8 {
+ ret void
+}
+
+$vftable = comdat largest
+
+@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat $vftable
+@vftable = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1)
+
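+; Each IR comdat selection kind should map to the matching COFF section
+; selection below: any -> discard, exactmatch -> same_contents,
+; largest -> largest, noduplicates -> one_only, samesize -> same_size.
+; Globals keyed on another symbol's comdat come out as 'associative'.
+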
+; CHECK: .section .text,"xr",discard,_f1
+; CHECK: .globl _f1
+; CHECK: .section .text,"xr",same_contents,_f2
+; CHECK: .globl _f2
+; CHECK: .section .text,"xr",largest,_f3
+; CHECK: .globl _f3
+; CHECK: .section .text,"xr",one_only,_f4
+; CHECK: .globl _f4
+; CHECK: .section .text,"xr",same_size,_f5
+; CHECK: .globl _f5
+; CHECK: .section .text,"xr",associative,@f7@0
+; CHECK: .globl @v7@0
+; CHECK: .section .text,"xr",discard,@f7@0
+; CHECK: .globl @f7@0
+; CHECK: .section .text,"xr",associative,@f8@0
+; CHECK: .globl @v8@0
+; CHECK: .section .text,"xr",discard,@f8@0
+; CHECK: .globl @f8@0
+; CHECK: .section .bss,"bw",associative,_f1
+; CHECK: .globl _v1
+; CHECK: .section .bss,"bw",associative,_f2
+; CHECK: .globl _v2
+; CHECK: .section .bss,"bw",associative,_f3
+; CHECK: .globl _v3
+; CHECK: .section .bss,"bw",associative,_f4
+; CHECK: .globl _v4
+; CHECK: .section .bss,"bw",associative,_f5
+; CHECK: .globl _v5
+; CHECK: .section .bss,"bw",associative,_f6
+; CHECK: .globl _v6
+; CHECK: .section .bss,"bw",same_size,_f6
+; CHECK: .globl _f6
+; CHECK: .section .rdata,"rd",largest,_vftable
+; CHECK: .globl _vftable
+; CHECK: _vftable = L_some_name+4
diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll
new file mode 100644
index 0000000..6744b5b
--- /dev/null
+++ b/test/CodeGen/X86/coff-comdat2.ll
@@ -0,0 +1,9 @@
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat largest
+@foo = global i32 0
+@bar = global i32 0, comdat $foo
+; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT.
diff --git a/test/CodeGen/X86/coff-comdat3.ll b/test/CodeGen/X86/coff-comdat3.ll
new file mode 100644
index 0000000..76e464b
--- /dev/null
+++ b/test/CodeGen/X86/coff-comdat3.ll
@@ -0,0 +1,8 @@
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat largest
+@bar = global i32 0, comdat $foo
+; CHECK: Associative COMDAT symbol 'foo' does not exist.
diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
new file mode 100644
index 0000000..8440fda
--- /dev/null
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -0,0 +1,273 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+define double @test1_add(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %add = add <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %add to double
+ ret double %3
+}
+; CHECK-LABEL: test1_add
+; SSE41: paddd
+; AVX: vpaddd
+; CHECK-NEXT: ret
+
+
+define double @test2_add(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %add = add <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %add to double
+ ret double %3
+}
+; CHECK-LABEL: test2_add
+; SSE41: paddw
+; AVX: vpaddw
+; CHECK-NEXT: ret
+
+define double @test3_add(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %add = add <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %add to double
+ ret double %3
+}
+; CHECK-LABEL: test3_add
+; SSE41: paddb
+; AVX: vpaddb
+; CHECK-NEXT: ret
+
+
+define double @test1_sub(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %sub = sub <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %sub to double
+ ret double %3
+}
+; CHECK-LABEL: test1_sub
+; SSE41: psubd
+; AVX: vpsubd
+; CHECK-NEXT: ret
+
+
+define double @test2_sub(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %sub = sub <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %sub to double
+ ret double %3
+}
+; CHECK-LABEL: test2_sub
+; SSE41: psubw
+; AVX: vpsubw
+; CHECK-NEXT: ret
+
+
+define double @test3_sub(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %sub = sub <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %sub to double
+ ret double %3
+}
+; CHECK-LABEL: test3_sub
+; SSE41: psubb
+; AVX: vpsubb
+; CHECK-NEXT: ret
+
+
+define double @test1_mul(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %mul = mul <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %mul to double
+ ret double %3
+}
+; CHECK-LABEL: test1_mul
+; SSE41: pmulld
+; AVX: vpmulld
+; CHECK-NEXT: ret
+
+
+define double @test2_mul(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %mul = mul <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %mul to double
+ ret double %3
+}
+; CHECK-LABEL: test2_mul
+; SSE41: pmullw
+; AVX: vpmullw
+; CHECK-NEXT: ret
+
+; There is no legal ISD::MUL with type MVT::v8i8.
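+; Instead the backend is expected to use a 16-bit multiply (pmullw) and pack
+; the low bytes back together with pshufb, as the checks below require.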
+define double @test3_mul(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %mul = mul <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %mul to double
+ ret double %3
+}
+; CHECK-LABEL: test3_mul
+; CHECK: pmullw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test1_and(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %and = and <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %and to double
+ ret double %3
+}
+; CHECK-LABEL: test1_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test2_and(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %and = and <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %and to double
+ ret double %3
+}
+; CHECK-LABEL: test2_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test3_and(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %and = and <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %and to double
+ ret double %3
+}
+; CHECK-LABEL: test3_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test1_or(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %or = or <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %or to double
+ ret double %3
+}
+; CHECK-LABEL: test1_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test2_or(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %or = or <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %or to double
+ ret double %3
+}
+; CHECK-LABEL: test2_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test3_or(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %or = or <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %or to double
+ ret double %3
+}
+; CHECK-LABEL: test3_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test1_xor(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %xor = xor <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %xor to double
+ ret double %3
+}
+; CHECK-LABEL: test1_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test2_xor(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %xor = xor <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %xor to double
+ ret double %3
+}
+; CHECK-LABEL: test2_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test3_xor(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %xor = xor <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %xor to double
+ ret double %3
+}
+; CHECK-LABEL: test3_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test_fadd(double %A, double %B) {
+ %1 = bitcast double %A to <2 x float>
+ %2 = bitcast double %B to <2 x float>
+ %add = fadd <2 x float> %1, %2
+ %3 = bitcast <2 x float> %add to double
+ ret double %3
+}
+; CHECK-LABEL: test_fadd
+; SSE41: addps
+; AVX: vaddps
+; CHECK-NEXT: ret
+
+define double @test_fsub(double %A, double %B) {
+ %1 = bitcast double %A to <2 x float>
+ %2 = bitcast double %B to <2 x float>
+ %sub = fsub <2 x float> %1, %2
+ %3 = bitcast <2 x float> %sub to double
+ ret double %3
+}
+; CHECK-LABEL: test_fsub
+; SSE41: subps
+; AVX: vsubps
+; CHECK-NEXT: ret
+
+define double @test_fmul(double %A, double %B) {
+ %1 = bitcast double %A to <2 x float>
+ %2 = bitcast double %B to <2 x float>
+ %mul = fmul <2 x float> %1, %2
+ %3 = bitcast <2 x float> %mul to double
+ ret double %3
+}
+; CHECK-LABEL: test_fmul
+; SSE41: mulps
+; AVX: vmulps
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index c1ce533..ff807b9 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -25,7 +25,7 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
}
; CHECK-LABEL: test2
; CHECK-NOT: xorps
-; CHECK: shufps
+; CHECK: movsd
; CHECK: ret
@@ -74,7 +74,7 @@ define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
}
; CHECK-LABEL: test6
; CHECK-NOT: xorps
-; CHECK: shufps
+; CHECK: blendps $12
; CHECK-NEXT: ret
@@ -86,7 +86,7 @@ define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
}
; CHECK-LABEL: test7
; CHECK-NOT: xorps
-; CHECK: shufps
+; CHECK: blendps $12
; CHECK-NEXT: ret
@@ -111,7 +111,7 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
}
; CHECK-LABEL: test9
; CHECK-NOT: xorps
-; CHECK: shufps
+; CHECK: movsd
; CHECK: ret
diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll
new file mode 100644
index 0000000..7ab7f80
--- /dev/null
+++ b/test/CodeGen/X86/combine-vec-shuffle-2.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Check that DAGCombiner correctly folds the following pairs of shuffles
+; according to these rules:
+; 1. shuffle(shuffle(x, y), undef) -> x
+; 2. shuffle(shuffle(x, y), undef) -> y
+; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
+; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
+;
+; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
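+;
+; For reference, the pshufd immediate below encodes the result mask two bits
+; per element, element 0 in the low bits: mask [3,0,0,1] in test1 is
+; 3 | (0<<2) | (0<<4) | (1<<6) = 67, matching 'pshufd $67', and mask
+; [2,0,0,3] in test2 is 2 | (3<<6) = 194 = 0xc2, printed as the signed
+; byte -62.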
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test1
+; Mask: [3,0,0,1]
+; CHECK: pshufd $67
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test2
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test3
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test4
+; Mask: [0,0,0,1]
+; CHECK: pshufd $64
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test5
+; Mask: [1,1]
+; CHECK: movhlps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test6
+; Mask: [2,0,0,0]
+; CHECK: pshufd $2
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test7
+; Mask: [0,2,0,2]
+; CHECK: pshufd $-120
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test8
+; Mask: [1,0,3,0]
+; CHECK: pshufd $49
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test9
+; Mask: [1,3,0,2]
+; CHECK: pshufd $-115
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test10
+; Mask: [1,0,1,0]
+; CHECK: pshufd $17
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test11
+; Mask: [1,0,2,1]
+; CHECK: pshufd $97
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test12
+; Mask: [0,0,0,0]
+; CHECK: pshufd $0
+; CHECK-NEXT: ret
+
+
+; The following pair of shuffles is folded into vector %A.
+define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: pshufd
+; CHECK: ret
+
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pshufd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/computeKnownBits_urem.ll b/test/CodeGen/X86/computeKnownBits_urem.ll
new file mode 100644
index 0000000..9902e6f
--- /dev/null
+++ b/test/CodeGen/X86/computeKnownBits_urem.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+define i32 @main() #0 {
+entry:
+ %a = alloca i32, align 4
+ store i32 1, i32* %a, align 4
+ %0 = load i32* %a, align 4
+ %or = or i32 1, %0
+ %and = and i32 1, %or
+ %rem = urem i32 %and, 1
+ %add = add i32 %rem, 1
+ ret i32 %add
+}
+; CHECK: $1, %eax
+; CHECK-NEXT: retq
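+; The whole body folds to a constant via known bits: %or always has bit 0
+; set, so %and = 1, %rem = 1 urem 1 = 0, and %add = 0 + 1 = 1, which is the
+; $1 loaded into %eax above.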
diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll
new file mode 100644
index 0000000..951b5c3
--- /dev/null
+++ b/test/CodeGen/X86/cvt16.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=LIBCALL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=F16C
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -soft-float=1 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -soft-float=1 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
+
+; This is a test for float to half float conversions on x86-64.
+;
+; If the -soft-float flag is set, or if there is no F16C support, then:
+; 1) half float to float conversions are translated into calls
+;    to __gnu_h2f_ieee, defined by the compiler runtime library;
+; 2) float to half float conversions are translated into calls
+;    to __gnu_f2h_ieee, which is expected to be defined by the
+;    compiler runtime library.
+;
+; Otherwise (we have F16C support):
+; 1) half float to float conversions are translated using
+;    vcvtph2ps instructions;
+; 2) float to half float conversions are translated using
+;    vcvtps2ph instructions.
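+;
+; For reference (IEEE 754 half precision): __gnu_f2h_ieee(1.0f) yields
+; 0x3C00 (sign 0, biased exponent 15, mantissa 0), and __gnu_h2f_ieee(0x3C00)
+; yields 1.0f; vcvtps2ph/vcvtph2ps perform the same conversions in hardware.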
+
+
+define void @test1(float %src, i16* %dest) {
+ %1 = tail call i16 @llvm.convert.to.fp16(float %src)
+ store i16 %1, i16* %dest, align 2
+ ret void
+}
+; CHECK-LABEL: test1
+; LIBCALL: callq __gnu_f2h_ieee
+; SOFTFLOAT: callq __gnu_f2h_ieee
+; F16C: vcvtps2ph
+; CHECK: ret
+
+
+define float @test2(i16* nocapture %src) {
+ %1 = load i16* %src, align 2
+ %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+ ret float %2
+}
+; CHECK-LABEL: test2:
+; LIBCALL: jmp __gnu_h2f_ieee
+; SOFTFLOAT: callq __gnu_h2f_ieee
+; F16C: vcvtph2ps
+; F16C: ret
+
+
+define float @test3(float %src) nounwind uwtable readnone {
+ %1 = tail call i16 @llvm.convert.to.fp16(float %src)
+ %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+ ret float %2
+}
+
+; CHECK-LABEL: test3:
+; LIBCALL: callq __gnu_f2h_ieee
+; LIBCALL: jmp __gnu_h2f_ieee
+; SOFTFLOAT: callq __gnu_f2h_ieee
+; SOFTFLOAT: callq __gnu_h2f_ieee
+; F16C: vcvtps2ph
+; F16C-NEXT: vcvtph2ps
+; F16C: ret
+
+declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+
diff --git a/test/CodeGen/X86/dagcombine-and-setcc.ll b/test/CodeGen/X86/dagcombine-and-setcc.ll
new file mode 100644
index 0000000..e7336a9
--- /dev/null
+++ b/test/CodeGen/X86/dagcombine-and-setcc.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...)
+
+; On X86, 1 is true and 0 is false, so we can't perform the combine:
+; (and (setgt X, true), (setgt Y, true)) -> (setgt (or X, Y), true)
+; This combine only works if the true value is -1.
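+; Counterexample with true == 1: X = 2, Y = 0 gives
+; (X > 1) && (Y > 1) == false, yet ((X | Y) > 1) == true.
+; With true == -1 both predicates just test the sign bit, so the OR is sound.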
+
+
+;CHECK: cmpl
+;CHECK: setg
+;CHECK: cmpl
+;CHECK: setg
+;CHECK: andb
+
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+; Function Attrs: optsize ssp uwtable
+define i32 @foo(i32 %a, i32 %b, i32 * %c) {
+if.else429:
+ %cmp.i1144 = icmp eq i32* %c, null
+ %cmp430 = icmp slt i32 %a, 2
+ %cmp432 = icmp slt i32 %b, 2
+ %or.cond710 = or i1 %cmp430, %cmp432
+ %or.cond710.not = xor i1 %or.cond710, true
+ %brmerge1448 = or i1 %cmp.i1144, %or.cond710.not
+ br i1 %brmerge1448, label %ret1, label %ret2
+
+ret1:
+ ret i32 0
+
+ret2:
+ ret i32 1
+}
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+ %res = alloca i32, align 4
+ %t = call i32 @foo(i32 1, i32 2, i32* %res) #3
+ %v = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %t)
+ ret i32 0
+}
+
+
+
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 23f8335..4912213 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -52,58 +52,153 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 {
entry:
%var1 = alloca %struct.AAA3, align 1
%var2 = alloca %struct.AAA3, align 1
- %tobool = icmp eq i32 %param2, 0
- br i1 %tobool, label %if.end, label %if.then
+ tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30), !dbg !47
+ tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31), !dbg !47
+ tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32), !dbg !49
+ %tobool = icmp eq i32 %param2, 0, !dbg !50
+ br i1 %tobool, label %if.end, label %if.then, !dbg !50
if.then: ; preds = %entry
- %call = call i8* @_Z5i2stri(i32 %param2)
- br label %if.end
+ %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
+ tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32), !dbg !49
+ br label %if.end, !dbg !54
if.end: ; preds = %entry, %if.then
- call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !60)
- call void @llvm.dbg.value(metadata !62, i64 0, metadata !63)
- %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
- call void @llvm.dbg.declare(metadata !{%struct.AAA3* %var2}, metadata !38)
- %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
- %tobool1 = icmp eq i32 %param1, 0
- br i1 %tobool1, label %if.else, label %if.then2
+ tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
+ tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56), !dbg !57
+ tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59), !dbg !60
+ %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64), !dbg !65
+ call void @llvm.dbg.value(metadata !58, i64 0, metadata !66), !dbg !67
+ %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68
+ %tobool1 = icmp eq i32 %param1, 0, !dbg !69
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
+ br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
if.then2: ; preds = %if.end
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0))
- br label %if.end3
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71), !dbg !73
+ call void @llvm.dbg.value(metadata !74, i64 0, metadata !75), !dbg !76
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
+ br label %if.end3, !dbg !72
if.else: ; preds = %if.end
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0))
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77), !dbg !79
+ call void @llvm.dbg.value(metadata !80, i64 0, metadata !81), !dbg !82
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
br label %if.end3
if.end3: ; preds = %if.else, %if.then2
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
- ret void
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83), !dbg !85
+ call void @llvm.dbg.value(metadata !58, i64 0, metadata !86), !dbg !87
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87
+ ret void, !dbg !88
}
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
-
-declare i8* @_Z5i2stri(i32) #2
+declare i8* @_Z5i2stri(i32) #1
-declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
+declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.module.flags = !{!48, !49}
-!llvm.ident = !{!50}
-
-!38 = metadata !{i32 786688, null, metadata !"var2", null, i32 20, null, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 20]
-!48 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!50 = metadata !{metadata !"clang version 3.5 (202418)"}
-!60 = metadata !{i32 786689, null, metadata !"this", null, i32 16777216, null, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!62 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
-!63 = metadata !{i32 786689, null, metadata !"value", null, i32 33554439, null, i32 0, null} ; [ DW_TAG_arg_variable ] [value] [line 7]
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!44, !45}
+!llvm.ident = !{!46}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"AAA3", i32 4, i64 32, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18}
+!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS4AAA3", metadata !"text", i32 8, i64 32, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
+!7 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!8 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"", i32 5, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
+!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null, metadata !14, metadata !15}
+!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
+!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [operator=]
+!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator const char *", metadata !"operator const char *", metadata !"_ZNK4AAA3cvPKcEv", i32 7, metadata !19, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
+!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !15, metadata !21}
+!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
+!23 = metadata !{metadata !24, metadata !35, metadata !40}
+!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"bar", metadata !"bar", metadata !"_Z3barii", i32 11, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z3barii, null, null, metadata !29, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!25 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{null, metadata !28, metadata !28}
+!28 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
+!30 = metadata !{i32 786689, metadata !24, metadata !"param1", metadata !25, i32 16777227, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param1] [line 11]
+!31 = metadata !{i32 786689, metadata !24, metadata !"param2", metadata !25, i32 33554443, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param2] [line 11]
+!32 = metadata !{i32 786688, metadata !24, metadata !"temp", metadata !25, i32 12, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 12]
+!33 = metadata !{i32 786688, metadata !24, metadata !"var1", metadata !25, i32 17, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var1] [line 17]
+!34 = metadata !{i32 786688, metadata !24, metadata !"var2", metadata !25, i32 18, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 18]
+!35 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
+!36 = metadata !{metadata !37, metadata !39}
+!37 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
+!39 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!40 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"_ZN4AAA3C2EPKc", i32 5, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !11, metadata !41, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
+!41 = metadata !{metadata !42, metadata !43}
+!42 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!46 = metadata !{metadata !"clang version 3.5.0 "}
+!47 = metadata !{i32 11, i32 0, metadata !24, null}
+!48 = metadata !{i8* null}
+!49 = metadata !{i32 12, i32 0, metadata !24, null}
+!50 = metadata !{i32 14, i32 0, metadata !51, null}
+!51 = metadata !{i32 786443, metadata !1, metadata !24, i32 14, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!52 = metadata !{i32 15, i32 0, metadata !53, null}
+!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 14, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!54 = metadata !{i32 16, i32 0, metadata !53, null}
+!55 = metadata !{i32 17, i32 0, metadata !24, null}
+!56 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!57 = metadata !{i32 0, i32 0, metadata !40, metadata !55}
+!58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
+!59 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!60 = metadata !{i32 5, i32 0, metadata !40, metadata !55}
+!61 = metadata !{i32 5, i32 0, metadata !62, metadata !55}
+!62 = metadata !{i32 786443, metadata !1, metadata !40, i32 5, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!63 = metadata !{i32 18, i32 0, metadata !24, null}
+!64 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!65 = metadata !{i32 0, i32 0, metadata !40, metadata !63}
+!66 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!67 = metadata !{i32 5, i32 0, metadata !40, metadata !63}
+!68 = metadata !{i32 5, i32 0, metadata !62, metadata !63}
+!69 = metadata !{i32 20, i32 0, metadata !70, null}
+!70 = metadata !{i32 786443, metadata !1, metadata !24, i32 20, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!71 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!72 = metadata !{i32 21, i32 0, metadata !70, null}
+!73 = metadata !{i32 0, i32 0, metadata !35, metadata !72}
+!74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
+!75 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!76 = metadata !{i32 6, i32 0, metadata !35, metadata !72}
+!77 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!78 = metadata !{i32 23, i32 0, metadata !70, null}
+!79 = metadata !{i32 0, i32 0, metadata !35, metadata !78}
+!80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
+!81 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!82 = metadata !{i32 6, i32 0, metadata !35, metadata !78}
+!83 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!84 = metadata !{i32 24, i32 0, metadata !24, null}
+!85 = metadata !{i32 0, i32 0, metadata !35, metadata !84}
+!86 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!87 = metadata !{i32 6, i32 0, metadata !35, metadata !84}
+!88 = metadata !{i32 25, i32 0, metadata !24, null}
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index f4dec4f..0d5afa1 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -73,7 +73,7 @@ define weak_odr dllexport void @weak1() {
@weak_alias = dllexport alias weak_odr void()* @f1
@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
-@blob_alias = dllexport alias i32 (), [6 x i8]* @blob
+@blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)
; CHECK: .section .drectve
; WIN32: " /EXPORT:Var1,DATA"
diff --git a/test/CodeGen/X86/elf-comdat.ll b/test/CodeGen/X86/elf-comdat.ll
new file mode 100644
index 0000000..c7e6df7
--- /dev/null
+++ b/test/CodeGen/X86/elf-comdat.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+$f = comdat any
+@v = global i32 0, comdat $f
+define void @f() comdat $f {
+ ret void
+}
+; CHECK: .section .text.f,"axG",@progbits,f,comdat
+; CHECK: .globl f
+; CHECK: .section .bss.v,"aGw",@nobits,f,comdat
+; CHECK: .globl v
diff --git a/test/CodeGen/X86/elf-comdat2.ll b/test/CodeGen/X86/elf-comdat2.ll
new file mode 100644
index 0000000..209da39
--- /dev/null
+++ b/test/CodeGen/X86/elf-comdat2.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+$foo = comdat any
+@bar = global i32 42, comdat $foo
+@foo = global i32 42
+
+; CHECK: .type bar,@object
+; CHECK-NEXT: .section .data.bar,"aGw",@progbits,foo,comdat
+; CHECK-NEXT: .globl bar
+; CHECK: .type foo,@object
+; CHECK-NEXT: .data
+; CHECK-NEXT: .globl foo
diff --git a/test/CodeGen/X86/fast-isel-args-fail2.ll b/test/CodeGen/X86/fast-isel-args-fail2.ll
new file mode 100644
index 0000000..08de472
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-args-fail2.ll
@@ -0,0 +1,10 @@
+; RUN: not --crash llc < %s -fast-isel -fast-isel-abort-args -mtriple=x86_64-apple-darwin10
+; REQUIRES: asserts
+
+%struct.s0 = type { x86_fp80, x86_fp80 }
+
+; FastISel cannot handle this case yet. Make sure that we abort.
+define i8* @args_fail(%struct.s0* byval nocapture readonly align 16 %y) {
+ %1 = bitcast %struct.s0* %y to i8*
+ ret i8* %1
+}
diff --git a/test/CodeGen/X86/fast-isel-args.ll b/test/CodeGen/X86/fast-isel-args.ll
index 0f36265..8c86a9c 100644
--- a/test/CodeGen/X86/fast-isel-args.ll
+++ b/test/CodeGen/X86/fast-isel-args.ll
@@ -23,3 +23,27 @@ entry:
%add2 = add nsw i64 %add, %conv1
ret i64 %add2
}
+
+define float @t4(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h) {
+entry:
+ %add1 = fadd float %a, %b
+ %add2 = fadd float %c, %d
+ %add3 = fadd float %e, %f
+ %add4 = fadd float %g, %h
+ %add5 = fadd float %add1, %add2
+ %add6 = fadd float %add3, %add4
+ %add7 = fadd float %add5, %add6
+ ret float %add7
+}
+
+define double @t5(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h) {
+entry:
+ %add1 = fadd double %a, %b
+ %add2 = fadd double %c, %d
+ %add3 = fadd double %e, %f
+ %add4 = fadd double %g, %h
+ %add5 = fadd double %add1, %add2
+ %add6 = fadd double %add3, %add4
+ %add7 = fadd double %add5, %add6
+ ret double %add7
+}
diff --git a/test/CodeGen/X86/fast-isel-branch_weights.ll b/test/CodeGen/X86/fast-isel-branch_weights.ll
new file mode 100644
index 0000000..bc41395
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-branch_weights.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+; Test if the BBs are reordered according to their branch weights.
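+; The !prof metadata gives the false edge (%success) weight 2147483647 and
+; the true edge (%fail) weight 0, so %success should be laid out first even
+; though it is the branch's false target.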
+define i64 @branch_weights_test(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_weights_test
+; CHECK-LABEL: success
+; CHECK-LABEL: fail
+ %1 = icmp ult i64 %a, %b
+ br i1 %1, label %fail, label %success, !prof !0
+
+fail:
+ ret i64 -1
+
+success:
+ ret i64 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
new file mode 100644
index 0000000..7e45c49
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
@@ -0,0 +1,294 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
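+; ucomiss sets ZF/PF/CF as follows: unordered -> 1/1/1, equal -> 1/0/0,
+; greater -> 0/0/0, less -> 0/0/1. 'oeq' is therefore ZF=1 and PF=0, which
+; is why fcmp_oeq below needs both a jne and a jnp to fall through.
+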
+define i32 @fcmp_oeq(float %x, float %y) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_2}}
+ %1 = fcmp oeq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jbe {{LBB.+_1}}
+ %1 = fcmp ogt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge(float %x, float %y) {
+; CHECK-LABEL: fcmp_oge
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jb {{LBB.+_1}}
+ %1 = fcmp oge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt(float %x, float %y) {
+; CHECK-LABEL: fcmp_olt
+; CHECK: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jbe {{LBB.+_1}}
+ %1 = fcmp olt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole(float %x, float %y) {
+; CHECK-LABEL: fcmp_ole
+; CHECK: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jb {{LBB.+_1}}
+ %1 = fcmp ole float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one(float %x, float %y) {
+; CHECK-LABEL: fcmp_one
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: je {{LBB.+_1}}
+ %1 = fcmp one float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord(float %x, float %y) {
+; CHECK-LABEL: fcmp_ord
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp ord float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno(float %x, float %y) {
+; CHECK-LABEL: fcmp_uno
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jp {{LBB.+_2}}
+ %1 = fcmp uno float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq(float %x, float %y) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: je {{LBB.+_2}}
+ %1 = fcmp ueq float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jae {{LBB.+_1}}
+ %1 = fcmp ugt float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge(float %x, float %y) {
+; CHECK-LABEL: fcmp_uge
+; CHECK: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: ja {{LBB.+_1}}
+ %1 = fcmp uge float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult(float %x, float %y) {
+; CHECK-LABEL: fcmp_ult
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jae {{LBB.+_1}}
+ %1 = fcmp ult float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule(float %x, float %y) {
+; CHECK-LABEL: fcmp_ule
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: ja {{LBB.+_1}}
+ %1 = fcmp ule float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une(float %x, float %y) {
+; CHECK-LABEL: fcmp_une
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_2}}
+; CHECK-NEXT: jmp {{LBB.+_1}}
+ %1 = fcmp une float %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_eq
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jne {{LBB.+_1}}
+ %1 = icmp eq i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ne
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: je {{LBB.+_1}}
+ %1 = icmp ne i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ugt
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jbe {{LBB.+_1}}
+ %1 = icmp ugt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_uge
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jb {{LBB.+_1}}
+ %1 = icmp uge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ult
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jae {{LBB.+_1}}
+ %1 = icmp ult i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ule
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: ja {{LBB.+_1}}
+ %1 = icmp ule i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sgt
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jle {{LBB.+_1}}
+ %1 = icmp sgt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sge
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jl {{LBB.+_1}}
+ %1 = icmp sge i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_slt
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jge {{LBB.+_1}}
+ %1 = icmp slt i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sle
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jg {{LBB.+_1}}
+ %1 = icmp sle i32 %x, %y
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
new file mode 100644
index 0000000..a3f6851
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
@@ -0,0 +1,470 @@
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
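+; Comparing a value against itself lets most predicates fold: ordered ones
+; reduce to a NaN check (ucomiss %xmm0, %xmm0 plus jp/jnp), while
+; trivially-true or trivially-false ones fold to constants with no compare.
+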
+define i32 @fcmp_oeq1(float %x) {
+; CHECK-LABEL: fcmp_oeq1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp oeq float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oeq2(float %x) {
+; CHECK-LABEL: fcmp_oeq2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_2}}
+ %1 = fcmp oeq float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt1(float %x) {
+; CHECK-LABEL: fcmp_ogt1
+; CHECK-NOT: ucomiss
+; CHECK: movl $1, %eax
+ %1 = fcmp ogt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ogt2(float %x) {
+; CHECK-LABEL: fcmp_ogt2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jbe {{LBB.+_1}}
+ %1 = fcmp ogt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge1(float %x) {
+; CHECK-LABEL: fcmp_oge1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp oge float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_oge2(float %x) {
+; CHECK-LABEL: fcmp_oge2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jb {{LBB.+_1}}
+ %1 = fcmp oge float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt1(float %x) {
+; CHECK-LABEL: fcmp_olt1
+; CHECK-NOT: ucomiss
+; CHECK: movl $1, %eax
+ %1 = fcmp olt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_olt2(float %x) {
+; CHECK-LABEL: fcmp_olt2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jbe {{LBB.+_1}}
+ %1 = fcmp olt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole1(float %x) {
+; CHECK-LABEL: fcmp_ole1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp ole float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ole2(float %x) {
+; CHECK-LABEL: fcmp_ole2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jb {{LBB.+_1}}
+ %1 = fcmp ole float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one1(float %x) {
+; CHECK-LABEL: fcmp_one1
+; CHECK-NOT: ucomiss
+; CHECK: movl $1, %eax
+ %1 = fcmp one float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_one2(float %x) {
+; CHECK-LABEL: fcmp_one2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: je {{LBB.+_1}}
+ %1 = fcmp one float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord1(float %x) {
+; CHECK-LABEL: fcmp_ord1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp ord float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ord2(float %x) {
+; CHECK-LABEL: fcmp_ord2
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_1}}
+ %1 = fcmp ord float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno1(float %x) {
+; CHECK-LABEL: fcmp_uno1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_2}}
+ %1 = fcmp uno float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uno2(float %x) {
+; CHECK-LABEL: fcmp_uno2
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jp {{LBB.+_2}}
+ %1 = fcmp uno float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq1(float %x) {
+; CHECK-LABEL: fcmp_ueq1
+; CHECK-NOT: ucomiss
+ %1 = fcmp ueq float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ueq2(float %x) {
+; CHECK-LABEL: fcmp_ueq2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: je {{LBB.+_2}}
+ %1 = fcmp ueq float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt1(float %x) {
+; CHECK-LABEL: fcmp_ugt1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jnp {{LBB.+_1}}
+ %1 = fcmp ugt float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ugt2(float %x) {
+; CHECK-LABEL: fcmp_ugt2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: jae {{LBB.+_1}}
+ %1 = fcmp ugt float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge1(float %x) {
+; CHECK-LABEL: fcmp_uge1
+; CHECK-NOT: ucomiss
+ %1 = fcmp uge float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_uge2(float %x) {
+; CHECK-LABEL: fcmp_uge2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm0, %xmm1
+; CHECK-NEXT: ja {{LBB.+_1}}
+ %1 = fcmp uge float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult1(float %x) {
+; CHECK-LABEL: fcmp_ult1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jnp {{LBB.+_1}}
+ %1 = fcmp ult float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ult2(float %x) {
+; CHECK-LABEL: fcmp_ult2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jae {{LBB.+_1}}
+ %1 = fcmp ult float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule1(float %x) {
+; CHECK-LABEL: fcmp_ule1
+; CHECK-NOT: ucomiss
+ %1 = fcmp ule float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_ule2(float %x) {
+; CHECK-LABEL: fcmp_ule2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: ja {{LBB.+_1}}
+ %1 = fcmp ule float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une1(float %x) {
+; CHECK-LABEL: fcmp_une1
+; CHECK: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: jnp {{LBB.+_1}}
+ %1 = fcmp une float %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @fcmp_une2(float %x) {
+; CHECK-LABEL: fcmp_une2
+; CHECK: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_2}}
+; CHECK-NEXT: jmp {{LBB.+_1}}
+ %1 = fcmp une float %x, 0.000000e+00
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x) {
+; CHECK-LABEL: icmp_eq
+; CHECK-NOT: cmpl
+; CHECK: movl $0, %eax
+ %1 = icmp eq i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x) {
+; CHECK-LABEL: icmp_ne
+; CHECK-NOT: cmpl
+; CHECK: movl $1, %eax
+ %1 = icmp ne i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x) {
+; CHECK-LABEL: icmp_ugt
+; CHECK-NOT: cmpl
+; CHECK: movl $1, %eax
+ %1 = icmp ugt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x) {
+; CHECK-LABEL: icmp_uge
+; CHECK-NOT: cmpl
+; CHECK: movl $0, %eax
+ %1 = icmp uge i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x) {
+; CHECK-LABEL: icmp_ult
+; CHECK-NOT: cmpl
+; CHECK: movl $1, %eax
+ %1 = icmp ult i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x) {
+; CHECK-LABEL: icmp_ule
+; CHECK-NOT: cmpl
+; CHECK: movl $0, %eax
+ %1 = icmp ule i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x) {
+; CHECK-LABEL: icmp_sgt
+; CHECK-NOT: cmpl
+; CHECK: movl $1, %eax
+ %1 = icmp sgt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x) {
+; CHECK-LABEL: icmp_sge
+; CHECK-NOT: cmpl
+; CHECK: movl $0, %eax
+ %1 = icmp sge i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x) {
+; CHECK-LABEL: icmp_slt
+; CHECK-NOT: cmpl
+; CHECK: movl $1, %eax
+ %1 = icmp slt i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x) {
+; CHECK-LABEL: icmp_sle
+; CHECK-NOT: cmpl
+; CHECK: movl $0, %eax
+ %1 = icmp sle i32 %x, %x
+ br i1 %1, label %bb1, label %bb2
+bb2:
+ ret i32 1
+bb1:
+ ret i32 0
+}
+
diff --git a/test/CodeGen/X86/fast-isel-cmp.ll b/test/CodeGen/X86/fast-isel-cmp.ll
new file mode 100644
index 0000000..1b72cfc
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-cmp.ll
@@ -0,0 +1,689 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=SDAG
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=FAST
+
+define zeroext i1 @fcmp_oeq(float %x, float %y) {
+; SDAG-LABEL: fcmp_oeq
+; SDAG: cmpeqss %xmm1, %xmm0
+; SDAG-NEXT: movd %xmm0, %eax
+; SDAG-NEXT: andl $1, %eax
+; FAST-LABEL: fcmp_oeq
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: sete %al
+; FAST-NEXT: setnp %cl
+; FAST-NEXT: andb %al, %cl
+ %1 = fcmp oeq float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ogt(float %x, float %y) {
+; SDAG-LABEL: fcmp_ogt
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: seta %al
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: seta %al
+ %1 = fcmp ogt float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oge(float %x, float %y) {
+; SDAG-LABEL: fcmp_oge
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setae %al
+; FAST-LABEL: fcmp_oge
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setae %al
+ %1 = fcmp oge float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_olt(float %x, float %y) {
+; SDAG-LABEL: fcmp_olt
+; SDAG: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: seta %al
+; FAST-LABEL: fcmp_olt
+; FAST: ucomiss %xmm0, %xmm1
+; FAST-NEXT: seta %al
+ %1 = fcmp olt float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ole(float %x, float %y) {
+; SDAG-LABEL: fcmp_ole
+; SDAG: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setae %al
+; FAST-LABEL: fcmp_ole
+; FAST: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setae %al
+ %1 = fcmp ole float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_one(float %x, float %y) {
+; SDAG-LABEL: fcmp_one
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setne %al
+; FAST-LABEL: fcmp_one
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setne %al
+ %1 = fcmp one float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ord(float %x, float %y) {
+; SDAG-LABEL: fcmp_ord
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_ord
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp ord float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uno(float %x, float %y) {
+; SDAG-LABEL: fcmp_uno
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_uno
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp uno float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ueq(float %x, float %y) {
+; SDAG-LABEL: fcmp_ueq
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: sete %al
+; FAST-LABEL: fcmp_ueq
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: sete %al
+ %1 = fcmp ueq float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ugt(float %x, float %y) {
+; SDAG-LABEL: fcmp_ugt
+; SDAG: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setb %al
+; FAST-LABEL: fcmp_ugt
+; FAST: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setb %al
+ %1 = fcmp ugt float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uge(float %x, float %y) {
+; SDAG-LABEL: fcmp_uge
+; SDAG: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setbe %al
+; FAST-LABEL: fcmp_uge
+; FAST: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setbe %al
+ %1 = fcmp uge float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult(float %x, float %y) {
+; SDAG-LABEL: fcmp_ult
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setb %al
+; FAST-LABEL: fcmp_ult
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setb %al
+ %1 = fcmp ult float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule(float %x, float %y) {
+; SDAG-LABEL: fcmp_ule
+; SDAG: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setbe %al
+; FAST-LABEL: fcmp_ule
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setbe %al
+ %1 = fcmp ule float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_une(float %x, float %y) {
+; SDAG-LABEL: fcmp_une
+; SDAG: cmpneqss %xmm1, %xmm0
+; SDAG-NEXT: movd %xmm0, %eax
+; SDAG-NEXT: andl $1, %eax
+; FAST-LABEL: fcmp_une
+; FAST: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setne %al
+; FAST-NEXT: setp %cl
+; FAST-NEXT: orb %al, %cl
+ %1 = fcmp une float %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_eq(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_eq
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: sete %al
+; FAST-LABEL: icmp_eq
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: sete %al
+ %1 = icmp eq i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ne(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_ne
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setne %al
+; FAST-LABEL: icmp_ne
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setne %al
+ %1 = icmp ne i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ugt(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_ugt
+; SDAG: cmpl %edi, %esi
+; SDAG-NEXT: setb %al
+; FAST-LABEL: icmp_ugt
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: seta %al
+ %1 = icmp ugt i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_uge(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_uge
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setae %al
+; FAST-LABEL: icmp_uge
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setae %al
+ %1 = icmp uge i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ult(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_ult
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setb %al
+; FAST-LABEL: icmp_ult
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setb %al
+ %1 = icmp ult i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ule(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_ule
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setbe %al
+; FAST-LABEL: icmp_ule
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setbe %al
+ %1 = icmp ule i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sgt(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_sgt
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setg %al
+; FAST-LABEL: icmp_sgt
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setg %al
+ %1 = icmp sgt i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sge(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_sge
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setge %al
+; FAST-LABEL: icmp_sge
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setge %al
+ %1 = icmp sge i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_slt(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_slt
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setl %al
+; FAST-LABEL: icmp_slt
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setl %al
+ %1 = icmp slt i32 %x, %y
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sle(i32 %x, i32 %y) {
+; SDAG-LABEL: icmp_sle
+; SDAG: cmpl %esi, %edi
+; SDAG-NEXT: setle %al
+; FAST-LABEL: icmp_sle
+; FAST: cmpl %esi, %edi
+; FAST-NEXT: setle %al
+ %1 = icmp sle i32 %x, %y
+ ret i1 %1
+}
+
+; Test cmp folding and condition optimization.
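+; When both operands are the same value the predicate collapses: oeq, oge,
+; and ole of x with itself reduce to an ordered check (setnp); ogt, olt, and
+; one reduce to false; and their unordered counterparts reduce to a parity
+; check (setp) or to true, as the tests below verify.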
+define zeroext i1 @fcmp_oeq2(float %x) {
+; SDAG-LABEL: fcmp_oeq2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_oeq2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp oeq float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oeq3(float %x) {
+; SDAG-LABEL: fcmp_oeq3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: cmpeqss %xmm1, %xmm0
+; SDAG-NEXT: movd %xmm0, %eax
+; SDAG-NEXT: andl $1, %eax
+; FAST-LABEL: fcmp_oeq3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: sete %al
+; FAST-NEXT: setnp %cl
+; FAST-NEXT: andb %al, %cl
+ %1 = fcmp oeq float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ogt2(float %x) {
+; SDAG-LABEL: fcmp_ogt2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: fcmp_ogt2
+; FAST: xorl %eax, %eax
+ %1 = fcmp ogt float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ogt3(float %x) {
+; SDAG-LABEL: fcmp_ogt3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: seta %al
+; FAST-LABEL: fcmp_ogt3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: seta %al
+ %1 = fcmp ogt float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oge2(float %x) {
+; SDAG-LABEL: fcmp_oge2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_oge2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp oge float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_oge3(float %x) {
+; SDAG-LABEL: fcmp_oge3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setae %al
+; FAST-LABEL: fcmp_oge3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setae %al
+ %1 = fcmp oge float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_olt2(float %x) {
+; SDAG-LABEL: fcmp_olt2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: fcmp_olt2
+; FAST: xorl %eax, %eax
+ %1 = fcmp olt float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_olt3(float %x) {
+; SDAG-LABEL: fcmp_olt3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: seta %al
+; FAST-LABEL: fcmp_olt3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm0, %xmm1
+; FAST-NEXT: seta %al
+ %1 = fcmp olt float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ole2(float %x) {
+; SDAG-LABEL: fcmp_ole2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_ole2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp ole float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ole3(float %x) {
+; SDAG-LABEL: fcmp_ole3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setae %al
+; FAST-LABEL: fcmp_ole3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setae %al
+ %1 = fcmp ole float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_one2(float %x) {
+; SDAG-LABEL: fcmp_one2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: fcmp_one2
+; FAST: xorl %eax, %eax
+ %1 = fcmp one float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_one3(float %x) {
+; SDAG-LABEL: fcmp_one3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setne %al
+; FAST-LABEL: fcmp_one3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setne %al
+ %1 = fcmp one float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ord2(float %x) {
+; SDAG-LABEL: fcmp_ord2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_ord2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp ord float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ord3(float %x) {
+; SDAG-LABEL: fcmp_ord3
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setnp %al
+; FAST-LABEL: fcmp_ord3
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setnp %al
+ %1 = fcmp ord float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uno2(float %x) {
+; SDAG-LABEL: fcmp_uno2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_uno2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp uno float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uno3(float %x) {
+; SDAG-LABEL: fcmp_uno3
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_uno3
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp uno float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ueq2(float %x) {
+; SDAG-LABEL: fcmp_ueq2
+; SDAG: movb $1, %al
+; FAST-LABEL: fcmp_ueq2
+; FAST: movb $1, %al
+ %1 = fcmp ueq float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ueq3(float %x) {
+; SDAG-LABEL: fcmp_ueq3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: sete %al
+; FAST-LABEL: fcmp_ueq3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: sete %al
+ %1 = fcmp ueq float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ugt2(float %x) {
+; SDAG-LABEL: fcmp_ugt2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_ugt2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp ugt float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ugt3(float %x) {
+; SDAG-LABEL: fcmp_ugt3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setb %al
+; FAST-LABEL: fcmp_ugt3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setb %al
+ %1 = fcmp ugt float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uge2(float %x) {
+; SDAG-LABEL: fcmp_uge2
+; SDAG: movb $1, %al
+; FAST-LABEL: fcmp_uge2
+; FAST: movb $1, %al
+ %1 = fcmp uge float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_uge3(float %x) {
+; SDAG-LABEL: fcmp_uge3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm0, %xmm1
+; SDAG-NEXT: setbe %al
+; FAST-LABEL: fcmp_uge3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm0, %xmm1
+; FAST-NEXT: setbe %al
+ %1 = fcmp uge float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult2(float %x) {
+; SDAG-LABEL: fcmp_ult2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_ult2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp ult float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult3(float %x) {
+; SDAG-LABEL: fcmp_ult3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setb %al
+; FAST-LABEL: fcmp_ult3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setb %al
+ %1 = fcmp ult float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule2(float %x) {
+; SDAG-LABEL: fcmp_ule2
+; SDAG: movb $1, %al
+; FAST-LABEL: fcmp_ule2
+; FAST: movb $1, %al
+ %1 = fcmp ule float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule3(float %x) {
+; SDAG-LABEL: fcmp_ule3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: ucomiss %xmm1, %xmm0
+; SDAG-NEXT: setbe %al
+; FAST-LABEL: fcmp_ule3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setbe %al
+ %1 = fcmp ule float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_une2(float %x) {
+; SDAG-LABEL: fcmp_une2
+; SDAG: ucomiss %xmm0, %xmm0
+; SDAG-NEXT: setp %al
+; FAST-LABEL: fcmp_une2
+; FAST: ucomiss %xmm0, %xmm0
+; FAST-NEXT: setp %al
+ %1 = fcmp une float %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @fcmp_une3(float %x) {
+; SDAG-LABEL: fcmp_une3
+; SDAG: xorps %xmm1, %xmm1
+; SDAG-NEXT: cmpneqss %xmm1, %xmm0
+; SDAG-NEXT: movd %xmm0, %eax
+; SDAG-NEXT: andl $1, %eax
+; FAST-LABEL: fcmp_une3
+; FAST: xorps %xmm1, %xmm1
+; FAST-NEXT: ucomiss %xmm1, %xmm0
+; FAST-NEXT: setne %al
+; FAST-NEXT: setp %cl
+; FAST-NEXT: orb %al, %cl
+ %1 = fcmp une float %x, 0.000000e+00
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_eq2(i32 %x) {
+; SDAG-LABEL: icmp_eq2
+; SDAG: movb $1, %al
+; FAST-LABEL: icmp_eq2
+; FAST: movb $1, %al
+ %1 = icmp eq i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ne2(i32 %x) {
+; SDAG-LABEL: icmp_ne2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: icmp_ne2
+; FAST: xorl %eax, %eax
+ %1 = icmp ne i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ugt2(i32 %x) {
+; SDAG-LABEL: icmp_ugt2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: icmp_ugt2
+; FAST: xorl %eax, %eax
+ %1 = icmp ugt i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_uge2(i32 %x) {
+; SDAG-LABEL: icmp_uge2
+; SDAG: movb $1, %al
+; FAST-LABEL: icmp_uge2
+; FAST: movb $1, %al
+ %1 = icmp uge i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ult2(i32 %x) {
+; SDAG-LABEL: icmp_ult2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: icmp_ult2
+; FAST: xorl %eax, %eax
+ %1 = icmp ult i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_ule2(i32 %x) {
+; SDAG-LABEL: icmp_ule2
+; SDAG: movb $1, %al
+; FAST-LABEL: icmp_ule2
+; FAST: movb $1, %al
+ %1 = icmp ule i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sgt2(i32 %x) {
+; SDAG-LABEL: icmp_sgt2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: icmp_sgt2
+; FAST: xorl %eax, %eax
+ %1 = icmp sgt i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sge2(i32 %x) {
+; SDAG-LABEL: icmp_sge2
+; SDAG: movb $1, %al
+; FAST-LABEL: icmp_sge2
+; FAST: movb $1, %al
+ %1 = icmp sge i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_slt2(i32 %x) {
+; SDAG-LABEL: icmp_slt2
+; SDAG: xorl %eax, %eax
+; FAST-LABEL: icmp_slt2
+; FAST: xorl %eax, %eax
+ %1 = icmp slt i32 %x, %x
+ ret i1 %1
+}
+
+define zeroext i1 @icmp_sle2(i32 %x) {
+; SDAG-LABEL: icmp_sle2
+; SDAG: movb $1, %al
+; FAST-LABEL: icmp_sle2
+; FAST: movb $1, %al
+ %1 = icmp sle i32 %x, %x
+ ret i1 %1
+}
+
diff --git a/test/CodeGen/X86/fast-isel-fold-mem.ll b/test/CodeGen/X86/fast-isel-fold-mem.ll
new file mode 100644
index 0000000..a945779
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-fold-mem.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define i64 @fold_load(i64* %a, i64 %b) {
+; CHECK-LABEL: fold_load
+; CHECK: addq (%rdi), %rsi
+; CHECK-NEXT: movq %rsi, %rax
+ %1 = load i64* %a, align 8
+ %2 = add i64 %1, %b
+ ret i64 %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-cmov.ll b/test/CodeGen/X86/fast-isel-select-cmov.ll
new file mode 100644
index 0000000..8008e28
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-cmov.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+; Test conditional moves for the supported types (i16, i32, and i64) and
+; condition inputs (argument or cmp). Currently i8 is not supported.
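+; The i1 condition arrives zero-extended in %dil; fast-isel tests its low
+; bit (testb $1) and then selects with a cmov, as the checks below show.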
+
+define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_cmov_i16
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmovew %dx, %si
+; CHECK-NEXT: movzwl %si, %eax
+ %1 = select i1 %cond, i16 %a, i16 %b
+ ret i16 %1
+}
+
+define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_cmp_cmov_i16
+; CHECK: cmpw %si, %di
+; CHECK-NEXT: cmovbw %di, %si
+; CHECK-NEXT: movzwl %si, %eax
+ %1 = icmp ult i16 %a, %b
+ %2 = select i1 %1, i16 %a, i16 %b
+ ret i16 %2
+}
+
+define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: select_cmov_i32
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmovel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+ %1 = select i1 %cond, i32 %a, i32 %b
+ ret i32 %1
+}
+
+define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: select_cmp_cmov_i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+ %1 = icmp ult i32 %a, %b
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+}
+
+define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
+; CHECK-LABEL: select_cmov_i64
+; CHECK: testb $1, %dil
+; CHECK-NEXT: cmoveq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+ %1 = select i1 %cond, i64 %a, i64 %b
+ ret i64 %1
+}
+
+define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: select_cmp_cmov_i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+ %1 = icmp ult i64 %a, %b
+ %2 = select i1 %1, i64 %a, i64 %b
+ ret i64 %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-cmov2.ll b/test/CodeGen/X86/fast-isel-select-cmov2.ll
new file mode 100644
index 0000000..658098f
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-cmov2.ll
@@ -0,0 +1,255 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+
+; Test all the cmp predicates that can feed an integer conditional move.
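+; Note that ucomisd sets the parity flag for unordered operands, so oeq needs
+; both "equal" and "no parity" (sete/setnp + test) and une needs "not equal"
+; or "parity" (setne/setp + or); every other predicate maps to a single
+; condition code.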
+
+define i64 @select_fcmp_false_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_false_cmov
+; CHECK: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %1 = fcmp false double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_oeq_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %al
+; CHECK-NEXT: sete %cl
+; CHECK-NEXT: testb %al, %cl
+; CHECK-NEXT: cmoveq %rsi, %rdi
+ %1 = fcmp oeq double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ogt_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ogt_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovbeq %rsi, %rdi
+ %1 = fcmp ogt double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_oge_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_oge_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovbq %rsi, %rdi
+ %1 = fcmp oge double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_olt_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_olt_cmov
+; CHECK: ucomisd %xmm0, %xmm1
+; CHECK-NEXT: cmovbeq %rsi, %rdi
+ %1 = fcmp olt double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ole_cmov
+; CHECK: ucomisd %xmm0, %xmm1
+; CHECK-NEXT: cmovbq %rsi, %rdi
+ %1 = fcmp ole double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_one_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmoveq %rsi, %rdi
+ %1 = fcmp one double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ord_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovpq %rsi, %rdi
+ %1 = fcmp ord double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_uno_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovnpq %rsi, %rdi
+ %1 = fcmp uno double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ueq_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovneq %rsi, %rdi
+ %1 = fcmp ueq double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ugt_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ugt_cmov
+; CHECK: ucomisd %xmm0, %xmm1
+; CHECK-NEXT: cmovaeq %rsi, %rdi
+ %1 = fcmp ugt double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_uge_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_uge_cmov
+; CHECK: ucomisd %xmm0, %xmm1
+; CHECK-NEXT: cmovaq %rsi, %rdi
+ %1 = fcmp uge double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ult_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ult_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovaeq %rsi, %rdi
+ %1 = fcmp ult double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_ule_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_ule_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovaq %rsi, %rdi
+ %1 = fcmp ule double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_une_cmov
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: setp %al
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: orb %al, %cl
+; CHECK-NEXT: cmoveq %rsi, %rdi
+ %1 = fcmp une double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_fcmp_true_cmov(double %a, double %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_fcmp_true_cmov
+; CHECK: movq %rdi, %rax
+ %1 = fcmp true double %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_eq_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_eq_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovneq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp eq i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_ne_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_ne_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmoveq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp ne i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_ugt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_ugt_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp ugt i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+
+define i64 @select_icmp_uge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_uge_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp uge i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_ult_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_ult_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovaeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp ult i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_ule_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_ule_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovaq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp ule i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_sgt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_sgt_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovleq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp sgt i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_sge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_sge_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovlq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp sge i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_slt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_slt_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovgeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp slt i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
+define i64 @select_icmp_sle_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: select_icmp_sle_cmov
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovgq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+ %1 = icmp sle i64 %a, %b
+ %2 = select i1 %1, i64 %c, i64 %d
+ ret i64 %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll b/test/CodeGen/X86/fast-isel-select-cmp.ll
new file mode 100644
index 0000000..1af30e9
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-cmp.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+; Test that we do not fold the cmp into the select when the instructions are
+; in different basic blocks.
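+; At -O0 the compare result is materialized into a register in its defining
+; block, so the select in %continue tests that register rather than
+; repeating the compare.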
+
+define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: select_cmp_cmov_i32
+; CHECK-LABEL: continue
+; CHECK-NOT: cmp
+ %1 = icmp ult i32 %a, %b
+ br i1 %1, label %continue, label %exit
+
+continue:
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+
+exit:
+ ret i32 -1
+}
+
+define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_oeq_f32
+; CHECK-LABEL: continue
+; CHECK-NOT: cmp
+ %1 = fcmp oeq float %a, %b
+ br i1 %1, label %continue, label %exit
+
+continue:
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+
+exit:
+ ret float -1.0
+}
+
+define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_one_f32
+; CHECK-LABEL: continue
+; CHECK-NOT: ucomi
+ %1 = fcmp one float %a, %b
+ br i1 %1, label %continue, label %exit
+
+continue:
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+
+exit:
+ ret float -1.0
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
new file mode 100644
index 0000000..1ec4d64
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort -mcpu=corei7-avx | FileCheck %s
+
+
+define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_one_f32
+; CHECK: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: movaps %xmm2, %xmm0
+ %1 = fcmp one float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_one_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_one_f64
+; CHECK: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: movaps %xmm2, %xmm0
+ %1 = fcmp one double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_icmp_eq_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_eq_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: je [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp eq i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_ne_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_ne_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp ne i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_ugt_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_ugt_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: ja [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp ugt i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_uge_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_uge_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jae [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp uge i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_ult_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_ult_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jb [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp ult i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_ule_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_ule_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jbe [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp ule i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_sgt_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_sgt_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jg [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp sgt i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_sge_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_sge_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jge [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp sge i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_slt_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_slt_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jl [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp slt i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define float @select_icmp_sle_f32(i64 %a, i64 %b, float %c, float %d) {
+; CHECK-LABEL: select_icmp_sle_f32
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jle [[BB:LBB[0-9]+_2]]
+; CHECK: [[BB]]
+; CHECK-NEXT: retq
+ %1 = icmp sle i64 %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll
new file mode 100644
index 0000000..3c03a03
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -0,0 +1,391 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
+
+; Test all cmp predicates that can be used with SSE.
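+; SSE scalar compares only encode eq, lt, le, unord, neq, nlt, and nle, and
+; ord, so predicates such as ogt and ult are matched by commuting the
+; operands of cmpltss/cmpnless, and the resulting mask drives an
+; and/andn/or blend.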
+
+define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_oeq_f32
+; CHECK: cmpeqss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_oeq_f32
+; AVX: vcmpeqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp oeq float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_oeq_f64
+; CHECK: cmpeqsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_oeq_f64
+; AVX: vcmpeqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp oeq double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ogt_f32
+; CHECK: cmpltss %xmm0, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm1
+; CHECK-NEXT: orps %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ogt_f32
+; AVX: vcmpltss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ogt float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ogt_f64
+; CHECK: cmpltsd %xmm0, %xmm1
+; CHECK-NEXT: andpd %xmm1, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm1
+; CHECK-NEXT: orpd %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ogt_f64
+; AVX: vcmpltsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ogt double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_oge_f32
+; CHECK: cmpless %xmm0, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm1
+; CHECK-NEXT: orps %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_oge_f32
+; AVX: vcmpless %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp oge float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_oge_f64
+; CHECK: cmplesd %xmm0, %xmm1
+; CHECK-NEXT: andpd %xmm1, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm1
+; CHECK-NEXT: orpd %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_oge_f64
+; AVX: vcmplesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp oge double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_olt_f32
+; CHECK: cmpltss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_olt_f32
+; AVX: vcmpltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp olt float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_olt_f64
+; CHECK: cmpltsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_olt_f64
+; AVX: vcmpltsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp olt double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ole_f32
+; CHECK: cmpless %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ole_f32
+; AVX: vcmpless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ole float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ole_f64
+; CHECK: cmplesd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ole_f64
+; AVX: vcmplesd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ole double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ord_f32
+; CHECK: cmpordss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ord_f32
+; AVX: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ord float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ord_f64
+; CHECK: cmpordsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ord_f64
+; AVX: vcmpordsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ord double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_uno_f32
+; CHECK: cmpunordss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_uno_f32
+; AVX: vcmpunordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp uno float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_uno_f64
+; CHECK: cmpunordsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_uno_f64
+; AVX: vcmpunordsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp uno double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ugt_f32
+; CHECK: cmpnless %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ugt_f32
+; AVX: vcmpnless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ugt float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ugt_f64
+; CHECK: cmpnlesd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_ugt_f64
+; AVX: vcmpnlesd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ugt double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_uge_f32
+; CHECK: cmpnltss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_uge_f32
+; AVX: vcmpnltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp uge float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_uge_f64
+; CHECK: cmpnltsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_uge_f64
+; AVX: vcmpnltsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp uge double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ult_f32
+; CHECK: cmpnless %xmm0, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm1
+; CHECK-NEXT: orps %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ult_f32
+; AVX: vcmpnless %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ult float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ult_f64
+; CHECK: cmpnlesd %xmm0, %xmm1
+; CHECK-NEXT: andpd %xmm1, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm1
+; CHECK-NEXT: orpd %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ult_f64
+; AVX: vcmpnlesd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ult double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_ule_f32
+; CHECK: cmpnltss %xmm0, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm1
+; CHECK-NEXT: orps %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ule_f32
+; AVX: vcmpnltss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp ule float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_ule_f64
+; CHECK: cmpnltsd %xmm0, %xmm1
+; CHECK-NEXT: andpd %xmm1, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm1
+; CHECK-NEXT: orpd %xmm2, %xmm1
+; AVX-LABEL: select_fcmp_ule_f64
+; AVX: vcmpnltsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp ule double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
+define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: select_fcmp_une_f32
+; CHECK: cmpneqss %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm2
+; CHECK-NEXT: andnps %xmm3, %xmm0
+; CHECK-NEXT: orps %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_une_f32
+; AVX: vcmpneqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnps %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+ %1 = fcmp une float %a, %b
+ %2 = select i1 %1, float %c, float %d
+ ret float %2
+}
+
+define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: select_fcmp_une_f64
+; CHECK: cmpneqsd %xmm1, %xmm0
+; CHECK-NEXT: andpd %xmm0, %xmm2
+; CHECK-NEXT: andnpd %xmm3, %xmm0
+; CHECK-NEXT: orpd %xmm2, %xmm0
+; AVX-LABEL: select_fcmp_une_f64
+; AVX: vcmpneqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vandnpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
+ %1 = fcmp une double %a, %b
+ %2 = select i1 %1, double %c, double %d
+ ret double %2
+}
+
diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll
index 53158bc..7b3c99f 100644
--- a/test/CodeGen/X86/fast-isel-select.ll
+++ b/test/CodeGen/X86/fast-isel-select.ll
@@ -4,10 +4,10 @@
; lsb is zero.
; <rdar://problem/15651765>
-; CHECK-LABEL: fastisel_select:
+; CHECK-LABEL: fastisel_select:
; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]]
; CHECK: testb $1, [[RES]]
-; CHECK: cmovel
+; CHECK: cmovnel %edi, %esi
define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) {
%shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766
%counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0
diff --git a/test/CodeGen/X86/fast-isel-sse12-fptoint.ll b/test/CodeGen/X86/fast-isel-sse12-fptoint.ll
new file mode 100644
index 0000000..769c987
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-sse12-fptoint.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse2 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx2,+avx -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=AVX
+
+define i32 @cvt_test1(float %a) {
+; SSE-LABEL: cvt_test1
+; SSE: cvttss2si %xmm0, %eax
+; AVX-LABEL: cvt_test1
+; AVX: vcvttss2si %xmm0, %eax
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 0.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 0.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 0.000000e+00, i32 3
+ %5 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %4)
+ ret i32 %5
+}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+
+define i64 @cvt_test2(float %a) {
+; SSE-LABEL: cvt_test2
+; SSE: cvttss2si %xmm0, %rax
+; AVX-LABEL: cvt_test2
+; AVX: vcvttss2si %xmm0, %rax
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float 0.000000e+00, i32 1
+ %3 = insertelement <4 x float> %2, float 0.000000e+00, i32 2
+ %4 = insertelement <4 x float> %3, float 0.000000e+00, i32 3
+ %5 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %4)
+ ret i64 %5
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+
+define i32 @cvt_test3(double %a) {
+; SSE-LABEL: cvt_test3
+; SSE: cvttsd2si %xmm0, %eax
+; AVX-LABEL: cvt_test3
+; AVX: vcvttsd2si %xmm0, %eax
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 0.000000e+00, i32 1
+ %3 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %2)
+ ret i32 %3
+}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+
+define i64 @cvt_test4(double %a) {
+; SSE-LABEL: cvt_test4
+; SSE: cvttsd2si %xmm0, %rax
+; AVX-LABEL: cvt_test4
+; AVX: vcvttsd2si %xmm0, %rax
+ %1 = insertelement <2 x double> undef, double %a, i32 0
+ %2 = insertelement <2 x double> %1, double 0.000000e+00, i32 1
+ %3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %2)
+ ret i64 %3
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/float-asmprint.ll b/test/CodeGen/X86/float-asmprint.ll
index 4aeae7f..5de9700 100644
--- a/test/CodeGen/X86/float-asmprint.ll
+++ b/test/CodeGen/X86/float-asmprint.ll
@@ -16,8 +16,9 @@
; CHECK-NEXT: .size
; CHECK: varppc128:
-; CHECK-NEXT: .quad 0 # ppc_fp128 -0
-; CHECK-NEXT: .quad -9223372036854775808
+; For ppc_fp128, the high double always comes first.
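+; (-0.0 stored as an IEEE double is just the sign bit, 0x8000000000000000,
+; which the assembler prints as the signed value -9223372036854775808.)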
+; CHECK-NEXT: .quad -9223372036854775808 # ppc_fp128 -0
+; CHECK-NEXT: .quad 0
; CHECK-NEXT: .size
; CHECK: var80:
diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
new file mode 100644
index 0000000..6c1ca25
--- /dev/null
+++ b/test/CodeGen/X86/frameaddr.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=x86 | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -march=x86 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
+
+define i8* @test1() nounwind {
+entry:
+; CHECK-32-LABEL: test1
+; CHECK-32: push
+; CHECK-32-NEXT: movl %esp, %ebp
+; CHECK-32-NEXT: movl %ebp, %eax
+; CHECK-32-NEXT: pop
+; CHECK-32-NEXT: ret
+; CHECK-64-LABEL: test1
+; CHECK-64: push
+; CHECK-64-NEXT: movq %rsp, %rbp
+; CHECK-64-NEXT: movq %rbp, %rax
+; CHECK-64-NEXT: pop
+; CHECK-64-NEXT: ret
+ %0 = tail call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @test2() nounwind {
+entry:
+; CHECK-32-LABEL: test2
+; CHECK-32: push
+; CHECK-32-NEXT: movl %esp, %ebp
+; CHECK-32-NEXT: movl (%ebp), %eax
+; CHECK-32-NEXT: movl (%eax), %eax
+; CHECK-32-NEXT: pop
+; CHECK-32-NEXT: ret
+; CHECK-64-LABEL: test2
+; CHECK-64: push
+; CHECK-64-NEXT: movq %rsp, %rbp
+; CHECK-64-NEXT: movq (%rbp), %rax
+; CHECK-64-NEXT: movq (%rax), %rax
+; CHECK-64-NEXT: pop
+; CHECK-64-NEXT: ret
+ %0 = tail call i8* @llvm.frameaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index 8c328ec..a732eb1 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -13,14 +13,14 @@ define i32 @main() uwtable optsize ssp {
; APPLE: GCC_except_table0:
; APPLE: Lexception0:
-; MINGW64: .cfi_startproc
-; MINGW64: .cfi_personality 0, __gxx_personality_v0
-; MINGW64: .cfi_lsda 0, .Lexception0
-; MINGW64: .cfi_def_cfa_offset 16
+; MINGW64: .seh_proc
+; MINGW64: .seh_handler __gxx_personality_v0
+; MINGW64: .seh_setframe 5, 0
; MINGW64: callq _Unwind_Resume
-; MINGW64: .cfi_endproc
+; MINGW64: .seh_handlerdata
; MINGW64: GCC_except_table0:
; MINGW64: Lexception0:
+; MINGW64: .seh_endproc
; MINGW32: .cfi_startproc
; MINGW32: .cfi_personality 0, ___gxx_personality_v0
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
new file mode 100644
index 0000000..ff939a9
--- /dev/null
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -0,0 +1,802 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE3
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3,+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSSE3
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+
+
+define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecext1 = extractelement <4 x float> %A, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %A, i32 2
+ %vecext3 = extractelement <4 x float> %A, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+ %vecext6 = extractelement <4 x float> %B, i32 0
+ %vecext7 = extractelement <4 x float> %B, i32 1
+ %add8 = fadd float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+ %vecext10 = extractelement <4 x float> %B, i32 2
+ %vecext11 = extractelement <4 x float> %B, i32 3
+ %add12 = fadd float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: hadd_ps_test1
+; CHECK: haddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
+ %vecext = extractelement <4 x float> %A, i32 2
+ %vecext1 = extractelement <4 x float> %A, i32 3
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 1
+ %vecext2 = extractelement <4 x float> %A, i32 0
+ %vecext3 = extractelement <4 x float> %A, i32 1
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
+ %vecext6 = extractelement <4 x float> %B, i32 2
+ %vecext7 = extractelement <4 x float> %B, i32 3
+ %add8 = fadd float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
+ %vecext10 = extractelement <4 x float> %B, i32 0
+ %vecext11 = extractelement <4 x float> %B, i32 1
+ %add12 = fadd float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: hadd_ps_test2
+; CHECK: haddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecext1 = extractelement <4 x float> %A, i32 1
+ %sub = fsub float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %sub, i32 0
+ %vecext2 = extractelement <4 x float> %A, i32 2
+ %vecext3 = extractelement <4 x float> %A, i32 3
+ %sub4 = fsub float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
+ %vecext6 = extractelement <4 x float> %B, i32 0
+ %vecext7 = extractelement <4 x float> %B, i32 1
+ %sub8 = fsub float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
+ %vecext10 = extractelement <4 x float> %B, i32 2
+ %vecext11 = extractelement <4 x float> %B, i32 3
+ %sub12 = fsub float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: hsub_ps_test1
+; CHECK: hsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
+ %vecext = extractelement <4 x float> %A, i32 2
+ %vecext1 = extractelement <4 x float> %A, i32 3
+ %sub = fsub float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %sub, i32 1
+ %vecext2 = extractelement <4 x float> %A, i32 0
+ %vecext3 = extractelement <4 x float> %A, i32 1
+ %sub4 = fsub float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
+ %vecext6 = extractelement <4 x float> %B, i32 2
+ %vecext7 = extractelement <4 x float> %B, i32 3
+ %sub8 = fsub float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
+ %vecext10 = extractelement <4 x float> %B, i32 0
+ %vecext11 = extractelement <4 x float> %B, i32 1
+ %sub12 = fsub float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: hsub_ps_test2
+; CHECK: hsubps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
+ %vecext = extractelement <4 x i32> %A, i32 0
+ %vecext1 = extractelement <4 x i32> %A, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <4 x i32> %A, i32 2
+ %vecext3 = extractelement <4 x i32> %A, i32 3
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
+ %vecext6 = extractelement <4 x i32> %B, i32 0
+ %vecext7 = extractelement <4 x i32> %B, i32 1
+ %add8 = add i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
+ %vecext10 = extractelement <4 x i32> %B, i32 2
+ %vecext11 = extractelement <4 x i32> %B, i32 3
+ %add12 = add i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
+ ret <4 x i32> %vecinit13
+}
+; CHECK-LABEL: phadd_d_test1
+; SSE3-NOT: phaddd
+; SSSE3: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+
+define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
+ %vecext = extractelement <4 x i32> %A, i32 2
+ %vecext1 = extractelement <4 x i32> %A, i32 3
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
+ %vecext2 = extractelement <4 x i32> %A, i32 0
+ %vecext3 = extractelement <4 x i32> %A, i32 1
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
+ %vecext6 = extractelement <4 x i32> %B, i32 3
+ %vecext7 = extractelement <4 x i32> %B, i32 2
+ %add8 = add i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
+ %vecext10 = extractelement <4 x i32> %B, i32 1
+ %vecext11 = extractelement <4 x i32> %B, i32 0
+ %add12 = add i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
+ ret <4 x i32> %vecinit13
+}
+; CHECK-LABEL: phadd_d_test2
+; SSE3-NOT: phaddd
+; SSSE3: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+
+define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
+ %vecext = extractelement <4 x i32> %A, i32 0
+ %vecext1 = extractelement <4 x i32> %A, i32 1
+ %sub = sub i32 %vecext, %vecext1
+ %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
+ %vecext2 = extractelement <4 x i32> %A, i32 2
+ %vecext3 = extractelement <4 x i32> %A, i32 3
+ %sub4 = sub i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
+ %vecext6 = extractelement <4 x i32> %B, i32 0
+ %vecext7 = extractelement <4 x i32> %B, i32 1
+ %sub8 = sub i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
+ %vecext10 = extractelement <4 x i32> %B, i32 2
+ %vecext11 = extractelement <4 x i32> %B, i32 3
+ %sub12 = sub i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
+ ret <4 x i32> %vecinit13
+}
+; CHECK-LABEL: phsub_d_test1
+; SSE3-NOT: phsubd
+; SSSE3: phsubd
+; AVX: vphsubd
+; AVX2: vphsubd
+; CHECK: ret
+
+
+define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
+ %vecext = extractelement <4 x i32> %A, i32 2
+ %vecext1 = extractelement <4 x i32> %A, i32 3
+ %sub = sub i32 %vecext, %vecext1
+ %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
+ %vecext2 = extractelement <4 x i32> %A, i32 0
+ %vecext3 = extractelement <4 x i32> %A, i32 1
+ %sub4 = sub i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
+ %vecext6 = extractelement <4 x i32> %B, i32 2
+ %vecext7 = extractelement <4 x i32> %B, i32 3
+ %sub8 = sub i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
+ %vecext10 = extractelement <4 x i32> %B, i32 0
+ %vecext11 = extractelement <4 x i32> %B, i32 1
+ %sub12 = sub i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
+ ret <4 x i32> %vecinit13
+}
+; CHECK-LABEL: phsub_d_test2
+; SSE3-NOT: phsubd
+; SSSE3: phsubd
+; AVX: vphsubd
+; AVX2: vphsubd
+; CHECK: ret
+
+
+define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
+ %vecext = extractelement <2 x double> %A, i32 0
+ %vecext1 = extractelement <2 x double> %A, i32 1
+ %add = fadd double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %add, i32 0
+ %vecext2 = extractelement <2 x double> %B, i32 0
+ %vecext3 = extractelement <2 x double> %B, i32 1
+ %add2 = fadd double %vecext2, %vecext3
+ %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: hadd_pd_test1
+; CHECK: haddpd
+; CHECK-NEXT: ret
+
+
+define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
+ %vecext = extractelement <2 x double> %A, i32 1
+ %vecext1 = extractelement <2 x double> %A, i32 0
+ %add = fadd double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %add, i32 0
+ %vecext2 = extractelement <2 x double> %B, i32 1
+ %vecext3 = extractelement <2 x double> %B, i32 0
+ %add2 = fadd double %vecext2, %vecext3
+ %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: hadd_pd_test2
+; CHECK: haddpd
+; CHECK-NEXT: ret
+
+
+define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
+ %vecext = extractelement <2 x double> %A, i32 0
+ %vecext1 = extractelement <2 x double> %A, i32 1
+ %sub = fsub double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %sub, i32 0
+ %vecext2 = extractelement <2 x double> %B, i32 0
+ %vecext3 = extractelement <2 x double> %B, i32 1
+ %sub2 = fsub double %vecext2, %vecext3
+ %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: hsub_pd_test1
+; CHECK: hsubpd
+; CHECK-NEXT: ret
+
+
+define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
+ %vecext = extractelement <2 x double> %B, i32 0
+ %vecext1 = extractelement <2 x double> %B, i32 1
+ %sub = fsub double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %sub, i32 1
+ %vecext2 = extractelement <2 x double> %A, i32 0
+ %vecext3 = extractelement <2 x double> %A, i32 1
+ %sub2 = fsub double %vecext2, %vecext3
+ %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: hsub_pd_test2
+; CHECK: hsubpd
+; CHECK-NEXT: ret
+
+
+define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
+ %vecext = extractelement <4 x double> %A, i32 0
+ %vecext1 = extractelement <4 x double> %A, i32 1
+ %add = fadd double %vecext, %vecext1
+ %vecinit = insertelement <4 x double> undef, double %add, i32 0
+ %vecext2 = extractelement <4 x double> %A, i32 2
+ %vecext3 = extractelement <4 x double> %A, i32 3
+ %add4 = fadd double %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
+ %vecext6 = extractelement <4 x double> %B, i32 0
+ %vecext7 = extractelement <4 x double> %B, i32 1
+ %add8 = fadd double %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
+ %vecext10 = extractelement <4 x double> %B, i32 2
+ %vecext11 = extractelement <4 x double> %B, i32 3
+ %add12 = fadd double %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
+ ret <4 x double> %vecinit13
+}
+; CHECK-LABEL: avx_vhadd_pd_test
+; SSE3: haddpd
+; SSE3-NEXT: haddpd
+; SSSE3: haddpd
+; SSSE3-NEXT: haddpd
+; AVX: vhaddpd
+; AVX: vhaddpd
+; AVX2: vhaddpd
+; AVX2: vhaddpd
+; CHECK: ret
+
+
+define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
+ %vecext = extractelement <4 x double> %A, i32 0
+ %vecext1 = extractelement <4 x double> %A, i32 1
+ %sub = fsub double %vecext, %vecext1
+ %vecinit = insertelement <4 x double> undef, double %sub, i32 0
+ %vecext2 = extractelement <4 x double> %A, i32 2
+ %vecext3 = extractelement <4 x double> %A, i32 3
+ %sub4 = fsub double %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
+ %vecext6 = extractelement <4 x double> %B, i32 0
+ %vecext7 = extractelement <4 x double> %B, i32 1
+ %sub8 = fsub double %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
+ %vecext10 = extractelement <4 x double> %B, i32 2
+ %vecext11 = extractelement <4 x double> %B, i32 3
+ %sub12 = fsub double %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
+ ret <4 x double> %vecinit13
+}
+; CHECK-LABEL: avx_vhsub_pd_test
+; SSE3: hsubpd
+; SSE3-NEXT: hsubpd
+; SSSE3: hsubpd
+; SSSE3-NEXT: hsubpd
+; AVX: vhsubpd
+; AVX: vhsubpd
+; AVX2: vhsubpd
+; AVX2: vhsubpd
+; CHECK: ret
+
+
+define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
+ %vecext = extractelement <8 x i32> %A, i32 0
+ %vecext1 = extractelement <8 x i32> %A, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <8 x i32> %A, i32 2
+ %vecext3 = extractelement <8 x i32> %A, i32 3
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
+ %vecext6 = extractelement <8 x i32> %A, i32 4
+ %vecext7 = extractelement <8 x i32> %A, i32 5
+ %add8 = add i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
+ %vecext10 = extractelement <8 x i32> %A, i32 6
+ %vecext11 = extractelement <8 x i32> %A, i32 7
+ %add12 = add i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
+ %vecext14 = extractelement <8 x i32> %B, i32 0
+ %vecext15 = extractelement <8 x i32> %B, i32 1
+ %add16 = add i32 %vecext14, %vecext15
+ %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
+ %vecext18 = extractelement <8 x i32> %B, i32 2
+ %vecext19 = extractelement <8 x i32> %B, i32 3
+ %add20 = add i32 %vecext18, %vecext19
+ %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
+ %vecext22 = extractelement <8 x i32> %B, i32 4
+ %vecext23 = extractelement <8 x i32> %B, i32 5
+ %add24 = add i32 %vecext22, %vecext23
+ %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
+ %vecext26 = extractelement <8 x i32> %B, i32 6
+ %vecext27 = extractelement <8 x i32> %B, i32 7
+ %add28 = add i32 %vecext26, %vecext27
+ %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
+ ret <8 x i32> %vecinit29
+}
+; CHECK-LABEL: avx2_vphadd_d_test
+; SSE3-NOT: phaddd
+; SSSE3: phaddd
+; SSSE3-NEXT: phaddd
+; AVX: vphaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
+ %vecext = extractelement <16 x i16> %a, i32 0
+ %vecext1 = extractelement <16 x i16> %a, i32 1
+ %add = add i16 %vecext, %vecext1
+ %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
+ %vecext4 = extractelement <16 x i16> %a, i32 2
+ %vecext6 = extractelement <16 x i16> %a, i32 3
+ %add8 = add i16 %vecext4, %vecext6
+ %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
+ %vecext11 = extractelement <16 x i16> %a, i32 4
+ %vecext13 = extractelement <16 x i16> %a, i32 5
+ %add15 = add i16 %vecext11, %vecext13
+ %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
+ %vecext18 = extractelement <16 x i16> %a, i32 6
+ %vecext20 = extractelement <16 x i16> %a, i32 7
+ %add22 = add i16 %vecext18, %vecext20
+ %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
+ %vecext25 = extractelement <16 x i16> %a, i32 8
+ %vecext27 = extractelement <16 x i16> %a, i32 9
+ %add29 = add i16 %vecext25, %vecext27
+ %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
+ %vecext32 = extractelement <16 x i16> %a, i32 10
+ %vecext34 = extractelement <16 x i16> %a, i32 11
+ %add36 = add i16 %vecext32, %vecext34
+ %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
+ %vecext39 = extractelement <16 x i16> %a, i32 12
+ %vecext41 = extractelement <16 x i16> %a, i32 13
+ %add43 = add i16 %vecext39, %vecext41
+ %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
+ %vecext46 = extractelement <16 x i16> %a, i32 14
+ %vecext48 = extractelement <16 x i16> %a, i32 15
+ %add50 = add i16 %vecext46, %vecext48
+ %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
+ %vecext53 = extractelement <16 x i16> %b, i32 0
+ %vecext55 = extractelement <16 x i16> %b, i32 1
+ %add57 = add i16 %vecext53, %vecext55
+ %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
+ %vecext60 = extractelement <16 x i16> %b, i32 2
+ %vecext62 = extractelement <16 x i16> %b, i32 3
+ %add64 = add i16 %vecext60, %vecext62
+ %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
+ %vecext67 = extractelement <16 x i16> %b, i32 4
+ %vecext69 = extractelement <16 x i16> %b, i32 5
+ %add71 = add i16 %vecext67, %vecext69
+ %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
+ %vecext74 = extractelement <16 x i16> %b, i32 6
+ %vecext76 = extractelement <16 x i16> %b, i32 7
+ %add78 = add i16 %vecext74, %vecext76
+ %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
+ %vecext81 = extractelement <16 x i16> %b, i32 8
+ %vecext83 = extractelement <16 x i16> %b, i32 9
+ %add85 = add i16 %vecext81, %vecext83
+ %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
+ %vecext88 = extractelement <16 x i16> %b, i32 10
+ %vecext90 = extractelement <16 x i16> %b, i32 11
+ %add92 = add i16 %vecext88, %vecext90
+ %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
+ %vecext95 = extractelement <16 x i16> %b, i32 12
+ %vecext97 = extractelement <16 x i16> %b, i32 13
+ %add99 = add i16 %vecext95, %vecext97
+ %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
+ %vecext102 = extractelement <16 x i16> %b, i32 14
+ %vecext104 = extractelement <16 x i16> %b, i32 15
+ %add106 = add i16 %vecext102, %vecext104
+ %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
+ ret <16 x i16> %vecinit108
+}
+; CHECK-LABEL: avx2_vphadd_w_test
+; SSE3-NOT: phaddw
+; SSSE3: phaddw
+; SSSE3-NEXT: phaddw
+; AVX: vphaddw
+; AVX: vphaddw
+; AVX2: vphaddw
+; AVX2: vphaddw
+; CHECK: ret
+
+
+; Verify that we don't select horizontal subs in the following functions.
+
+define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
+ %vecext = extractelement <4 x i32> %A, i32 0
+ %vecext1 = extractelement <4 x i32> %A, i32 1
+ %sub = sub i32 %vecext, %vecext1
+ %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
+ %vecext2 = extractelement <4 x i32> %A, i32 2
+ %vecext3 = extractelement <4 x i32> %A, i32 3
+ %sub4 = sub i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
+ %vecext6 = extractelement <4 x i32> %B, i32 1
+ %vecext7 = extractelement <4 x i32> %B, i32 0
+ %sub8 = sub i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
+ %vecext10 = extractelement <4 x i32> %B, i32 3
+ %vecext11 = extractelement <4 x i32> %B, i32 2
+ %sub12 = sub i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
+ ret <4 x i32> %vecinit13
+}
+; CHECK-LABEL: not_a_hsub_1
+; CHECK-NOT: phsubd
+; CHECK: ret
+
+
+define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
+ %vecext = extractelement <4 x float> %A, i32 2
+ %vecext1 = extractelement <4 x float> %A, i32 3
+ %sub = fsub float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %sub, i32 1
+ %vecext2 = extractelement <4 x float> %A, i32 0
+ %vecext3 = extractelement <4 x float> %A, i32 1
+ %sub4 = fsub float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
+ %vecext6 = extractelement <4 x float> %B, i32 3
+ %vecext7 = extractelement <4 x float> %B, i32 2
+ %sub8 = fsub float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
+ %vecext10 = extractelement <4 x float> %B, i32 0
+ %vecext11 = extractelement <4 x float> %B, i32 1
+ %sub12 = fsub float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: not_a_hsub_2
+; CHECK-NOT: hsubps
+; CHECK: ret
+
+
+define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
+ %vecext = extractelement <2 x double> %B, i32 0
+ %vecext1 = extractelement <2 x double> %B, i32 1
+ %sub = fsub double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %sub, i32 1
+ %vecext2 = extractelement <2 x double> %A, i32 1
+ %vecext3 = extractelement <2 x double> %A, i32 0
+ %sub2 = fsub double %vecext2, %vecext3
+ %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: not_a_hsub_3
+; CHECK-NOT: hsubpd
+; CHECK: ret
+
+
+; Test AVX horizontal add/sub of packed single/double precision
+; floating point values from 256-bit vectors.
+
+define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <8 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <8 x float> %a, i32 2
+ %vecext3 = extractelement <8 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
+ %vecext6 = extractelement <8 x float> %b, i32 0
+ %vecext7 = extractelement <8 x float> %b, i32 1
+ %add8 = fadd float %vecext6, %vecext7
+ %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
+ %vecext10 = extractelement <8 x float> %b, i32 2
+ %vecext11 = extractelement <8 x float> %b, i32 3
+ %add12 = fadd float %vecext10, %vecext11
+ %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
+ %vecext14 = extractelement <8 x float> %a, i32 4
+ %vecext15 = extractelement <8 x float> %a, i32 5
+ %add16 = fadd float %vecext14, %vecext15
+ %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
+ %vecext18 = extractelement <8 x float> %a, i32 6
+ %vecext19 = extractelement <8 x float> %a, i32 7
+ %add20 = fadd float %vecext18, %vecext19
+ %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
+ %vecext22 = extractelement <8 x float> %b, i32 4
+ %vecext23 = extractelement <8 x float> %b, i32 5
+ %add24 = fadd float %vecext22, %vecext23
+ %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
+ %vecext26 = extractelement <8 x float> %b, i32 6
+ %vecext27 = extractelement <8 x float> %b, i32 7
+ %add28 = fadd float %vecext26, %vecext27
+ %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
+ ret <8 x float> %vecinit29
+}
+; CHECK-LABEL: avx_vhadd_ps
+; SSE3: haddps
+; SSE3-NEXT: haddps
+; SSSE3: haddps
+; SSSE3-NEXT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK: ret
+
+
+define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %sub = fsub float %vecext, %vecext1
+ %vecinit = insertelement <8 x float> undef, float %sub, i32 0
+ %vecext2 = extractelement <8 x float> %a, i32 2
+ %vecext3 = extractelement <8 x float> %a, i32 3
+ %sub4 = fsub float %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
+ %vecext6 = extractelement <8 x float> %b, i32 0
+ %vecext7 = extractelement <8 x float> %b, i32 1
+ %sub8 = fsub float %vecext6, %vecext7
+ %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
+ %vecext10 = extractelement <8 x float> %b, i32 2
+ %vecext11 = extractelement <8 x float> %b, i32 3
+ %sub12 = fsub float %vecext10, %vecext11
+ %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
+ %vecext14 = extractelement <8 x float> %a, i32 4
+ %vecext15 = extractelement <8 x float> %a, i32 5
+ %sub16 = fsub float %vecext14, %vecext15
+ %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
+ %vecext18 = extractelement <8 x float> %a, i32 6
+ %vecext19 = extractelement <8 x float> %a, i32 7
+ %sub20 = fsub float %vecext18, %vecext19
+ %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
+ %vecext22 = extractelement <8 x float> %b, i32 4
+ %vecext23 = extractelement <8 x float> %b, i32 5
+ %sub24 = fsub float %vecext22, %vecext23
+ %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
+ %vecext26 = extractelement <8 x float> %b, i32 6
+ %vecext27 = extractelement <8 x float> %b, i32 7
+ %sub28 = fsub float %vecext26, %vecext27
+ %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
+ ret <8 x float> %vecinit29
+}
+; CHECK-LABEL: avx_vhsub_ps
+; SSE3: hsubps
+; SSE3-NEXT: hsubps
+; SSSE3: hsubps
+; SSSE3-NEXT: hsubps
+; AVX: vhsubps
+; AVX2: vhsubps
+; CHECK: ret
+
+
+define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
+ %vecext = extractelement <4 x double> %a, i32 0
+ %vecext1 = extractelement <4 x double> %a, i32 1
+ %add = fadd double %vecext, %vecext1
+ %vecinit = insertelement <4 x double> undef, double %add, i32 0
+ %vecext2 = extractelement <4 x double> %b, i32 0
+ %vecext3 = extractelement <4 x double> %b, i32 1
+ %add4 = fadd double %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
+ %vecext6 = extractelement <4 x double> %a, i32 2
+ %vecext7 = extractelement <4 x double> %a, i32 3
+ %add8 = fadd double %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
+ %vecext10 = extractelement <4 x double> %b, i32 2
+ %vecext11 = extractelement <4 x double> %b, i32 3
+ %add12 = fadd double %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
+ ret <4 x double> %vecinit13
+}
+; CHECK-LABEL: avx_hadd_pd
+; SSE3: haddpd
+; SSE3-NEXT: haddpd
+; SSSE3: haddpd
+; SSSE3-NEXT: haddpd
+; AVX: vhaddpd
+; AVX2: vhaddpd
+; CHECK: ret
+
+
+define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
+ %vecext = extractelement <4 x double> %a, i32 0
+ %vecext1 = extractelement <4 x double> %a, i32 1
+ %sub = fsub double %vecext, %vecext1
+ %vecinit = insertelement <4 x double> undef, double %sub, i32 0
+ %vecext2 = extractelement <4 x double> %b, i32 0
+ %vecext3 = extractelement <4 x double> %b, i32 1
+ %sub4 = fsub double %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
+ %vecext6 = extractelement <4 x double> %a, i32 2
+ %vecext7 = extractelement <4 x double> %a, i32 3
+ %sub8 = fsub double %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
+ %vecext10 = extractelement <4 x double> %b, i32 2
+ %vecext11 = extractelement <4 x double> %b, i32 3
+ %sub12 = fsub double %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
+ ret <4 x double> %vecinit13
+}
+; CHECK-LABEL: avx_hsub_pd
+; SSE3: hsubpd
+; SSE3-NEXT: hsubpd
+; SSSE3: hsubpd
+; SSSE3-NEXT: hsubpd
+; AVX: vhsubpd
+; AVX2: vhsubpd
+; CHECK: ret
+
+
+; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
+
+define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
+ %vecext = extractelement <8 x i32> %a, i32 0
+ %vecext1 = extractelement <8 x i32> %a, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <8 x i32> %a, i32 2
+ %vecext3 = extractelement <8 x i32> %a, i32 3
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
+ %vecext6 = extractelement <8 x i32> %b, i32 0
+ %vecext7 = extractelement <8 x i32> %b, i32 1
+ %add8 = add i32 %vecext6, %vecext7
+ %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
+ %vecext10 = extractelement <8 x i32> %b, i32 2
+ %vecext11 = extractelement <8 x i32> %b, i32 3
+ %add12 = add i32 %vecext10, %vecext11
+ %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
+ %vecext14 = extractelement <8 x i32> %a, i32 4
+ %vecext15 = extractelement <8 x i32> %a, i32 5
+ %add16 = add i32 %vecext14, %vecext15
+ %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
+ %vecext18 = extractelement <8 x i32> %a, i32 6
+ %vecext19 = extractelement <8 x i32> %a, i32 7
+ %add20 = add i32 %vecext18, %vecext19
+ %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
+ %vecext22 = extractelement <8 x i32> %b, i32 4
+ %vecext23 = extractelement <8 x i32> %b, i32 5
+ %add24 = add i32 %vecext22, %vecext23
+ %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
+ %vecext26 = extractelement <8 x i32> %b, i32 6
+ %vecext27 = extractelement <8 x i32> %b, i32 7
+ %add28 = add i32 %vecext26, %vecext27
+ %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
+ ret <8 x i32> %vecinit29
+}
+; CHECK-LABEL: avx2_hadd_d
+; SSE3-NOT: phaddd
+; SSSE3: phaddd
+; SSSE3-NEXT: phaddd
+; AVX: vphaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; AVX2-NOT: vphaddd
+; CHECK: ret
+
+
+define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
+ %vecext = extractelement <16 x i16> %a, i32 0
+ %vecext1 = extractelement <16 x i16> %a, i32 1
+ %add = add i16 %vecext, %vecext1
+ %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
+ %vecext4 = extractelement <16 x i16> %a, i32 2
+ %vecext6 = extractelement <16 x i16> %a, i32 3
+ %add8 = add i16 %vecext4, %vecext6
+ %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
+ %vecext11 = extractelement <16 x i16> %a, i32 4
+ %vecext13 = extractelement <16 x i16> %a, i32 5
+ %add15 = add i16 %vecext11, %vecext13
+ %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
+ %vecext18 = extractelement <16 x i16> %a, i32 6
+ %vecext20 = extractelement <16 x i16> %a, i32 7
+ %add22 = add i16 %vecext18, %vecext20
+ %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
+ %vecext25 = extractelement <16 x i16> %a, i32 8
+ %vecext27 = extractelement <16 x i16> %a, i32 9
+ %add29 = add i16 %vecext25, %vecext27
+ %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
+ %vecext32 = extractelement <16 x i16> %a, i32 10
+ %vecext34 = extractelement <16 x i16> %a, i32 11
+ %add36 = add i16 %vecext32, %vecext34
+ %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
+ %vecext39 = extractelement <16 x i16> %a, i32 12
+ %vecext41 = extractelement <16 x i16> %a, i32 13
+ %add43 = add i16 %vecext39, %vecext41
+ %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
+ %vecext46 = extractelement <16 x i16> %a, i32 14
+ %vecext48 = extractelement <16 x i16> %a, i32 15
+ %add50 = add i16 %vecext46, %vecext48
+ %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
+ %vecext53 = extractelement <16 x i16> %b, i32 0
+ %vecext55 = extractelement <16 x i16> %b, i32 1
+ %add57 = add i16 %vecext53, %vecext55
+ %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
+ %vecext60 = extractelement <16 x i16> %b, i32 2
+ %vecext62 = extractelement <16 x i16> %b, i32 3
+ %add64 = add i16 %vecext60, %vecext62
+ %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
+ %vecext67 = extractelement <16 x i16> %b, i32 4
+ %vecext69 = extractelement <16 x i16> %b, i32 5
+ %add71 = add i16 %vecext67, %vecext69
+ %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
+ %vecext74 = extractelement <16 x i16> %b, i32 6
+ %vecext76 = extractelement <16 x i16> %b, i32 7
+ %add78 = add i16 %vecext74, %vecext76
+ %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
+ %vecext81 = extractelement <16 x i16> %b, i32 8
+ %vecext83 = extractelement <16 x i16> %b, i32 9
+ %add85 = add i16 %vecext81, %vecext83
+ %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
+ %vecext88 = extractelement <16 x i16> %b, i32 10
+ %vecext90 = extractelement <16 x i16> %b, i32 11
+ %add92 = add i16 %vecext88, %vecext90
+ %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
+ %vecext95 = extractelement <16 x i16> %b, i32 12
+ %vecext97 = extractelement <16 x i16> %b, i32 13
+ %add99 = add i16 %vecext95, %vecext97
+ %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
+ %vecext102 = extractelement <16 x i16> %b, i32 14
+ %vecext104 = extractelement <16 x i16> %b, i32 15
+ %add106 = add i16 %vecext102, %vecext104
+ %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
+ ret <16 x i16> %vecinit108
+}
+; CHECK-LABEL: avx2_hadd_w
+; SSE3-NOT: phaddw
+; SSSE3: phaddw
+; SSSE3-NEXT: phaddw
+; AVX: vphaddw
+; AVX: vphaddw
+; AVX2: vphaddw
+; AVX2-NOT: vphaddw
+; CHECK: ret
+
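The tests above all exercise the same selection pattern: a build vector whose
lanes are pairwise sums (or differences) of adjacent elements of the inputs.
As a rough C picture of that pattern, here is a sketch using the GCC/Clang
vector-subscript extension next to the real SSE3 intrinsic; whether the scalar
form is actually matched to a single haddps depends on the optimization level
and cost model, so treat it as illustrative rather than guaranteed output.

#include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */

/* Scalar form of the pattern: pairwise sums of adjacent lanes, low
   half from a, high half from b (compile with clang -O2 -msse3). */
__m128 hadd_scalar(__m128 a, __m128 b) {
  return _mm_set_ps(b[2] + b[3],   /* lane 3 */
                    b[0] + b[1],   /* lane 2 */
                    a[2] + a[3],   /* lane 1 */
                    a[0] + a[1]);  /* lane 0 */
}

/* Intrinsic form: maps directly to haddps (vhaddps with AVX). */
__m128 hadd_intrin(__m128 a, __m128 b) {
  return _mm_hadd_ps(a, b);
}
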
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
new file mode 100644
index 0000000..954a9d9
--- /dev/null
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -0,0 +1,325 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.
+
+define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %a, i32 2
+ %vecext3 = extractelement <4 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+ %vecext10 = extractelement <4 x float> %b, i32 2
+ %vecext11 = extractelement <4 x float> %b, i32 3
+ %add12 = fadd float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: test1_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext6 = extractelement <4 x float> %b, i32 0
+ %vecext7 = extractelement <4 x float> %b, i32 1
+ %add8 = fadd float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
+ %vecext10 = extractelement <4 x float> %b, i32 2
+ %vecext11 = extractelement <4 x float> %b, i32 3
+ %add12 = fadd float %vecext10, %vecext11
+ %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+ ret <4 x float> %vecinit13
+}
+; CHECK-LABEL: test2_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %a, i32 2
+ %vecext3 = extractelement <4 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+ %vecext6 = extractelement <4 x float> %b, i32 0
+ %vecext7 = extractelement <4 x float> %b, i32 1
+ %add8 = fadd float %vecext6, %vecext7
+ %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+ ret <4 x float> %vecinit9
+}
+; CHECK-LABEL: test3_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ ret <4 x float> %vecinit
+}
+; CHECK-LABEL: test4_undef
+; CHECK-NOT: haddps
+; CHECK: ret
+
+
+define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
+ %vecext = extractelement <2 x double> %a, i32 0
+ %vecext1 = extractelement <2 x double> %a, i32 1
+ %add = fadd double %vecext, %vecext1
+ %vecinit = insertelement <2 x double> undef, double %add, i32 0
+ ret <2 x double> %vecinit
+}
+; CHECK-LABEL: test5_undef
+; CHECK-NOT: haddpd
+; CHECK: ret
+
+
+define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %a, i32 2
+ %vecext3 = extractelement <4 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+ ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test6_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %b, i32 0
+ %vecext1 = extractelement <4 x float> %b, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 2
+ %vecext2 = extractelement <4 x float> %b, i32 2
+ %vecext3 = extractelement <4 x float> %b, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
+ ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test7_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %a, i32 2
+ %vecext3 = extractelement <4 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
+ ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test8_undef
+; CHECK-NOT: haddps
+; CHECK: ret
+
+
+define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
+ %vecext = extractelement <4 x float> %a, i32 0
+ %vecext1 = extractelement <4 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <4 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <4 x float> %b, i32 2
+ %vecext3 = extractelement <4 x float> %b, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
+ ret <4 x float> %vecinit5
+}
+; CHECK-LABEL: test9_undef
+; CHECK: haddps
+; CHECK-NEXT: ret
+
+define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <8 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <8 x float> %b, i32 2
+ %vecext3 = extractelement <8 x float> %b, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
+ ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test10_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <8 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <8 x float> %b, i32 4
+ %vecext3 = extractelement <8 x float> %b, i32 5
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
+ ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test11_undef
+; SSE-NOT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK: ret
+
+define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %add = fadd float %vecext, %vecext1
+ %vecinit = insertelement <8 x float> undef, float %add, i32 0
+ %vecext2 = extractelement <8 x float> %a, i32 2
+ %vecext3 = extractelement <8 x float> %a, i32 3
+ %add4 = fadd float %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
+ ret <8 x float> %vecinit5
+}
+; CHECK-LABEL: test12_undef
+; SSE: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %a, i32 1
+ %add1 = fadd float %vecext, %vecext1
+ %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
+ %vecext2 = extractelement <8 x float> %a, i32 2
+ %vecext3 = extractelement <8 x float> %a, i32 3
+ %add2 = fadd float %vecext2, %vecext3
+ %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
+ %vecext4 = extractelement <8 x float> %a, i32 4
+ %vecext5 = extractelement <8 x float> %a, i32 5
+ %add3 = fadd float %vecext4, %vecext5
+ %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
+ %vecext6 = extractelement <8 x float> %a, i32 6
+ %vecext7 = extractelement <8 x float> %a, i32 7
+ %add4 = fadd float %vecext6, %vecext7
+ %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
+ ret <8 x float> %vecinit4
+}
+; CHECK-LABEL: test13_undef
+; SSE: haddps
+; SSE-NOT: haddps
+; AVX: vhaddps
+; AVX2: vhaddps
+; CHECK-NOT: haddps
+; CHECK: ret
+
+define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
+ %vecext = extractelement <8 x i32> %a, i32 0
+ %vecext1 = extractelement <8 x i32> %a, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <8 x i32> %b, i32 2
+ %vecext3 = extractelement <8 x i32> %b, i32 3
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
+ ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test14_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: phaddd
+; CHECK: ret
+
+; On AVX2, the following sequence can be folded into a single horizontal add.
+; If the subtarget doesn't support AVX2, we avoid emitting two packed integer
+; horizontal adds and instead emit two scalar adds followed by vector inserts.
+define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
+ %vecext = extractelement <8 x i32> %a, i32 0
+ %vecext1 = extractelement <8 x i32> %a, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <8 x i32> %b, i32 4
+ %vecext3 = extractelement <8 x i32> %b, i32 5
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
+ ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test15_undef
+; SSE-NOT: phaddd
+; AVX-NOT: vphaddd
+; AVX2: vphaddd
+; CHECK: ret
+
+define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
+ %vecext = extractelement <8 x i32> %a, i32 0
+ %vecext1 = extractelement <8 x i32> %a, i32 1
+ %add = add i32 %vecext, %vecext1
+ %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
+ %vecext2 = extractelement <8 x i32> %a, i32 2
+ %vecext3 = extractelement <8 x i32> %a, i32 3
+ %add4 = add i32 %vecext2, %vecext3
+ %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
+ ret <8 x i32> %vecinit5
+}
+; CHECK-LABEL: test16_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: phaddd
+; CHECK: ret
+
+define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
+ %vecext = extractelement <8 x i32> %a, i32 0
+ %vecext1 = extractelement <8 x i32> %a, i32 1
+ %add1 = add i32 %vecext, %vecext1
+ %vecinit1 = insertelement <8 x i32> undef, i32 %add1, i32 0
+ %vecext2 = extractelement <8 x i32> %a, i32 2
+ %vecext3 = extractelement <8 x i32> %a, i32 3
+ %add2 = add i32 %vecext2, %vecext3
+ %vecinit2 = insertelement <8 x i32> %vecinit1, i32 %add2, i32 1
+ %vecext4 = extractelement <8 x i32> %a, i32 4
+ %vecext5 = extractelement <8 x i32> %a, i32 5
+ %add3 = add i32 %vecext4, %vecext5
+ %vecinit3 = insertelement <8 x i32> %vecinit2, i32 %add3, i32 2
+ %vecext6 = extractelement <8 x i32> %a, i32 6
+ %vecext7 = extractelement <8 x i32> %a, i32 7
+ %add4 = add i32 %vecext6, %vecext7
+ %vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
+ ret <8 x i32> %vecinit4
+}
+; CHECK-LABEL: test17_undef
+; SSE: phaddd
+; AVX: vphaddd
+; AVX2: vphaddd
+; CHECK-NOT: phaddd
+; CHECK: ret
+
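In C terms, the undef lanes in these tests correspond to vector lanes that are
never written. A minimal sketch of how such IR arises (it deliberately reads
an uninitialized vector, which mirrors IR undef but is not something
production code should do):

#include <pmmintrin.h>

/* Only lanes 0 and 1 are defined; the remaining lanes stay undef,
   and per the checks above the backend may still select one haddps. */
__m128 hadd_partial(__m128 a) {
  __m128 r;            /* intentionally uninitialized: undef lanes 2,3 */
  r[0] = a[0] + a[1];
  r[1] = a[2] + a[3];
  return r;
}
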
diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll
new file mode 100644
index 0000000..ba846f3
--- /dev/null
+++ b/test/CodeGen/X86/i8-umulo.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s
+; PR19858
+
+declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
+define i8 @testumulo(i32 %argc) {
+; CHECK: imulw
+; CHECK: testb %{{.+}}, %{{.+}}
+; CHECK: je [[NOOVERFLOWLABEL:.+]]
+; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
+; CHECK-NEXT: movb
+; CHECK-NEXT: retl
+top:
+ %RHS = trunc i32 %argc to i8
+ %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS)
+ %ex = extractvalue { i8, i1 } %umul, 1
+ br i1 %ex, label %overflow, label %nooverflow
+
+overflow:
+ ret i8 %RHS
+
+nooverflow:
+ %umul.value = extractvalue { i8, i1 } %umul, 0
+ ret i8 %umul.value
+}
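The same check can be written at the C level with the overflow builtins; clang
lowers __builtin_mul_overflow on unsigned char operands to an
llvm.umul.with.overflow intrinsic like the one above (the exact intrinsic
width is an assumption here and may vary between compiler versions):

/* C analogue of @testumulo: multiply by 25 with an overflow check. */
unsigned char testumulo_c(unsigned char rhs) {
  unsigned char prod;
  if (__builtin_mul_overflow((unsigned char)25, rhs, &prod))
    return rhs;  /* mirrors the IR's %overflow block */
  return prod;   /* mirrors the IR's %nooverflow block */
}
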
diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll
new file mode 100644
index 0000000..f3691fd
--- /dev/null
+++ b/test/CodeGen/X86/jump_table_alias.ll
@@ -0,0 +1,33 @@
+; RUN: llc <%s -jump-table-type=single | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @f() unnamed_addr jumptable {
+entry:
+ ret i32 0
+}
+
+@i = alias internal i32 ()* @f
+@j = alias i32 ()* @f
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %temp = alloca i32 ()*, align 8
+ store i32 ()* @i, i32()** %temp, align 8
+; CHECK: movq $__llvm_jump_instr_table_0_1
+ %1 = load i32 ()** %temp, align 8
+; CHECK: movl $__llvm_jump_instr_table_0_1
+ %2 = call i32 ()* %1()
+ %3 = call i32 ()* @i()
+; CHECK: callq i
+ %4 = call i32 ()* @j()
+; CHECK: callq j
+ ret i32 %3
+}
+
+; There should only be one table, even though there are two GlobalAliases,
+; because they both alias the same value.
+
+; CHECK: .globl __llvm_jump_instr_table_0_1
+; CHECK: .align 8, 0x90
+; CHECK: .type __llvm_jump_instr_table_0_1,@function
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp f@PLT
+
diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll
new file mode 100644
index 0000000..33a798f
--- /dev/null
+++ b/test/CodeGen/X86/jump_table_bitcast.ll
@@ -0,0 +1,46 @@
+; RUN: llc <%s -jump-table-type=single | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @f() unnamed_addr jumptable {
+ ret i32 0
+}
+
+define i32 @g(i8* %a) unnamed_addr jumptable {
+ ret i32 0
+}
+
+define void @h(void ()* %func) unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @main() {
+ %g = alloca i32 (...)*, align 8
+ store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
+; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp)
+; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %ecx
+ %1 = load i32 (...)** %g, align 8
+ %call = call i32 (...)* %1()
+ call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
+; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, %edi
+; CHECK: callq h
+
+ %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
+; CHECK: callq g
+ ret i32 %a
+}
+
+; CHECK: .globl __llvm_jump_instr_table_0_1
+; CHECK: .align 8, 0x90
+; CHECK: .type __llvm_jump_instr_table_0_1,@function
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp {{f|g|h}}@PLT
+; CHECK: .globl __llvm_jump_instr_table_0_2
+; CHECK: .align 8, 0x90
+; CHECK: .type __llvm_jump_instr_table_0_2,@function
+; CHECK: __llvm_jump_instr_table_0_2:
+; CHECK: jmp {{f|g|h}}@PLT
+; CHECK: .globl __llvm_jump_instr_table_0_3
+; CHECK: .align 8, 0x90
+; CHECK: .type __llvm_jump_instr_table_0_3,@function
+; CHECK: __llvm_jump_instr_table_0_3:
+; CHECK: jmp {{f|g|h}}@PLT
+
diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll
new file mode 100644
index 0000000..5a0aed0
--- /dev/null
+++ b/test/CodeGen/X86/jump_tables.ll
@@ -0,0 +1,272 @@
+; RUN: llc <%s -jump-table-type=single | FileCheck --check-prefix=SINGLE %s
+; RUN: llc <%s -jump-table-type=arity | FileCheck --check-prefix=ARITY %s
+; RUN: llc <%s -jump-table-type=simplified | FileCheck --check-prefix=SIMPL %s
+; RUN: llc <%s -jump-table-type=full | FileCheck --check-prefix=FULL %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.fun_struct = type { i32 (...)* }
+
+define void @indirect_fun() unnamed_addr jumptable {
+ ret void
+}
+
+define void @indirect_fun_match() unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @indirect_fun_i32() unnamed_addr jumptable {
+ ret i32 0
+}
+
+define i32 @indirect_fun_i32_1(i32 %a) unnamed_addr jumptable {
+ ret i32 %a
+}
+
+define i32 @indirect_fun_i32_2(i32 %a, i32 %b) unnamed_addr jumptable {
+ ret i32 %a
+}
+
+define i32* @indirect_fun_i32S_2(i32* %a, i32 %b) unnamed_addr jumptable {
+ ret i32* %a
+}
+
+define void @indirect_fun_struct(%struct.fun_struct %fs) unnamed_addr jumptable {
+ ret void
+}
+
+define void @indirect_fun_fun(i32 (...)* %fun, i32 %a) unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @indirect_fun_fun_ret(i32 (...)* %fun, i32 %a) unnamed_addr jumptable {
+ ret i32 %a
+}
+
+define void @indirect_fun_array([19 x i8] %a) unnamed_addr jumptable {
+ ret void
+}
+
+define void @indirect_fun_vec(<3 x i32> %a) unnamed_addr jumptable {
+ ret void
+}
+
+define void @indirect_fun_vec_2(<4 x float> %a) unnamed_addr jumptable {
+ ret void
+}
+
+define i32 @m(void ()* %fun) {
+ call void ()* %fun()
+ ret i32 0
+}
+
+define void ()* @get_fun() {
+ ret void ()* @indirect_fun
+; SINGLE: movl $__llvm_jump_instr_table_0_
+; ARITY: movl $__llvm_jump_instr_table_
+; SIMPL: movl $__llvm_jump_instr_table_
+; FULL: movl $__llvm_jump_instr_table_
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %f = call void ()* ()* @get_fun()
+ %a = call i32 @m(void ()* %f)
+ ret i32 %a
+}
+
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_1
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_1,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_1:
+; SINGLE-DAG: jmp indirect_fun_array@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_2
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_2,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_2:
+; SINGLE-DAG: jmp indirect_fun_i32_2@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_3
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_3,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_3:
+; SINGLE-DAG: jmp indirect_fun_vec_2@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_4
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_4,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_4:
+; SINGLE-DAG: jmp indirect_fun_i32S_2@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_5
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_5,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_5:
+; SINGLE-DAG: jmp indirect_fun_struct@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_6
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_6,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_6:
+; SINGLE-DAG: jmp indirect_fun_i32_1@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_7
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_7,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_7:
+; SINGLE-DAG: jmp indirect_fun_i32@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_8
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_8,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_8:
+; SINGLE-DAG: jmp indirect_fun_fun@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_9
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_9,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_9:
+; SINGLE-DAG: jmp indirect_fun_fun_ret@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_10
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_10,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_10:
+; SINGLE-DAG: jmp indirect_fun@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_11
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_11,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_11:
+; SINGLE-DAG: jmp indirect_fun_match@PLT
+; SINGLE-DAG: .globl __llvm_jump_instr_table_0_12
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: .type __llvm_jump_instr_table_0_12,@function
+; SINGLE-DAG: __llvm_jump_instr_table_0_12:
+; SINGLE-DAG: jmp indirect_fun_vec@PLT
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: ud2
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: ud2
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: ud2
+; SINGLE-DAG: .align 8, 0x90
+; SINGLE-DAG: ud2
+
+
+; ARITY-DAG: .globl __llvm_jump_instr_table_2_1
+; ARITY-DAG: .align 8, 0x90
+; ARITY-DAG: .type __llvm_jump_instr_table_2_1,@function
+; ARITY-DAG: __llvm_jump_instr_table_2_1:
+; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
+; ARITY-DAG: .align 8, 0x90
+; ARITY-DAG: ud2
+; ARITY-DAG: .globl __llvm_jump_instr_table_0_1
+; ARITY-DAG: .align 8, 0x90
+; ARITY-DAG: .type __llvm_jump_instr_table_0_1,@function
+; ARITY-DAG: __llvm_jump_instr_table_0_1:
+; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
+; ARITY-DAG: .globl __llvm_jump_instr_table_1_1
+; ARITY-DAG: .align 8, 0x90
+; ARITY-DAG: .type __llvm_jump_instr_table_1_1,@function
+; ARITY-DAG: __llvm_jump_instr_table_1_1:
+; ARITY-DAG: jmp indirect_fun{{.*}}@PLT
+
+; SIMPL-DAG: .globl __llvm_jump_instr_table_2_1
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: .type __llvm_jump_instr_table_2_1,@function
+; SIMPL-DAG: __llvm_jump_instr_table_2_1:
+; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: ud2
+; SIMPL-DAG: .globl __llvm_jump_instr_table_0_1
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: .type __llvm_jump_instr_table_0_1,@function
+; SIMPL-DAG: __llvm_jump_instr_table_0_1:
+; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
+; SIMPL-DAG: .globl __llvm_jump_instr_table_1_1
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: .type __llvm_jump_instr_table_1_1,@function
+; SIMPL-DAG: __llvm_jump_instr_table_1_1:
+; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
+; SIMPL-DAG: .globl __llvm_jump_instr_table_3_1
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: .type __llvm_jump_instr_table_3_1,@function
+; SIMPL-DAG: __llvm_jump_instr_table_3_1:
+; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
+; SIMPL-DAG: .globl __llvm_jump_instr_table_4_1
+; SIMPL-DAG: .align 8, 0x90
+; SIMPL-DAG: .type __llvm_jump_instr_table_4_1,@function
+; SIMPL-DAG: __llvm_jump_instr_table_4_1:
+; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT
+
+
+; FULL-DAG: .globl __llvm_jump_instr_table_10_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_10_1,@function
+; FULL-DAG:__llvm_jump_instr_table_10_1:
+; FULL-DAG: jmp indirect_fun_i32_1@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_9_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_9_1,@function
+; FULL-DAG:__llvm_jump_instr_table_9_1:
+; FULL-DAG: jmp indirect_fun_i32_2@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_7_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_7_1,@function
+; FULL-DAG:__llvm_jump_instr_table_7_1:
+; FULL-DAG: jmp indirect_fun_i32S_2@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_3_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_3_1,@function
+; FULL-DAG:__llvm_jump_instr_table_3_1:
+; FULL-DAG: jmp indirect_fun_vec_2@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_2_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_2_1,@function
+; FULL-DAG:__llvm_jump_instr_table_2_1:
+; FULL-DAG: jmp indirect_fun@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_8_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_8_1,@function
+; FULL-DAG:__llvm_jump_instr_table_8_1:
+; FULL-DAG: jmp indirect_fun_i32@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_1_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_1_1,@function
+; FULL-DAG:__llvm_jump_instr_table_1_1:
+; FULL-DAG: jmp indirect_fun_array@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_0_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_0_1,@function
+; FULL-DAG:__llvm_jump_instr_table_0_1:
+; FULL-DAG: jmp indirect_fun_vec@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_6_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_6_1,@function
+; FULL-DAG:__llvm_jump_instr_table_6_1:
+; FULL-DAG: jmp indirect_fun_struct@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_5_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_5_1,@function
+; FULL-DAG:__llvm_jump_instr_table_5_1:
+; FULL-DAG: jmp indirect_fun_fun@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
+; FULL-DAG: .globl __llvm_jump_instr_table_4_1
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: .type __llvm_jump_instr_table_4_1,@function
+; FULL-DAG:__llvm_jump_instr_table_4_1:
+; FULL-DAG: jmp indirect_fun_fun_ret@PLT
+; FULL-DAG: .align 8, 0x90
+; FULL-DAG: ud2
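Each table in the checks above is a sequence of 8-byte-aligned "jmp <fn>@PLT"
stubs padded out with ud2 traps, and indirect calls go through a table entry
rather than the raw function address. A runnable C sketch of that model (the
stub here is an ordinary function standing in for a generated
__llvm_jump_instr_table_0_1 entry; it is a conceptual picture, not the pass's
actual output):

#include <stdio.h>

static void real_fn(void) { puts("real_fn"); }

/* Stand-in for the generated jmp stub: all a table entry does is
   forward control to the real function body. */
static void jump_table_0_1(void) { real_fn(); }

int main(void) {
  void (*fp)(void) = jump_table_0_1;  /* taken address is the stub */
  fp();                               /* indirect calls land in the table */
  return 0;
}
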
diff --git a/test/CodeGen/X86/libcall-sret.ll b/test/CodeGen/X86/libcall-sret.ll
new file mode 100644
index 0000000..67b99ac
--- /dev/null
+++ b/test/CodeGen/X86/libcall-sret.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=i686-linux-gnu -o - %s | FileCheck %s
+
+@var = global i128 0
+
+; We were trying to convert the i128 operation into a libcall, but failing to
+; perform sret demotion when we couldn't return the result in registers. Make
+; sure we marshal the return properly:
+
+define void @test_sret_libcall(i128 %l, i128 %r) {
+; CHECK-LABEL: test_sret_libcall:
+
+ ; Stack for call: 4(sret ptr), 16(i128 %l), 16(i128 %r). So the next logical
+ ; (aligned) place for the actual sret data is %esp + 40.
+; CHECK: leal 40(%esp), [[SRET_ADDR:%[a-z]+]]
+; CHECK: movl [[SRET_ADDR]], (%esp)
+; CHECK: calll __multi3
+; CHECK-DAG: movl 40(%esp), [[RES0:%[a-z]+]]
+; CHECK-DAG: movl 44(%esp), [[RES1:%[a-z]+]]
+; CHECK-DAG: movl 48(%esp), [[RES2:%[a-z]+]]
+; CHECK-DAG: movl 52(%esp), [[RES3:%[a-z]+]]
+; CHECK-DAG: movl [[RES0]], var
+; CHECK-DAG: movl [[RES1]], var+4
+; CHECK-DAG: movl [[RES2]], var+8
+; CHECK-DAG: movl [[RES3]], var+12
+ %prod = mul i128 %l, %r
+ store i128 %prod, i128* @var
+ ret void
+}
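Sret demotion means the 128-bit product comes back through a hidden pointer
argument instead of in registers. A C sketch of that calling convention (the
struct layout and the multi3_sret name are illustrative stand-ins, not
compiler-rt's real __multi3 signature, and the stub body below does not
perform the actual multiply):

#include <string.h>

typedef struct { unsigned int w[4]; } u128;  /* 4 x 32-bit words */

/* Stand-in for the demoted libcall: result written through sret. */
static void multi3_sret(u128 *sret, u128 l, u128 r) {
  (void)l; (void)r;
  memset(sret, 0, sizeof *sret);  /* real __multi3 computes l * r */
}

u128 mul128(u128 l, u128 r) {
  u128 prod;                 /* caller-allocated return slot */
  multi3_sret(&prod, l, r);  /* hidden pointer = &prod, as checked above */
  return prod;
}
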
diff --git a/test/CodeGen/X86/lit.local.cfg b/test/CodeGen/X86/lit.local.cfg
index 3d91b03..8ed58f1 100644
--- a/test/CodeGen/X86/lit.local.cfg
+++ b/test/CodeGen/X86/lit.local.cfg
@@ -6,7 +6,6 @@
# cleanly.
config.suffixes = ['.ll', '.test', '.txt']
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index b9b29a5..f47161e 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
define double @test1(double %A) {
@@ -9,14 +10,19 @@ define double @test1(double %A) {
}
; FIXME: Ideally we should be able to fold the entire body of @test1 into a
; single paddd instruction. At the moment we produce the sequence
-; pshufd+paddq+pshufd.
-
+; pshufd+paddq+pshufd. This is fixed with the widening legalization.
+;
; CHECK-LABEL: test1
; CHECK-NOT: movsd
; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
; CHECK-NEXT: pshufd
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test1
+; CHECK-WIDE-NOT: movsd
+; CHECK-WIDE: paddd
+; CHECK-WIDE-NEXT: ret
define double @test2(double %A, double %B) {
@@ -26,17 +32,15 @@ define double @test2(double %A, double %B) {
%3 = bitcast <2 x i32> %add to double
ret double %3
}
-; FIXME: Ideally we should be able to fold the entire body of @test2 into a
-; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddq+pshufd.
-
; CHECK-LABEL: test2
; CHECK-NOT: movsd
-; CHECK: pshufd
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: paddq
-; CHECK-NEXT: pshufd
+; CHECK: paddd
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test2
+; CHECK-WIDE-NOT: movsd
+; CHECK-WIDE: paddd
+; CHECK-WIDE-NEXT: ret
define i64 @test3(i64 %A) {
@@ -50,6 +54,12 @@ define i64 @test3(i64 %A) {
; CHECK: addps
; CHECK-NOT: pshufd
; CHECK: ret
+;
+; CHECK-WIDE-LABEL: test3
+; CHECK-WIDE-NOT: pshufd
+; CHECK-WIDE: addps
+; CHECK-WIDE-NOT: pshufd
+; CHECK-WIDE: ret
define i64 @test4(i64 %A) {
@@ -59,13 +69,20 @@ define i64 @test4(i64 %A) {
ret i64 %2
}
; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
-; Ideally, we should fold that sequence into a single paddd.
-
+; Ideally, we should fold that sequence into a single paddd. This is fixed with
+; the widening legalization.
+;
; CHECK-LABEL: test4
; CHECK: pshufd
; CHECK-NEXT: paddq
; CHECK-NEXT: pshufd
; CHECK: ret
+;
+; CHECK-WIDE-LABEL: test4
+; CHECK-WIDE: movd %{{rdi|rcx}},
+; CHECK-WIDE-NEXT: paddd
+; CHECK-WIDE-NEXT: movd {{.*}}, %rax
+; CHECK-WIDE: ret
define double @test5(double %A) {
@@ -77,6 +94,10 @@ define double @test5(double %A) {
; CHECK-LABEL: test5
; CHECK: addps
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test5
+; CHECK-WIDE: addps
+; CHECK-WIDE-NEXT: ret
define double @test6(double %A) {
@@ -86,14 +107,20 @@ define double @test6(double %A) {
ret double %2
}
; FIXME: Ideally we should be able to fold the entire body of @test6 into a
-; single paddw instruction.
-
+; single paddw instruction. This is fixed with the widening legalization.
+;
; CHECK-LABEL: test6
; CHECK-NOT: movsd
; CHECK: punpcklwd
-; CHECK-NEXT: paddd
+; CHECK-NEXT: paddw
; CHECK-NEXT: pshufb
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test6
+; CHECK-WIDE-NOT: mov
+; CHECK-WIDE-NOT: punpcklwd
+; CHECK-WIDE: paddw
+; CHECK-WIDE-NEXT: ret
define double @test7(double %A, double %B) {
@@ -103,17 +130,17 @@ define double @test7(double %A, double %B) {
%3 = bitcast <4 x i16> %add to double
ret double %3
}
-; FIXME: Ideally we should be able to fold the entire body of @test7 into a
-; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddd+pshufd.
-
; CHECK-LABEL: test7
; CHECK-NOT: movsd
-; CHECK: punpcklwd
-; CHECK-NEXT: punpcklwd
-; CHECK-NEXT: paddd
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklwd
+; CHECK: paddw
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test7
+; CHECK-WIDE-NOT: movsd
+; CHECK-WIDE-NOT: punpcklwd
+; CHECK-WIDE: paddw
+; CHECK-WIDE-NEXT: ret
define double @test8(double %A) {
@@ -124,14 +151,20 @@ define double @test8(double %A) {
}
; FIXME: Ideally we should be able to fold the entire body of @test8 into a
; single paddb instruction. At the moment we produce the sequence
-; pshufd+paddw+pshufd.
-
+; pshufd+paddw+pshufd. This is fixed with the widening legalization.
+;
; CHECK-LABEL: test8
; CHECK-NOT: movsd
; CHECK: punpcklbw
-; CHECK-NEXT: paddw
+; CHECK-NEXT: paddb
; CHECK-NEXT: pshufb
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test8
+; CHECK-WIDE-NOT: movsd
+; CHECK-WIDE-NOT: punpcklbw
+; CHECK-WIDE: paddb
+; CHECK-WIDE-NEXT: ret
define double @test9(double %A, double %B) {
@@ -141,15 +174,15 @@ define double @test9(double %A, double %B) {
%3 = bitcast <8 x i8> %add to double
ret double %3
}
-; FIXME: Ideally we should be able to fold the entire body of @test9 into a
-; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddw+pshufd.
-
; CHECK-LABEL: test9
; CHECK-NOT: movsd
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: paddw
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklbw
+; CHECK: paddb
; CHECK-NEXT: ret
+;
+; CHECK-WIDE-LABEL: test9
+; CHECK-WIDE-NOT: movsd
+; CHECK-WIDE-NOT: punpcklbw
+; CHECK-WIDE: paddb
+; CHECK-WIDE-NEXT: ret
diff --git a/test/CodeGen/X86/macho-comdat.ll b/test/CodeGen/X86/macho-comdat.ll
new file mode 100644
index 0000000..3c2d997
--- /dev/null
+++ b/test/CodeGen/X86/macho-comdat.ll
@@ -0,0 +1,6 @@
+; RUN: not llc -mtriple x86_64-apple-darwin < %s 2> %t
+; RUN: FileCheck < %t %s
+
+$f = comdat any
+@v = global i32 0, comdat $f
+; CHECK: LLVM ERROR: MachO doesn't support COMDATs, 'f' cannot be lowered.
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index 7c0e82f..fa77fcb 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -1,6 +1,7 @@
; Check that the MCNullStreamer operates correctly, at least on a minimal test case.
;
; RUN: llc -filetype=null -o %t -march=x86 %s
+; RUN: llc -filetype=null -o %t -mtriple=i686-cygwin %s
define void @f0() {
ret void
@@ -9,3 +10,20 @@ define void @f0() {
define void @f1() {
ret void
}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !13}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""}
+!1 = metadata !{metadata !"", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2}
+!5 = metadata !{i32 786473, metadata !1}
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
+!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/pr20020.ll b/test/CodeGen/X86/pr20020.ll
new file mode 100644
index 0000000..83dae36
--- /dev/null
+++ b/test/CodeGen/X86/pr20020.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -disable-lsr -post-RA-scheduler=1 -break-anti-dependencies=critical | FileCheck %s
+
+; In PR20020, the critical anti-dependency breaker algorithm mistakenly
+; renames the register operands of an 'xorl %eax, %eax' to 'xorl %ecx, %ecx'
+; and then immediately reloads %rcx with a value based on the wrong %rax.
+
+; CHECK-NOT: xorl %ecx, %ecx
+; CHECK: leaq 1(%rax), %rcx
+
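+; A rough sketch of the miscompile (values illustrative):
+;   xorl %eax, %eax         ; %eax := 0
+;   ...
+;   leaq 1(%rax), %rcx      ; %rcx := %rax + 1
+; After the bad renaming the xor zeroes %ecx instead, so the leaq reads a
+; stale %rax and %rcx receives the wrong value.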
+
+%struct.planet = type { double, double, double }
+
+; Function Attrs: nounwind ssp uwtable
+define void @advance(i32 %nbodies, %struct.planet* nocapture %bodies) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %nbodies, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end38
+
+for.body.preheader: ; preds = %entry
+ %gep = getelementptr %struct.planet* %bodies, i64 1, i32 1
+ %gep13 = bitcast double* %gep to %struct.planet*
+ %0 = add i32 %nbodies, -1
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc20
+ %iv19 = phi i32 [ %0, %for.body.preheader ], [ %iv.next, %for.inc20 ]
+ %iv = phi %struct.planet* [ %gep13, %for.body.preheader ], [ %gep14, %for.inc20 ]
+ %iv9 = phi i64 [ %iv.next10, %for.inc20 ], [ 0, %for.body.preheader ]
+ %iv.next10 = add nuw nsw i64 %iv9, 1
+ %1 = trunc i64 %iv.next10 to i32
+ %cmp22 = icmp slt i32 %1, %nbodies
+ br i1 %cmp22, label %for.body3.lr.ph, label %for.inc20
+
+for.body3.lr.ph: ; preds = %for.body
+ %x = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 0
+ %y = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 1
+ %vx = getelementptr inbounds %struct.planet* %bodies, i64 %iv9, i32 2
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body3.lr.ph
+ %iv20 = phi i32 [ %iv.next21, %for.body3 ], [ %iv19, %for.body3.lr.ph ]
+ %iv15 = phi %struct.planet* [ %gep16, %for.body3 ], [ %iv, %for.body3.lr.ph ]
+ %iv1517 = bitcast %struct.planet* %iv15 to double*
+ %2 = load double* %x, align 8
+ %gep18 = getelementptr double* %iv1517, i64 -1
+ %3 = load double* %gep18, align 8
+ %sub = fsub double %2, %3
+ %4 = load double* %y, align 8
+ %5 = load double* %iv1517, align 8
+ %sub8 = fsub double %4, %5
+ %add10 = fadd double %sub, %sub8
+ %call = tail call double @sqrt(double %sub8) #2
+ store double %add10, double* %vx, align 8
+ %gep16 = getelementptr %struct.planet* %iv15, i64 1
+ %iv.next21 = add i32 %iv20, -1
+ %exitcond = icmp eq i32 %iv.next21, 0
+ br i1 %exitcond, label %for.inc20, label %for.body3
+
+for.inc20: ; preds = %for.body3, %for.body
+ %lftr.wideiv11 = trunc i64 %iv.next10 to i32
+ %gep14 = getelementptr %struct.planet* %iv, i64 1
+ %iv.next = add i32 %iv19, -1
+ %exitcond12 = icmp eq i32 %lftr.wideiv11, %nbodies
+ br i1 %exitcond12, label %for.end38, label %for.body
+
+for.end38: ; preds = %for.inc20, %entry
+ ret void
+}
+
+; Function Attrs: nounwind
+declare double @sqrt(double) #1
+
+attributes #0 = { "no-frame-pointer-elim-non-leaf" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/X86/pr20088.ll b/test/CodeGen/X86/pr20088.ll
new file mode 100644
index 0000000..3a82962
--- /dev/null
+++ b/test/CodeGen/X86/pr20088.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+
+define <16 x i8> @foo(<16 x i8> %x) {
+; CHECK: vpblendvb
+ %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> zeroinitializer, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8> %x)
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index d048db8..32a797b 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -5,29 +5,29 @@ define void @atomic_maxmin_i8() {
; CHECK: atomic_maxmin_i8
%1 = atomicrmw max i8* @sc8, i8 5 acquire
; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: cmovl
+; CHECK: movsbl
+; CHECK: cmpl
; CHECK: lock
; CHECK-NEXT: cmpxchgb
; CHECK: jne [[LABEL1]]
%2 = atomicrmw min i8* @sc8, i8 6 acquire
; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: cmovg
+; CHECK: movsbl
+; CHECK: cmpl
; CHECK: lock
; CHECK-NEXT: cmpxchgb
; CHECK: jne [[LABEL3]]
%3 = atomicrmw umax i8* @sc8, i8 7 acquire
; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: cmovb
+; CHECK: movzbl
+; CHECK: cmpl
; CHECK: lock
; CHECK-NEXT: cmpxchgb
; CHECK: jne [[LABEL5]]
%4 = atomicrmw umin i8* @sc8, i8 8 acquire
; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: cmova
+; CHECK: movzbl
+; CHECK: cmpl
; CHECK: lock
; CHECK-NEXT: cmpxchgb
; CHECK: jne [[LABEL7]]
diff --git a/test/CodeGen/X86/pshufd-combine-crash.ll b/test/CodeGen/X86/pshufd-combine-crash.ll
new file mode 100644
index 0000000..84c69e3
--- /dev/null
+++ b/test/CodeGen/X86/pshufd-combine-crash.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -debug
+
+; REQUIRES: asserts
+
+; Test that the dag combiner doesn't assert if we try to replace a sequence of two
+; v4f32 X86ISD::PSHUFD nodes with a single PSHUFD.
+
+
+define <4 x float> @test(<4 x float> %V) {
+ %1 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ ret <4 x float> %2
+}
+
diff --git a/test/CodeGen/X86/rdpmc.ll b/test/CodeGen/X86/rdpmc.ll
new file mode 100644
index 0000000..7f1ca46
--- /dev/null
+++ b/test/CodeGen/X86/rdpmc.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86-64
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+
+; Verify that we correctly lower the "Read Performance-Monitoring Counters"
+; x86 builtin.
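+;
+; RDPMC returns the counter in EDX:EAX. On x86-64 the i64 result must be
+; reassembled with a shift and an or, while on 32-bit x86 the EDX:EAX pair
+; already matches the i64 return convention, hence the shlq/or checks below.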
+
+
+define i64 @test_builtin_read_pmc(i32 %ID) {
+ %1 = tail call i64 @llvm.x86.rdpmc(i32 %ID)
+ ret i64 %1
+}
+; CHECK-LABEL: test_builtin_read_pmc
+; CHECK: rdpmc
+; X86-NOT: shlq
+; X86-NOT: or
+; X86-64: shlq
+; X86-64: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+declare i64 @llvm.x86.rdpmc(i32 %ID)
+
diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index ce4f538..ddad307 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll
@@ -1,10 +1,12 @@
-; RUN: llc < %s -march=x86-64 | grep shrdq
+; RUN: llc -march=x86-64 < %s | FileCheck %s
; PR4736
%0 = type { i32, i8, [35 x i8] }
@g_144 = external global %0, align 8 ; <%0*> [#uses=1]
+; CHECK: shrdq
+
define i32 @int87(i32 %uint64p_8) nounwind {
entry:
%srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1]
diff --git a/test/CodeGen/X86/sqrt.ll b/test/CodeGen/X86/sqrt.ll
new file mode 100644
index 0000000..be7c6e8
--- /dev/null
+++ b/test/CodeGen/X86/sqrt.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse2 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx2,+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx2,+avx -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=AVX
+
+define float @test_sqrt_f32(float %a) {
+; SSE2-LABEL: test_sqrt_f32
+; SSE2: sqrtss %xmm0, %xmm0
+; AVX-LABEL: test_sqrt_f32
+; AVX: vsqrtss %xmm0, %xmm0
+ %res = call float @llvm.sqrt.f32(float %a)
+ ret float %res
+}
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+define double @test_sqrt_f64(double %a) {
+; SSE2-LABEL: test_sqrt_f64
+; SSE2: sqrtsd %xmm0, %xmm0
+; AVX-LABEL: test_sqrt_f64
+; AVX: vsqrtsd %xmm0, %xmm0
+ %res = call double @llvm.sqrt.f64(double %a)
+ ret double %res
+}
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index cfc892d..c906ecd 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -717,3 +717,30 @@ define void @test_x86_sse2_pause() {
ret void
}
declare void @llvm.x86.sse2.pause() nounwind
+
+define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse2_pshuf_d:
+; CHECK: pshufd $27
+entry:
+ %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufl_w:
+; CHECK: pshuflw $27
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufh_w:
+; CHECK: pshufhw $27
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
new file mode 100644
index 0000000..b7706cc
--- /dev/null
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -0,0 +1,318 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Verify that we correctly generate 'addsub' instructions from
+; a sequence of vector extracts + float add/sub + vector inserts.
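+;
+; ADDSUBPS subtracts in the even lanes and adds in the odd lanes, so each
+; test must place its fsub results in lanes 0/2 and its fadd results in
+; lanes 1/3 for the fold to apply.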
+
+define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test1
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test2
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %4, %3
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test3
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test4
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test5
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test6
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
+ %1 = extractelement <4 x double> %A, i32 0
+ %2 = extractelement <4 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <4 x double> %A, i32 2
+ %4 = extractelement <4 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <4 x double> %A, i32 1
+ %6 = extractelement <4 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <4 x double> %A, i32 3
+ %8 = extractelement <4 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
+ ret <4 x double> %vecinsert4
+}
+; CHECK-LABEL: test7
+; SSE: addsubpd
+; SSE-NEXT: addsubpd
+; AVX: vaddsubpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
+ %1 = extractelement <2 x double> %A, i32 0
+ %2 = extractelement <2 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <2 x double> %A, i32 1
+ %4 = extractelement <2 x double> %B, i32 1
+ %add = fadd double %3, %4
+ %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
+ ret <2 x double> %vecinsert2
+}
+; CHECK-LABEL: test8
+; SSE: addsubpd
+; AVX: vaddsubpd
+; CHECK: ret
+
+
+define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
+ %1 = extractelement <8 x float> %A, i32 0
+ %2 = extractelement <8 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <8 x float> %A, i32 2
+ %4 = extractelement <8 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <8 x float> %A, i32 1
+ %6 = extractelement <8 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <8 x float> %A, i32 3
+ %8 = extractelement <8 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <8 x float> %A, i32 4
+ %10 = extractelement <8 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <8 x float> %A, i32 6
+ %12 = extractelement <8 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <8 x float> %A, i32 5
+ %14 = extractelement <8 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <8 x float> %A, i32 7
+ %16 = extractelement <8 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
+ ret <8 x float> %vecinsert8
+}
+; CHECK-LABEL: test9
+; SSE: addsubps
+; SSE-NEXT: addsubps
+; AVX: vaddsubps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+; Verify that we don't generate an addsub instruction for any of the
+; following functions.
+define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 1
+ %2 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 3
+ %2 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 1
+ %2 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, undef
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, undef
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
new file mode 100644
index 0000000..8b66743
--- /dev/null
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -0,0 +1,296 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK
+
+; Test ADDSUB ISel patterns.
+
+; Functions below are obtained from the following source:
+;
+; typedef double double2 __attribute__((ext_vector_type(2)));
+; typedef double double4 __attribute__((ext_vector_type(4)));
+; typedef float float4 __attribute__((ext_vector_type(4)));
+; typedef float float8 __attribute__((ext_vector_type(8)));
+;
+; float4 test1(float4 A, float4 B) {
+; float4 X = A - B;
+; float4 Y = A + B;
+; return (float4){X[0], Y[1], X[2], Y[3]};
+; }
+;
+; float8 test2(float8 A, float8 B) {
+; float8 X = A - B;
+; float8 Y = A + B;
+; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
+; }
+;
+; double4 test3(double4 A, double4 B) {
+; double4 X = A - B;
+; double4 Y = A + B;
+; return (double4){X[0], Y[1], X[2], Y[3]};
+; }
+;
+; double2 test4(double2 A, double2 B) {
+; double2 X = A - B;
+; double2 Y = A + B;
+; return (double2){X[0], Y[1]};
+; }
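+;
+; In each case the shuffle mask (e.g. <0, 5, 2, 7> for the float4 version)
+; picks the fsub result in the even lanes and the fadd result in the odd
+; lanes, which is exactly the ADDSUB lane pattern.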
+
+define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
+ %sub = fsub <4 x float> %A, %B
+ %add = fadd <4 x float> %A, %B
+ %vecinit6 = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %vecinit6
+}
+; CHECK-LABEL: test1
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <8 x float> @test2(<8 x float> %A, <8 x float> %B) {
+ %sub = fsub <8 x float> %A, %B
+ %add = fadd <8 x float> %A, %B
+ %vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %vecinit14
+}
+; CHECK-LABEL: test2
+; SSE: addsubps
+; SSE-NEXT: addsubps
+; AVX: vaddsubps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {
+ %sub = fsub <4 x double> %A, %B
+ %add = fadd <4 x double> %A, %B
+ %vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %vecinit6
+}
+; CHECK-LABEL: test3
+; SSE: addsubpd
+; SSE: addsubpd
+; AVX: vaddsubpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test4(<2 x double> %A, <2 x double> %B) {
+ %add = fadd <2 x double> %A, %B
+ %sub = fsub <2 x double> %A, %B
+ %vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: test4
+; SSE: addsubpd
+; AVX: vaddsubpd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
+ %1 = load <4 x float>* %B
+ %add = fadd <4 x float> %A, %1
+ %sub = fsub <4 x float> %A, %1
+ %vecinit6 = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %vecinit6
+}
+; CHECK-LABEL: test1b
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) {
+ %1 = load <8 x float>* %B
+ %add = fadd <8 x float> %A, %1
+ %sub = fsub <8 x float> %A, %1
+ %vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %vecinit14
+}
+; CHECK-LABEL: test2b
+; SSE: addsubps
+; SSE-NEXT: addsubps
+; AVX: vaddsubps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) {
+ %1 = load <4 x double>* %B
+ %add = fadd <4 x double> %A, %1
+ %sub = fsub <4 x double> %A, %1
+ %vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %vecinit6
+}
+; CHECK-LABEL: test3b
+; SSE: addsubpd
+; SSE: addsubpd
+; AVX: vaddsubpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
+ %1 = load <2 x double>* %B
+ %sub = fsub <2 x double> %A, %1
+ %add = fadd <2 x double> %A, %1
+ %vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: test4b
+; SSE: addsubpd
+; AVX: vaddsubpd
+; CHECK-NEXT: ret
+
+; Functions below are obtained from the following source:
+;
+; float4 test1(float4 A, float4 B) {
+; float4 X = A + B;
+; float4 Y = A - B;
+; return (float4){X[0], Y[1], X[2], Y[3]};
+; }
+;
+; float8 test2(float8 A, float8 B) {
+; float8 X = A + B;
+; float8 Y = A - B;
+; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
+; }
+;
+; double4 test3(double4 A, double4 B) {
+; double4 X = A + B;
+; double4 Y = A - B;
+; return (double4){X[0], Y[1], X[2], Y[3]};
+; }
+;
+; double2 test4(double2 A, double2 B) {
+; double2 X = A + B;
+; double2 Y = A - B;
+; return (double2){X[0], Y[1]};
+; }
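+;
+; Here the add lands in the even lanes and the sub in the odd lanes, the
+; opposite of what ADDSUB computes. ISel handles this by first negating B
+; (the xorps/xorpd flips its sign bits), since addsub(A, -B) yields A+B in
+; the even lanes and A-B in the odd lanes.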
+
+define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
+ %sub = fsub <4 x float> %A, %B
+ %add = fadd <4 x float> %A, %B
+ %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %vecinit6
+}
+; CHECK-LABEL: test5
+; SSE: xorps
+; SSE-NEXT: addsubps
+; AVX: vxorps
+; AVX-NEXT: vaddsubps
+; CHECK: ret
+
+
+define <8 x float> @test6(<8 x float> %A, <8 x float> %B) {
+ %sub = fsub <8 x float> %A, %B
+ %add = fadd <8 x float> %A, %B
+ %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %vecinit14
+}
+; CHECK-LABEL: test6
+; SSE: xorps
+; SSE-NEXT: addsubps
+; SSE: xorps
+; SSE-NEXT: addsubps
+; AVX: vxorps
+; AVX-NEXT: vaddsubps
+; AVX-NOT: vxorps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
+ %sub = fsub <4 x double> %A, %B
+ %add = fadd <4 x double> %A, %B
+ %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %vecinit6
+}
+; CHECK-LABEL: test7
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; AVX: vxorpd
+; AVX-NEXT: vaddsubpd
+; AVX-NOT: vxorpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
+ %add = fadd <2 x double> %A, %B
+ %sub = fsub <2 x double> %A, %B
+ %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: test8
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; AVX: vxorpd
+; AVX-NEXT: vaddsubpd
+; CHECK: ret
+
+
+define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) {
+ %sub = fsub <4 x float> %A, %B
+ %add = fadd <4 x float> %B, %A
+ %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %vecinit6
+}
+; CHECK-LABEL: test5b
+; SSE: xorps
+; SSE-NEXT: addsubps
+; AVX: vxorps
+; AVX-NEXT: vaddsubps
+; CHECK: ret
+
+
+define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) {
+ %sub = fsub <8 x float> %A, %B
+ %add = fadd <8 x float> %B, %A
+ %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %vecinit14
+}
+; CHECK-LABEL: test6b
+; SSE: xorps
+; SSE-NEXT: addsubps
+; SSE: xorps
+; SSE-NEXT: addsubps
+; AVX: vxorps
+; AVX-NEXT: vaddsubps
+; AVX-NOT: vxorps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) {
+ %sub = fsub <4 x double> %A, %B
+ %add = fadd <4 x double> %B, %A
+ %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %vecinit6
+}
+; CHECK-LABEL: test7b
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; AVX: vxorpd
+; AVX-NEXT: vaddsubpd
+; AVX-NOT: vxorpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) {
+ %add = fadd <2 x double> %B, %A
+ %sub = fsub <2 x double> %A, %B
+ %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %vecinit2
+}
+; CHECK-LABEL: test8b
+; SSE: xorpd
+; SSE-NEXT: addsubpd
+; AVX: vxorpd
+; AVX-NEXT: vaddsubpd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 8ad7987..3a48121 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -117,6 +117,24 @@ define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
%1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
ret <16 x i8> %1
}
+
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
+
+;; Two tests for shufflevectors that are optimized into a blend with an
+;; immediate mask.
+; CHECK-LABEL: @blend_shufflevector_4xfloat
+; CHECK: blendps $6, %xmm1, %xmm0
+; CHECK: ret
+define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
+ %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x float> %1
+}
+
+; CHECK-LABEL: @blend_shufflevector_8xi16
+; CHECK: pblendw $134, %xmm1, %xmm0
+; CHECK: ret
+define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) {
+ %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
+ ret <8 x i16> %1
+}
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index a3c6201..6726a3e 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -692,3 +692,25 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
%13 = fadd <4 x float> %11, %12
ret <4 x float> %13
}
+
+define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_with_undefs:
+; CHECK-NOT: shufps
+; CHECK: insertps $32, %xmm0
+; CHECK: ret
+ %1 = load float* %b, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
+ ret <4 x float> %result
+}
+
+; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
+; the destination index to change the load, instead of the source index.
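+; (In the insertps immediate, bits 7:6 select the source element and bits
+; 5:4 the destination element, so $48 = 0x30 inserts source lane 0 into
+; destination lane 3.)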
+define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
+; CHECK-LABEL: pr20087:
+; CHECK: insertps $48
+; CHECK: ret
+ %load = load <4 x float> *%ptr
+ %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
+ ret <4 x float> %ret
+}
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
new file mode 100644
index 0000000..0b7e6db
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -0,0 +1,165 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 4
+; Num LargeConstants
+; CHECK-NEXT: .long 3
+; Num Callsites
+; CHECK-NEXT: .long 7
+
+; Functions and stack size
+; CHECK-NEXT: .quad _constantargs
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _liveConstant
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _directFrameIdx
+; CHECK-NEXT: .quad 40
+; CHECK-NEXT: .quad _longid
+; CHECK-NEXT: .quad 8
+
+; Large Constants
+; CHECK-NEXT: .quad 2147483648
+; CHECK-NEXT: .quad 4294967295
+; CHECK-NEXT: .quad 4294967296
+
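+; Constants that fit in a signed 32-bit field are emitted inline as
+; SmallConstant locations (type 4); wider values live in the Large
+; Constants table above and are referenced there by index (type 5).
+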
+; Callsites
+; Constant arguments
+;
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .long L{{.*}}-_constantargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 12
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65536
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2000000000
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2147483647
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+; LargeConstant at index 0
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+; LargeConstant at index 1
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+; LargeConstant at index 2
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+
+define void @constantargs() {
+entry:
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 15, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
+ ret void
+}
+
+; Map a constant value.
+;
+; CHECK-LABEL: .long L{{.*}}-_liveConstant
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 33
+
+define void @liveConstant() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+ ret void
+}
+
+; Directly map an alloca's address.
+;
+; Callsite 16
+; CHECK-LABEL: .long L{{.*}}-_directFrameIdx
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
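+; A Direct location (type 2) encodes register + offset; DWARF register
+; number 6 is %rbp on x86-64, so the entry above describes the alloca as
+; an offset from the frame pointer.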
+
+define void @directFrameIdx() {
+entry:
+ %metadata1 = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata1
+ store i64 12, i64* %metadata1
+ store i64 13, i64* %metadata1
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+ ret void
+}
+
+; Test a 64-bit ID.
+;
+; CHECK: .quad 4294967295
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 4294967296
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 9223372036854775807
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad -1
+; CHECK-LABEL: .long L{{.*}}-_longid
+define void @longid() {
+entry:
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967295, i32 0)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967296, i32 0)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 9223372036854775807, i32 0)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 -1, i32 0)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 9ce5254..897595d 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-stackmap-liveness| FileCheck -check-prefix=STACK %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness| FileCheck -check-prefix=PATCH %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck -check-prefix=PATCH %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
@@ -37,36 +36,21 @@ entry:
; Align
; CHECK-NEXT: .align 3
-; StackMap 1 (stackmap liveness information enabled)
-; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
-; STACK-NEXT: .short 0
-; STACK-NEXT: .short 0
-; Padding
-; STACK-NEXT: .short 0
-; Num LiveOut Entries: 2
-; STACK-NEXT: .short 2
-; LiveOut Entry 1: %RSP (8 bytes)
-; STACK-NEXT: .short 7
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 8
-; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
-; STACK-NEXT: .short 19
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 16
-; Align
-; STACK-NEXT: .align 3
-
; StackMap 1 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
; PATCH-NEXT: .short 0
; PATCH-NEXT: .short 0
; Padding
; PATCH-NEXT: .short 0
-; Num LiveOut Entries: 0
-; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 1
+; PATCH-NEXT: .short 1
+; LiveOut Entry 1: %YMM2 (16 bytes) --> %XMM2
+; PATCH-NEXT: .short 19
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 16
; Align
; PATCH-NEXT: .align 3
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 5)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
%a2 = call i64 asm sideeffect "", "={r8}"() nounwind
%a3 = call i8 asm sideeffect "", "={ah}"() nounwind
%a4 = call <4 x double> asm sideeffect "", "={ymm0}"() nounwind
@@ -83,52 +67,37 @@ entry:
; Align
; CHECK-NEXT: .align 3
-; StackMap 2 (stackmap liveness information enabled)
-; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
-; STACK-NEXT: .short 0
-; STACK-NEXT: .short 0
-; Padding
-; STACK-NEXT: .short 0
-; Num LiveOut Entries: 6
-; STACK-NEXT: .short 6
-; LiveOut Entry 1: %RAX (1 bytes) --> %AL or %AH
-; STACK-NEXT: .short 0
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 1
-; LiveOut Entry 2: %RSP (8 bytes)
-; STACK-NEXT: .short 7
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 8
-; LiveOut Entry 3: %R8 (8 bytes)
-; STACK-NEXT: .short 8
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 8
-; LiveOut Entry 4: %YMM0 (32 bytes)
-; STACK-NEXT: .short 17
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 32
-; LiveOut Entry 5: %YMM1 (32 bytes)
-; STACK-NEXT: .short 18
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 32
-; LiveOut Entry 6: %YMM2 (16 bytes) --> %XMM2
-; STACK-NEXT: .short 19
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 16
-; Align
-; STACK-NEXT: .align 3
-
; StackMap 2 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
; PATCH-NEXT: .short 0
; PATCH-NEXT: .short 0
; Padding
; PATCH-NEXT: .short 0
-; Num LiveOut Entries: 0
+; Num LiveOut Entries: 5
+; PATCH-NEXT: .short 5
+; LiveOut Entry 1: %RAX (1 bytes) --> %AL or %AH
; PATCH-NEXT: .short 0
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 1
+; LiveOut Entry 2: %R8 (8 bytes)
+; PATCH-NEXT: .short 8
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 8
+; LiveOut Entry 3: %YMM0 (32 bytes)
+; PATCH-NEXT: .short 17
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 32
+; LiveOut Entry 4: %YMM1 (32 bytes)
+; PATCH-NEXT: .short 18
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 32
+; LiveOut Entry 5: %YMM2 (16 bytes) --> %XMM2
+; PATCH-NEXT: .short 19
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 16
; Align
; PATCH-NEXT: .align 3
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 2, i32 5)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
; StackMap 3 (no liveness information available)
@@ -142,36 +111,25 @@ entry:
; Align
; CHECK-NEXT: .align 3
-; StackMap 3 (stackmap liveness information enabled)
-; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
-; STACK-NEXT: .short 0
-; STACK-NEXT: .short 0
-; Padding
-; STACK-NEXT: .short 0
-; Num LiveOut Entries: 2
-; STACK-NEXT: .short 2
-; LiveOut Entry 1: %RSP (8 bytes)
-; STACK-NEXT: .short 7
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 8
-; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
-; STACK-NEXT: .short 19
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 16
-; Align
-; STACK-NEXT: .align 3
-
; StackMap 3 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
; PATCH-NEXT: .short 0
; PATCH-NEXT: .short 0
; Padding
; PATCH-NEXT: .short 0
-; Num LiveOut Entries: 0
-; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 2
+; PATCH-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; PATCH-NEXT: .short 7
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; PATCH-NEXT: .short 19
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 16
; Align
; PATCH-NEXT: .align 3
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 5)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
ret void
}
@@ -179,33 +137,6 @@ entry:
define void @mixed_liveness() {
entry:
%a1 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
-; StackMap 4 (stackmap liveness information enabled)
-; STACK-LABEL: .long L{{.*}}-_mixed_liveness
-; STACK-NEXT: .short 0
-; STACK-NEXT: .short 0
-; Padding
-; STACK-NEXT: .short 0
-; Num LiveOut Entries: 1
-; STACK-NEXT: .short 1
-; LiveOut Entry 1: %YMM2 (16 bytes) --> %XMM2
-; STACK-NEXT: .short 19
-; STACK-NEXT: .byte 0
-; STACK-NEXT: .byte 16
-; Align
-; STACK-NEXT: .align 3
-
-
-; StackMap 5 (stackmap liveness information enabled)
-; STACK-LABEL: .long L{{.*}}-_mixed_liveness
-; STACK-NEXT: .short 0
-; STACK-NEXT: .short 0
-; Padding
-; STACK-NEXT: .short 0
-; Num LiveOut Entries: 0
-; STACK-NEXT: .short 0
-; Align
-; STACK-NEXT: .align 3
-
; StackMap 4 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
; PATCH-NEXT: .short 0
diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll
new file mode 100644
index 0000000..4b1f903
--- /dev/null
+++ b/test/CodeGen/X86/swizzle-2.ll
@@ -0,0 +1,515 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Test that we correctly fold a shuffle that performs a swizzle of another
+; shuffle node according to the rule
+;  shuffle (shuffle (x, undef, M0), undef, M1) -> shuffle (x, undef, M2)
+;
+; We only do this if the resulting mask is legal, to avoid introducing an
+; illegal shuffle that would be expanded into a sub-optimal sequence of
+; instructions during the lowering stage.
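+;
+; The masks compose as M2[i] = M0[M1[i]]. In @swizzle_1 below, for example,
+; M0 = M1 = <3, 2, 0, 1>, giving M2 = <1, 0, 3, 2>, which matches the single
+; 'pshufd $-79' (0xb1) that we check for.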
+
+
+define <4 x i32> @swizzle_1(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_1
+; Mask: [1,0,3,2]
+; CHECK: pshufd $-79
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_2(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_2
+; Mask: [2,1,3,0]
+; CHECK: pshufd $54
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_3(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_3
+; Mask: [1,0,3,2]
+; CHECK: pshufd $-79
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_4(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_4
+; Mask: [3,1,0,2]
+; CHECK: pshufd $-121
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_5(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_5
+; Mask: [2,3,0,1]
+; CHECK: pshufd $78
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_6(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_6
+; Mask: [2,0,1,3]
+; CHECK: pshufd $-46
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_7(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_7
+; Mask: [0,2,3,1]
+; CHECK: pshufd $120
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_8(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_8
+; Mask: [1,3,2,0]
+; CHECK: pshufd $45
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_9(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_9
+; Mask: [2,3,0,1]
+; CHECK: pshufd $78
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_10(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_10
+; Mask: [1,2,0,3]
+; CHECK: pshufd $-55
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_11(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_11
+; Mask: [3,2,1,0]
+; CHECK: pshufd $27
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_12(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_12
+; Mask: [0,3,1,2]
+; CHECK: pshufd $-100
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_13(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_13
+; Mask: [3,2,1,0]
+; CHECK: pshufd $27
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @swizzle_14(<4 x i32> %v) {
+ %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ ret <4 x i32> %2
+}
+; CHECK-LABEL: swizzle_14
+; Mask: [3,0,2,1]
+; CHECK: pshufd $99
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_15(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_15
+; Mask: [1,0,3,2]
+; CHECK: pshufd $-79
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_16(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_16
+; Mask: [2,1,3,0]
+; CHECK: pshufd $54
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_17(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_17
+; Mask: [1,0,3,2]
+; CHECK: pshufd $-79
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_18(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_18
+; Mask: [3,1,0,2]
+; CHECK: pshufd $-121
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_19(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_19
+; Mask: [2,3,0,1]
+; CHECK: pshufd $78
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_20(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_20
+; Mask: [2,0,1,3]
+; CHECK: pshufd $-46
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_21(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_21
+; Mask: [0,2,3,1]
+; CHECK: pshufd $120
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_22(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_22
+; Mask: [1,3,2,0]
+; CHECK: pshufd $45
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_23(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_23
+; Mask: [2,3,0,1]
+; CHECK: pshufd $78
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_24(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_24
+; Mask: [1,2,0,3]
+; CHECK: pshufd $-55
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_25(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_25
+; Mask: [3,2,1,0]
+; CHECK: pshufd $27
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_26(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_26
+; Mask: [0,3,1,2]
+; CHECK: pshufd $-100
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_27(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_27
+; Mask: [3,2,1,0]
+; CHECK: pshufd $27
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_28(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_28
+; Mask: [3,0,2,1]
+; CHECK: pshufd $99
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x float> @swizzle_29(<4 x float> %v) {
+ %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ ret <4 x float> %2
+}
+; CHECK-LABEL: swizzle_29
+; Mask: [1,3,2,0]
+; CHECK: pshufd $45
+; CHECK-NOT: pshufd
+; CHECK-NEXT: ret
+
+; Make sure that we combine the shuffles from each function below into a single
+; legal shuffle (either pshuflw or pshufb depending on the masks).
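+; The combined mask is M2[i] = M0[M1[i]]. For swizzle_30 below, composing
+; M0 = [3,1,2,0,7,5,6,4] with M1 = [1,0,2,3,7,5,6,4] gives [1,3,2,0,4,5,6,7],
+; which leaves the high words in place and is therefore a legal pshuflw.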
+
+define <8 x i16> @swizzle_30(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_30
+; Mask: [1,3,2,0,4,5,6,7]
+; CHECK: pshuflw $45
+; CHECK-NOT: pshufb
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @swizzle_31(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_31
+; Mask: [1,3,2,0,4,5,6,7]
+; CHECK: pshuflw $45
+; CHECK-NOT: pshufb
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_32(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_32
+; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3]
+; CHECK: pshufd $-31
+; CHECK-NOT: pshufb
+; CHECK: ret
+
+define <8 x i16> @swizzle_33(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_33
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_34(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_34
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_35(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_35
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_36(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_36
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_37(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_37
+; Mask: [0,1,2,3,4,7,6,5]
+; CHECK: pshufhw $108
+; CHECK-NOT: pshufb
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_38(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_38
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_39(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_39
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_40(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_40
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_41(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_41
+; CHECK: pshufb
+; CHECK-NOT: pshufb
+; CHECK-NOT: shufpd
+; CHECK: ret
+
+
+define <8 x i16> @swizzle_42(<8 x i16> %v) {
+ %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
+ ret <8 x i16> %2
+}
+; CHECK-LABEL: swizzle_42
+; Mask: [0,1,2,3,5,4,7,6]
+; CHECK: pshufhw $-79
+; CHECK-NOT: pshufb
+; CHECK: ret
+
+
diff --git a/test/CodeGen/X86/swizzle-avx2.ll b/test/CodeGen/X86/swizzle-avx2.ll
new file mode 100644
index 0000000..29dfa6c
--- /dev/null
+++ b/test/CodeGen/X86/swizzle-avx2.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s
+
+; Test that we correctly fold a shuffle that performs a swizzle of another
+; shuffle node according to the rule
+;   shuffle(shuffle(x, undef, M0), undef, M1) -> shuffle(x, undef, M2)
+;
+; We only do this if the resulting mask is legal, to avoid introducing an
+; illegal shuffle that would be expanded into a sub-optimal sequence of
+; instructions during the lowering stage.
+
+; Check that we produce a single vector permute / shuffle in all cases.
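+; Unlike the 128-bit cases, vpermd takes its shuffle indices from a register
+; rather than an immediate, so any cross-lane <8 x i32> permute fits in one
+; instruction; purely in-lane masks can instead use the cheaper vpshufd form.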
+
+define <8 x i32> @swizzle_1(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_1
+; CHECK: vpermd
+; CHECK-NOT: vpermd
+; CHECK: ret
+
+
+define <8 x i32> @swizzle_2(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_2
+; CHECK: vpshufd $78
+; CHECK-NOT: vpermd
+; CHECK-NOT: vpshufd
+; CHECK: ret
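+;
+; vpshufd on a ymm register applies the same dword shuffle within each
+; 128-bit lane; the combined mask here, [2,3,0,1,6,7,4,5], is [2,3,0,1] in
+; both lanes and so encodes as immediate 78.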
+
+
+define <8 x i32> @swizzle_3(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_3
+; CHECK: vpshufd $78
+; CHECK-NOT: vpermd
+; CHECK-NOT: vpshufd
+; CHECK: ret
+
+
+define <8 x i32> @swizzle_4(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_4
+; CHECK: vpermd
+; CHECK-NOT: vpermd
+; CHECK: ret
+
+
+define <8 x i32> @swizzle_5(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_5
+; CHECK: vpermd
+; CHECK-NOT: vpermd
+; CHECK: ret
+
+
+define <8 x i32> @swizzle_6(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_6
+; CHECK: vpermd
+; CHECK-NOT: vpermd
+; CHECK: ret
+
+
+define <8 x i32> @swizzle_7(<8 x i32> %v) {
+ %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i32> %2
+}
+; CHECK-LABEL: swizzle_7
+; CHECK: vpermd
+; CHECK-NOT: vpermd
+; CHECK: ret
+
+
diff --git a/test/CodeGen/X86/testb-je-fusion.ll b/test/CodeGen/X86/testb-je-fusion.ll
new file mode 100644
index 0000000..9e946ae
--- /dev/null
+++ b/test/CodeGen/X86/testb-je-fusion.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+
+; testb should be scheduled right before je to enable macro-fusion.
+
+; CHECK: testb $2, %{{[abcd]}}h
+; CHECK-NEXT: je
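+;
+; On Intel cores the decoder only fuses a test with an immediately following
+; conditional jump, hence the CHECK-NEXT adjacency requirement above.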
+
+define i32 @check_flag(i32 %flags, ...) nounwind {
+entry:
+ %and = and i32 %flags, 512
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ br label %if.end
+
+if.end:
+ %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ]
+ ret i32 %hasflag
+}
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 5f6e7a8..1a6c05d 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -1,8 +1,20 @@
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
;CHECK-LABEL: foo1_8:
;CHECK: vcvtdq2ps
;CHECK: ret
+;
+;CHECK-WIDE-LABEL: foo1_8:
+;CHECK-WIDE: vpmovzxbd %xmm0, %xmm1
+;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
+;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
+;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
+;CHECK-WIDE-NEXT: ret
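+;
+; Note: vpmovzxbd zero-extends each byte into a dword; the vpslld $24 /
+; vpsrad $24 pair then sign-extends it, producing the signed input that
+; sitofp requires.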
define <8 x float> @foo1_8(<8 x i8> %src) {
%res = sitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
@@ -11,6 +23,13 @@ define <8 x float> @foo1_8(<8 x i8> %src) {
;CHECK-LABEL: foo1_4:
;CHECK: vcvtdq2ps
;CHECK: ret
+;
+;CHECK-WIDE-LABEL: foo1_4:
+;CHECK-WIDE: vpmovzxbd %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
+;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
+;CHECK-WIDE-NEXT: ret
define <4 x float> @foo1_4(<4 x i8> %src) {
%res = sitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
@@ -19,6 +38,10 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
;CHECK-LABEL: foo2_8:
;CHECK: vcvtdq2ps
;CHECK: ret
+;
+;CHECK-WIDE-LABEL: foo2_8:
+;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}}
+;CHECK-WIDE: ret
define <8 x float> @foo2_8(<8 x i8> %src) {
%res = uitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
@@ -27,6 +50,10 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
;CHECK-LABEL: foo2_4:
;CHECK: vcvtdq2ps
;CHECK: ret
+;
+;CHECK-WIDE-LABEL: foo2_4:
+;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}}
+;CHECK-WIDE: ret
define <4 x float> @foo2_4(<4 x i8> %src) {
%res = uitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index a02e383..28f2a90 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
@@ -37,6 +38,23 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
%1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
%2 = load <4 x float>* %1, align 16
+ %3 = trunc i64 %j to i32
+ %4 = extractelement <4 x float> %2, i32 %3
+ %5 = insertelement <4 x float> undef, float %4, i32 0
+ %6 = insertelement <4 x float> %5, float %4, i32 1
+ %7 = insertelement <4 x float> %6, float %4, i32 2
+ %8 = insertelement <4 x float> %7, float %4, i32 3
+ ret <4 x float> %8
+
+; AVX-LABEL: load_extract_splat
+; AVX-NOT: rsp
+; AVX: vbroadcastss
+}
+
+; Fold the extract of a load into the load's address computation; this
+; avoids spilling to the stack.
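+; (The variable extract index can be folded into the load address, so the
+; splatted element is loaded straight from memory instead of going through
+; a stack temporary.)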
+define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
+ %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
+ %2 = load <4 x float>* %1, align 16
%3 = extractelement <4 x float> %2, i64 %j
%4 = insertelement <4 x float> undef, float %3, i32 0
%5 = insertelement <4 x float> %4, float %3, i32 1
@@ -44,7 +62,7 @@ define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64
%7 = insertelement <4 x float> %6, float %3, i32 3
ret <4 x float> %7
-; AVX-LABEL: load_extract_splat
+; AVX-LABEL: load_extract_splat1
; AVX-NOT: movs
; AVX: vbroadcastss
}
diff --git a/test/CodeGen/X86/vec_split.ll b/test/CodeGen/X86/vec_split.ll
index f9e7c20..bc2c663 100644
--- a/test/CodeGen/X86/vec_split.ll
+++ b/test/CodeGen/X86/vec_split.ll
@@ -40,3 +40,36 @@ define <32 x i16> @split32(<32 x i16> %a, <32 x i16> %b, <32 x i8> %__mask) {
%2 = select <32 x i1> %1, <32 x i16> %a, <32 x i16> %b
ret <32 x i16> %2
}
+
+; PR19492
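+; Each i128 addition legalizes to an addq/adcq carry chain; the two lane
+; additions plus the one surviving reduction addition account for the three
+; pairs checked per configuration below.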
+define i128 @split128(<2 x i128> %a, <2 x i128> %b) {
+; SSE4-LABEL: split128:
+; SSE4: addq
+; SSE4: adcq
+; SSE4: addq
+; SSE4: adcq
+; SSE4: addq
+; SSE4: adcq
+; SSE4: ret
+; AVX1-LABEL: split128:
+; AVX1: addq
+; AVX1: adcq
+; AVX1: addq
+; AVX1: adcq
+; AVX1: addq
+; AVX1: adcq
+; AVX1: ret
+; AVX2-LABEL: split128:
+; AVX2: addq
+; AVX2: adcq
+; AVX2: addq
+; AVX2: adcq
+; AVX2: addq
+; AVX2: adcq
+; AVX2: ret
+ %add = add nsw <2 x i128> %a, %b
+ %rdx.shuf = shufflevector <2 x i128> %add, <2 x i128> undef, <2 x i32> <i32 undef, i32 0>
+ %bin.rdx = add <2 x i128> %add, %rdx.shuf
+ %e = extractelement <2 x i128> %bin.rdx, i32 1
+ ret i128 %e
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index 9c68f44..3f7ee3a 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7-avx | FileCheck %s
; RUN: opt -instsimplify -disable-output < %s
;CHECK-LABEL: AGEP0:
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 4c30184..b6d43e9 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -8,7 +8,7 @@ define <4 x i32> @test1(<4 x i32> %a) {
; SSE41-LABEL: test1:
; SSE41: pmuludq
-; SSE41: pshufd $57
+; SSE41: pshufd $49
; SSE41: pmuludq
; SSE41: shufps $-35
; SSE41: psubd
@@ -18,7 +18,7 @@ define <4 x i32> @test1(<4 x i32> %a) {
; AVX-LABEL: test1:
; AVX: vpmuludq
-; AVX: vpshufd $57
+; AVX: vpshufd $49
; AVX: vpmuludq
; AVX: vshufps $-35
; AVX: vpsubd
@@ -32,11 +32,11 @@ define <8 x i32> @test2(<8 x i32> %a) {
ret <8 x i32> %div
; AVX-LABEL: test2:
-; AVX: vpermd
+; AVX: vpbroadcastd
+; AVX: vpalignr $4
; AVX: vpmuludq
-; AVX: vshufps $-35
; AVX: vpmuludq
-; AVX: vshufps $-35
+; AVX: vpblendd $170
; AVX: vpsubd
; AVX: vpsrld $1
; AVX: vpadd
@@ -107,6 +107,12 @@ define <16 x i16> @test6(<16 x i16> %a) {
define <16 x i8> @test7(<16 x i8> %a) {
%div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <16 x i8> %div
+
+; FIXME: scalarized
+; SSE41-LABEL: test7:
+; SSE41: pext
+; AVX-LABEL: test7:
+; AVX: pext
}
define <4 x i32> @test8(<4 x i32> %a) {
@@ -115,8 +121,8 @@ define <4 x i32> @test8(<4 x i32> %a) {
; SSE41-LABEL: test8:
; SSE41: pmuldq
-; SSE41: pshufd $57
-; SSE41-NOT: pshufd $57
+; SSE41: pshufd $49
+; SSE41-NOT: pshufd $49
; SSE41: pmuldq
; SSE41: shufps $-35
; SSE41: pshufd $-40
@@ -130,8 +136,8 @@ define <4 x i32> @test8(<4 x i32> %a) {
; SSE: pand
; SSE: paddd
; SSE: pmuludq
-; SSE: pshufd $57
-; SSE-NOT: pshufd $57
+; SSE: pshufd $49
+; SSE-NOT: pshufd $49
; SSE: pmuludq
; SSE: shufps $-35
; SSE: pshufd $-40
@@ -143,8 +149,8 @@ define <4 x i32> @test8(<4 x i32> %a) {
; AVX-LABEL: test8:
; AVX: vpmuldq
-; AVX: vpshufd $57
-; AVX-NOT: vpshufd $57
+; AVX: vpshufd $49
+; AVX-NOT: vpshufd $49
; AVX: vpmuldq
; AVX: vshufps $-35
; AVX: vpshufd $-40
@@ -159,12 +165,11 @@ define <8 x i32> @test9(<8 x i32> %a) {
ret <8 x i32> %div
; AVX-LABEL: test9:
+; AVX: vpalignr $4
; AVX: vpbroadcastd
; AVX: vpmuldq
-; AVX: vshufps $-35
; AVX: vpmuldq
-; AVX: vshufps $-35
-; AVX: vpshufd $-40
+; AVX: vpblendd $170
; AVX: vpadd
; AVX: vpsrld $31
; AVX: vpsrad $2
@@ -177,10 +182,10 @@ define <8 x i32> @test10(<8 x i32> %a) {
; AVX-LABEL: test10:
; AVX: vpbroadcastd
+; AVX: vpalignr $4
; AVX: vpmuludq
-; AVX: vshufps $-35
; AVX: vpmuludq
-; AVX: vshufps $-35
+; AVX: vpblendd $170
; AVX: vpsubd
; AVX: vpsrld $1
; AVX: vpadd
@@ -193,12 +198,11 @@ define <8 x i32> @test11(<8 x i32> %a) {
ret <8 x i32> %rem
; AVX-LABEL: test11:
+; AVX: vpalignr $4
; AVX: vpbroadcastd
; AVX: vpmuldq
-; AVX: vshufps $-35
; AVX: vpmuldq
-; AVX: vshufps $-35
-; AVX: vpshufd $-40
+; AVX: vpblendd $170
; AVX: vpadd
; AVX: vpsrld $31
; AVX: vpsrad $2
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
new file mode 100644
index 0000000..4da7e42
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -0,0 +1,196 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
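+; Note: the x86-64 baseline has only SSE2 (no SSSE3 pshufb), so arbitrary
+; <16 x i8> shuffles must be assembled from byte unpacks plus word-level
+; pshufd/pshuflw/pshufhw, as the sequences below show.
+;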
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
+; CHECK-SSE2: punpcklbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
+; CHECK-SSE2: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
+; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
+; CHECK-SSE2: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
+; CHECK-SSE2: pxor %xmm2, %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
+; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
+; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
+; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
+; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
+; CHECK-SSE2: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
+; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
+; CHECK-SSE2: pxor %xmm1, %xmm1
+; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
+; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pand
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
new file mode 100644
index 0000000..78b4ee7
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -0,0 +1,219 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
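+; A <2 x i64> lane k occupies dwords [2k, 2k+1], so e.g. the splat mask
+; [0,0] appears below as the pshufd dword mask [0,1,0,1].
+;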
+define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_00
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_10
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_11
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_22
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_32
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_33
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2f64_00
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2f64_10
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2f64_11
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
+; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
+; of a mov?
+;
+; CHECK-SSE2-LABEL: @shuffle_v2f64_22
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2f64_32
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2f64_33
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
+ ret <2 x double> %shuffle
+}
+
+
+define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_02
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_03
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_12
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_13
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_20
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0]
+; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_21
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_30
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
+; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_31
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
+ ret <2 x i64> %shuffle
+}
+define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy
+; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
+ ret <2 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
new file mode 100644
index 0000000..7d496fa
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -0,0 +1,170 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0001
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0020
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,2,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0300
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,3,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_1000
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[1,0,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_2200
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,2,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_3330
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,3,3,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_3210
+; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_0001
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,0,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_0020
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_0300
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,3,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_1000
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[1,0,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_2200
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[2,2,0,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_3330
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,3,3,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4f32_3210
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,2,1,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %shuffle
+}
+
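+; Two-source cases: shufps selects its low two result dwords from the first
+; operand and its high two from the second, so masks that interleave the
+; sources need a pair of shufps, as in the checks below.
+;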
+define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0124
+; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0142
+; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0412
+; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_4012
+; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0145
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_0451
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,3,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_4501
+; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
+; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v4i32_4015
+; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
+; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0,1,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
+ ret <4 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
new file mode 100644
index 0000000..5d1922a
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -0,0 +1,493 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
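+; <8 x i16> shuffles are assembled from pshufd on dword pairs plus pshuflw /
+; pshufhw for reorderings within each 64-bit half, as the checks below show.
+;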
+
+define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; CHECK-SSE2-NEXT: punpcklqdq %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
+; CHECK-SSE2-NEXT: punpckhwd %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: punpcklwd %xmm3, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
+ ret <8 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
new file mode 100644
index 0000000..e60ecb7
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+
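+; A worked note on the immediates used below (a sketch of the reasoning, not
+; checked output): a pshufd/pshuflw/pshufhw immediate packs four 2-bit lane
+; indices, low bits first. 27 is 0b00011011, i.e. indices 3,2,1,0 -- a full
+; reversal -- so applying the same shuffle twice is the identity and the pair
+; should fold away entirely. Likewise an immediate of 0 splats lane 0, after
+; which any further shuffle of the result is redundant (see combine_pshufd6).
+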
+define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
+ ret <4 x i32> %c
+}
+
+define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd2
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd4
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd5
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd6
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufd $0
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
+ %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
+ ret <4 x i32> %c
+}
+
+define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ ret <8 x i16> %c
+}
+
+define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw2
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufhw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 0cf03fc..42cf06a 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -262,3 +262,17 @@ define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
; CHECK: movsd
; CHECK: ret
+define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
+; CHECK-LABEL: select_of_shuffles_0
+; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]]
+; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]]
+; CHECK: subps [[REGB]], [[REGA]]
+ %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+ %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1
+ %4 = shufflevector <2 x float> %b0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %5 = shufflevector <2 x float> %b1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+ %6 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %5, <4 x float> %4
+ %7 = fsub <4 x float> %3, %6
+ ret <4 x float> %7
+}
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 1bc06a7..19b84f1 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -1,8 +1,9 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: psraw
-; CHECK: psraw
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
+; CHECK-LABEL: update:
+; CHECK-WIDE-LABEL: update:
entry:
%dst_i.addr = alloca i64* ; <i64**> [#uses=2]
%src_i.addr = alloca i64* ; <i64**> [#uses=2]
@@ -44,6 +45,26 @@ forbody: ; preds = %forcond
%shr = ashr <8 x i8> %add, < i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2 > ; <<8 x i8>> [#uses=1]
store <8 x i8> %shr, <8 x i8>* %arrayidx10
br label %forinc
+; CHECK: %forbody
+; CHECK: pmovzxbw
+; CHECK-NEXT: paddw
+; CHECK-NEXT: psllw $8
+; CHECK-NEXT: psraw $8
+; CHECK-NEXT: psraw $2
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: movlpd
+;
+; FIXME: We shouldn't require both a movd and an insert.
+; CHECK-WIDE: %forbody
+; CHECK-WIDE: movd
+; CHECK-WIDE-NEXT: pinsrd
+; CHECK-WIDE-NEXT: paddb
+; CHECK-WIDE-NEXT: psrlw $2
+; CHECK-WIDE-NEXT: pand
+; CHECK-WIDE-NEXT: pxor
+; CHECK-WIDE-NEXT: psubb
+; CHECK-WIDE-NEXT: pextrd
+; CHECK-WIDE-NEXT: movd
forinc: ; preds = %forbody
%tmp15 = load i32* %i ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/widen_cast-6.ll b/test/CodeGen/X86/widen_cast-6.ll
index 7c06ad8..46d8dd7 100644
--- a/test/CodeGen/X86/widen_cast-6.ll
+++ b/test/CodeGen/X86/widen_cast-6.ll
@@ -1,9 +1,13 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
-; CHECK: movd
; Test bit convert that requires widening in the operand.
define i32 @return_v2hi() nounwind {
+; CHECK-LABEL: @return_v2hi
+; CHECK: pushl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popl
+; CHECK-NEXT: ret
entry:
%retval12 = bitcast <2 x i16> zeroinitializer to i32 ; <i32> [#uses=1]
ret i32 %retval12
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
new file mode 100644
index 0000000..522ab47
--- /dev/null
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mcpu=x86-64 -x86-experimental-vector-widening-legalization -x86-experimental-vector-shuffle-lowering | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
+; CHECK-LABEL: zext_v4i8_to_v4i32:
+;
+; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]]
+; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
+; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: ret
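+;
+; (A sketch of why this sequence suffices: interleaving with a zeroed register
+; at byte granularity twice widens each byte to 32 bits. The first punpcklbw
+; yields <b0,0,b1,0,...>, a zext to i16; the second spreads those pairs into
+; <b0,0,0,0,b1,0,0,0,...>, a zext to i32. No pmovzx is needed at the SSE2
+; baseline targeted by -mcpu=x86-64.)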
+
+ %val = load <4 x i8>* %ptr
+ %ext = zext <4 x i8> %val to <4 x i32>
+ ret <4 x i32> %ext
+}
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 803402b..a355b75 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -33,7 +33,9 @@ entry:
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
entry:
; CHECK-LABEL: shuf3:
-; CHECK: shufps
+; CHECK-NOT: movlhps
+; CHECK-NOT: shufps
+; CHECK: pshufd
%shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
%tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/X86/win64_eh.ll b/test/CodeGen/X86/win64_eh.ll
new file mode 100644
index 0000000..f1f874e
--- /dev/null
+++ b/test/CodeGen/X86/win64_eh.ll
@@ -0,0 +1,170 @@
+; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+
+; Check function without prolog
+define void @foo0() uwtable {
+entry:
+ ret void
+}
+; WIN64-LABEL: foo0:
+; WIN64: .seh_proc foo0
+; WIN64: .seh_endprologue
+; WIN64: ret
+; WIN64: .seh_endproc
+
+; Checks a small stack allocation
+define void @foo1() uwtable {
+entry:
+ %baz = alloca [2000 x i16], align 2
+ ret void
+}
+; WIN64-LABEL: foo1:
+; WIN64: .seh_proc foo1
+; WIN64: subq $4000, %rsp
+; WIN64: .seh_stackalloc 4000
+; WIN64: .seh_endprologue
+; WIN64: addq $4000, %rsp
+; WIN64: ret
+; WIN64: .seh_endproc
+
+; Checks a stack allocation requiring call to __chkstk/___chkstk_ms
+define void @foo2() uwtable {
+entry:
+ %baz = alloca [4000 x i16], align 2
+ ret void
+}
+; WIN64-LABEL: foo2:
+; WIN64: .seh_proc foo2
+; WIN64: movabsq $8000, %rax
+; WIN64: callq {{__chkstk|___chkstk_ms}}
+; WIN64: subq %rax, %rsp
+; WIN64: .seh_stackalloc 8000
+; WIN64: .seh_endprologue
+; WIN64: addq $8000, %rsp
+; WIN64: ret
+; WIN64: .seh_endproc
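+;
+; (Why foo2 probes and foo1 does not, as a sketch: Win64 requires the prologue
+; to touch the stack one page at a time, so frames larger than a 4096-byte
+; page call __chkstk/___chkstk_ms before adjusting rsp. foo1's 2000 x i16 =
+; 4000 bytes stays under a page and is allocated with a plain subq; foo2's
+; 4000 x i16 = 8000 bytes does not, hence the movabsq/callq pair above.)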
+
+
+; Checks stack push
+define i32 @foo3(i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
+entry:
+ %a = alloca i32
+ %b = alloca i32
+ %c = alloca i32
+ %d = alloca i32
+ %e = alloca i32
+ %f = alloca i32
+ store i32 %a_arg, i32* %a
+ store i32 %b_arg, i32* %b
+ store i32 %c_arg, i32* %c
+ store i32 %d_arg, i32* %d
+ store i32 %e_arg, i32* %e
+ store i32 %f_arg, i32* %f
+ %tmp = load i32* %a
+ %tmp1 = mul i32 %tmp, 2
+ %tmp2 = load i32* %b
+ %tmp3 = mul i32 %tmp2, 3
+ %tmp4 = add i32 %tmp1, %tmp3
+ %tmp5 = load i32* %c
+ %tmp6 = mul i32 %tmp5, 5
+ %tmp7 = add i32 %tmp4, %tmp6
+ %tmp8 = load i32* %d
+ %tmp9 = mul i32 %tmp8, 7
+ %tmp10 = add i32 %tmp7, %tmp9
+ %tmp11 = load i32* %e
+ %tmp12 = mul i32 %tmp11, 11
+ %tmp13 = add i32 %tmp10, %tmp12
+ %tmp14 = load i32* %f
+ %tmp15 = mul i32 %tmp14, 13
+ %tmp16 = add i32 %tmp13, %tmp15
+ ret i32 %tmp16
+}
+; WIN64-LABEL: foo3:
+; WIN64: .seh_proc foo3
+; WIN64: pushq %rsi
+; WIN64: .seh_pushreg 6
+; WIN64: subq $24, %rsp
+; WIN64: .seh_stackalloc 24
+; WIN64: .seh_endprologue
+; WIN64: addq $24, %rsp
+; WIN64: popq %rsi
+; WIN64: ret
+; WIN64: .seh_endproc
+
+
+; Check emission of eh handler and handler data
+declare i32 @_d_eh_personality(i32, i32, i64, i8*, i8*)
+declare void @_d_eh_resume_unwind(i8*)
+
+declare i32 @bar()
+
+define i32 @foo4() #0 {
+entry:
+ %step = alloca i32, align 4
+ store i32 0, i32* %step
+ %tmp = load i32* %step
+
+ %tmp1 = invoke i32 @bar()
+ to label %finally unwind label %landingpad
+
+finally:
+ store i32 1, i32* %step
+ br label %endtryfinally
+
+landingpad:
+ %landing_pad = landingpad { i8*, i32 } personality i32 (i32, i32, i64, i8*, i8*)* @_d_eh_personality
+ cleanup
+ %tmp3 = extractvalue { i8*, i32 } %landing_pad, 0
+ store i32 2, i32* %step
+ call void @_d_eh_resume_unwind(i8* %tmp3)
+ unreachable
+
+endtryfinally:
+ %tmp10 = load i32* %step
+ ret i32 %tmp10
+}
+; WIN64-LABEL: foo4:
+; WIN64: .seh_proc foo4
+; WIN64: .seh_handler _d_eh_personality, @unwind, @except
+; WIN64: subq $56, %rsp
+; WIN64: .seh_stackalloc 56
+; WIN64: .seh_endprologue
+; WIN64: addq $56, %rsp
+; WIN64: ret
+; WIN64: .seh_handlerdata
+; WIN64: .seh_endproc
+
+
+; Check stack re-alignment and xmm spilling
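+; (The asm clobbers rbx, rdi, xmm6 and xmm7 -- all callee-saved in the Win64
+; ABI, which includes xmm6-xmm15 -- so the prologue must push/spill them, and
+; the 64-byte-aligned local additionally forces an rbp frame plus an
+; andq $-64, %rsp realignment, all of which the .seh_* directives below have
+; to describe.)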
+define void @foo5() uwtable {
+entry:
+ %s = alloca i32, align 64
+ call void asm sideeffect "", "~{rbx},~{rdi},~{xmm6},~{xmm7}"()
+ ret void
+}
+; WIN64-LABEL: foo5:
+; WIN64: .seh_proc foo5
+; WIN64: pushq %rbp
+; WIN64: .seh_pushreg 5
+; WIN64: movq %rsp, %rbp
+; WIN64: pushq %rdi
+; WIN64: .seh_pushreg 7
+; WIN64: pushq %rbx
+; WIN64: .seh_pushreg 3
+; WIN64: andq $-64, %rsp
+; WIN64: subq $128, %rsp
+; WIN64: .seh_stackalloc 48
+; WIN64: .seh_setframe 5, 64
+; WIN64: movaps %xmm7, -32(%rbp) # 16-byte Spill
+; WIN64: movaps %xmm6, -48(%rbp) # 16-byte Spill
+; WIN64: .seh_savexmm 6, 16
+; WIN64: .seh_savexmm 7, 32
+; WIN64: .seh_endprologue
+; WIN64: movaps -48(%rbp), %xmm6 # 16-byte Reload
+; WIN64: movaps -32(%rbp), %xmm7 # 16-byte Reload
+; WIN64: leaq -16(%rbp), %rsp
+; WIN64: popq %rbx
+; WIN64: popq %rdi
+; WIN64: popq %rbp
+; WIN64: retq
+; WIN64: .seh_endproc
diff --git a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index 5d7a10b..08d0257 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -3,7 +3,7 @@
; clang -Oz -c test1.cpp -emit-llvm -S -o
; Verify that we generate the shld instruction when we are optimizing for size,
; even for X86_64 processors that are known to have poor latency double
-; precision shift instuctions.
+; precision shift instructions.
; uint64_t lshift10(uint64_t a, uint64_t b)
; {
; return (a << 10) | (b >> 54);
@@ -25,7 +25,7 @@ attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"
; clang -Os -c test2.cpp -emit-llvm -S
; Verify that we generate the shld instruction when we are optimizing for size,
; even for X86_64 processors that are known to have poor latency double
-; precision shift instuctions.
+; precision shift instructions.
; uint64_t lshift11(uint64_t a, uint64_t b)
; {
; return (a << 11) | (b >> 53);
@@ -46,7 +46,7 @@ attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false"
; clang -O2 -c test2.cpp -emit-llvm -S
; Verify that we do not generate the shld instruction when we are not optimizing
; for size for X86_64 processors that are known to have poor latency double
-; precision shift instuctions.
+; precision shift instructions.
; uint64_t lshift12(uint64_t a, uint64_t b)
; {
; return (a << 12) | (b >> 52);
diff --git a/test/CodeGen/X86/x86-64-frameaddr.ll b/test/CodeGen/X86/x86-64-frameaddr.ll
deleted file mode 100644
index 7d36a7a..0000000
--- a/test/CodeGen/X86/x86-64-frameaddr.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-; CHECK: stack_end_address
-; CHECK: {{movq.+rbp.*$}}
-; CHECK: {{movq.+rbp.*$}}
-; CHECK: ret
-
-define i64* @stack_end_address() nounwind {
-entry:
- tail call i8* @llvm.frameaddress( i32 0 )
- bitcast i8* %0 to i64*
- ret i64* %1
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/X86/x86-64-static-relo-movl.ll b/test/CodeGen/X86/x86-64-static-relo-movl.ll
new file mode 100644
index 0000000..71e52bb
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-static-relo-movl.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=x86_64-pc-win32-macho -relocation-model=static -O0 < %s | FileCheck %s
+
+; Ensure that we generate a movabsq, not a 32-bit movl or an lea, for a
+; static relocation when compiling for a 64-bit target.
+
+%struct.MatchInfo = type [64 x i64]
+
+@NO_MATCH = internal constant %struct.MatchInfo zeroinitializer, align 8
+
+define void @setup() {
+ %pending = alloca %struct.MatchInfo, align 8
+ %t = bitcast %struct.MatchInfo* %pending to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t, i8* bitcast (%struct.MatchInfo* @NO_MATCH to i8*), i64 512, i32 8, i1 false)
+ %u = getelementptr inbounds %struct.MatchInfo* %pending, i32 0, i32 2
+ %v = load i64* %u, align 8
+ br label %done
+done:
+ ret void
+
+ ; CHECK: movabsq $_NO_MATCH, {{.*}}
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
diff --git a/test/CodeGen/X86/x86-frameaddr.ll b/test/CodeGen/X86/x86-frameaddr.ll
deleted file mode 100644
index d595874..0000000
--- a/test/CodeGen/X86/x86-frameaddr.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 | grep mov | grep ebp
-
-define i8* @t() nounwind {
-entry:
- %0 = tail call i8* @llvm.frameaddress(i32 0)
- ret i8* %0
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/X86/x86-frameaddr2.ll b/test/CodeGen/X86/x86-frameaddr2.ll
deleted file mode 100644
index c509115..0000000
--- a/test/CodeGen/X86/x86-frameaddr2.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 | grep mov | count 3
-
-define i8* @t() nounwind {
-entry:
- %0 = tail call i8* @llvm.frameaddress(i32 2)
- ret i8* %0
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
new file mode 100644
index 0000000..d885f1c
--- /dev/null
+++ b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mattr=+avx < %s | FileCheck %s
+
+; Check that we properly upgrade the AVX vbroadcast intrinsics to IR. The
+; expectation is that we should still get the original instruction back that
+; maps to the intrinsic.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
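+; Presumably the upgrade rewrites each call into a scalar load plus a splat,
+; roughly like the following sketch, which the backend then matches back to a
+; vbroadcast instruction:
+;
+;   %f = load float* %__a
+;   %v = insertelement <4 x float> undef, float %f, i32 0
+;   %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+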
+; CHECK-LABEL: test_mm_broadcast_ss:
+define <4 x float> @test_mm_broadcast_ss(float* readonly %__a){
+entry:
+ %0 = bitcast float* %__a to i8*
+; CHECK: vbroadcastss (%{{.*}}), %xmm
+ %1 = tail call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %0)
+ ret <4 x float> %1
+}
+
+; CHECK-LABEL: test_mm256_broadcast_sd:
+define <4 x double> @test_mm256_broadcast_sd(double* readonly %__a) {
+entry:
+ %0 = bitcast double* %__a to i8*
+; CHECK: vbroadcastsd (%{{.*}}), %ymm
+ %1 = tail call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %0)
+ ret <4 x double> %1
+}
+
+; CHECK-LABEL: test_mm256_broadcast_ss:
+define <8 x float> @test_mm256_broadcast_ss(float* readonly %__a) {
+entry:
+ %0 = bitcast float* %__a to i8*
+; CHECK: vbroadcastss (%{{.*}}), %ymm
+ %1 = tail call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %0)
+ ret <8 x float> %1
+}
+
+declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
+
+declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
+
+declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*)
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
new file mode 100644
index 0000000..f078631
--- /dev/null
+++ b/test/CodeGen/X86/xaluo.ll
@@ -0,0 +1,743 @@
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=DAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s
+
+;
+; Get the actual value of the overflow bit.
+;
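+; Each llvm.*.with.overflow intrinsic returns a {value, i1} pair; the tests
+; below extract both and expect the i1 to be read straight off the flags set
+; by the arithmetic itself -- seto for signed overflow (OF), setb for the
+; unsigned carry (CF) -- with no extra compare.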
+; SADDO reg, reg
+define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
+entry:
+; DAG-LABEL: saddo.i8
+; DAG: addb %sil, %dil
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i8
+; FAST: addb %sil, %dil
+; FAST-NEXT: seto %al
+ %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
+entry:
+; DAG-LABEL: saddo.i16
+; DAG: addw %si, %di
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i16
+; FAST: addw %si, %di
+; FAST-NEXT: seto %al
+ %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: saddo.i32
+; DAG: addl %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i32
+; FAST: addl %esi, %edi
+; FAST-NEXT: seto %al
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64
+; DAG: addq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64
+; FAST: addq %rsi, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; SADDO reg, imm | imm, reg
+; FIXME: INC isn't supported in FastISel yet
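+; (Both forms stay correct for seto, as a sketch of the reasoning: INC sets
+; OF/SF/ZF just like ADD and only leaves CF untouched, so signed-overflow
+; detection is unaffected; only an unsigned setb/CF check would rule INC out.)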
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm1
+; DAG: incq %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64imm1
+; FAST: addq $1, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; FIXME: DAG doesn't optimize immediates on the LHS.
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm2
+; DAG: mov
+; DAG-NEXT: addq
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm2
+; FAST: addq $1, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; Check boundary conditions for large immediates.
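+; (Background, as a sketch: x86-64 add takes at most a sign-extended 32-bit
+; immediate, so constants in [-2147483648, 2147483647] fold directly into the
+; addq, while anything outside that range must first be materialized in a
+; register -- movabsq in general, or a zero-extending movl when the value
+; fits in unsigned 32 bits, as with 2147483648 in saddo.i64imm6.)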
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm3
+; DAG: addq $-2147483648, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: saddo.i64imm3
+; FAST: addq $-2147483648, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm4
+; DAG: movabsq $-21474836489, %[[REG:[a-z]+]]
+; DAG-NEXT: addq %rdi, %[[REG]]
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm4
+; FAST: movabsq $-21474836489, %[[REG:[a-z]+]]
+; FAST-NEXT: addq %rdi, %[[REG]]
+; FAST-NEXT: seto
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm5
+; DAG: addq $2147483647, %rdi
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm5
+; FAST: addq $2147483647, %rdi
+; FAST-NEXT: seto
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; TODO: FastISel shouldn't use movabsq.
+define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+entry:
+; DAG-LABEL: saddo.i64imm6
+; DAG: movl $2147483648, %ecx
+; DAG: addq %rdi, %rcx
+; DAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm6
+; FAST: movabsq $2147483648, %[[REG:[a-z]+]]
+; FAST: addq %rdi, %[[REG]]
+; FAST-NEXT: seto
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; UADDO
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: uaddo.i32
+; DAG: addl %esi, %edi
+; DAG-NEXT: setb %al
+; FAST-LABEL: uaddo.i32
+; FAST: addl %esi, %edi
+; FAST-NEXT: setb %al
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: uaddo.i64
+; DAG: addq %rsi, %rdi
+; DAG-NEXT: setb %al
+; FAST-LABEL: uaddo.i64
+; FAST: addq %rsi, %rdi
+; FAST-NEXT: setb %al
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; SSUBO
+define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: ssubo.i32
+; DAG: subl %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: ssubo.i32
+; FAST: subl %esi, %edi
+; FAST-NEXT: seto %al
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: ssubo.i64
+; DAG: subq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: ssubo.i64
+; FAST: subq %rsi, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; USUBO
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: usubo.i32
+; DAG: subl %esi, %edi
+; DAG-NEXT: setb %al
+; FAST-LABEL: usubo.i32
+; FAST: subl %esi, %edi
+; FAST-NEXT: setb %al
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: usubo.i64
+; DAG: subq %rsi, %rdi
+; DAG-NEXT: setb %al
+; FAST-LABEL: usubo.i64
+; FAST: subq %rsi, %rdi
+; FAST-NEXT: setb %al
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; SMULO
+define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
+entry:
+; FAST-LABEL: smulo.i8
+; FAST: movb %dil, %al
+; FAST-NEXT: imulb %sil
+; FAST-NEXT: seto %cl
+ %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
+entry:
+; DAG-LABEL: smulo.i16
+; DAG: imulw %si, %di
+; DAG-NEXT: seto %al
+; FAST-LABEL: smulo.i16
+; FAST: imulw %si, %di
+; FAST-NEXT: seto %al
+ %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: smulo.i32
+; DAG: imull %esi, %edi
+; DAG-NEXT: seto %al
+; FAST-LABEL: smulo.i32
+; FAST: imull %esi, %edi
+; FAST-NEXT: seto %al
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: smulo.i64
+; DAG: imulq %rsi, %rdi
+; DAG-NEXT: seto %al
+; FAST-LABEL: smulo.i64
+; FAST: imulq %rsi, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+; UMULO
+define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
+entry:
+; FAST-LABEL: umulo.i8
+; FAST: movb %dil, %al
+; FAST-NEXT: mulb %sil
+; FAST-NEXT: seto %cl
+ %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
+entry:
+; DAG-LABEL: umulo.i16
+; DAG: mulw %si
+; DAG-NEXT: seto
+; FAST-LABEL: umulo.i16
+; FAST: mulw %si
+; FAST-NEXT: seto
+ %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; DAG-LABEL: umulo.i32
+; DAG: mull %esi
+; DAG-NEXT: seto
+; FAST-LABEL: umulo.i32
+; FAST: mull %esi
+; FAST-NEXT: seto
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; DAG-LABEL: umulo.i64
+; DAG: mulq %rsi
+; DAG-NEXT: seto
+; FAST-LABEL: umulo.i64
+; FAST: mulq %rsi
+; FAST-NEXT: seto
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
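+; The overflow i1 feeds the select directly, so the expectation is a single
+; conditional move keyed off the flags from the arithmetic -- cmovo for the
+; signed intrinsics, cmovb for the unsigned ones -- with no setcc/test pair
+; in between.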
+define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovbl %edi, %esi
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rsi
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovol %edi, %esi
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovoq %rdi, %rsi
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovbl %edi, %esi
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rsi
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i32
+; CHECK: imull %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i64
+; CHECK: imulq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i32
+; CHECK: mull %esi
+; CHECK-NEXT: cmovol %edi, %esi
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: cmovoq %rdi, %rsi
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+
+;
+; Check the use of the overflow bit in combination with a branch instruction.
+;
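+; As with the selects, the branch should consume the flags directly (jo/jb
+; immediately after the arithmetic). The !prof weights on each branch
+; ({0, 2147483647}, defined as !0 at the end of the file) mark the overflow
+; successor cold.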
+define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jo
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jo
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jb
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jb
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jo
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jo
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jb
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jb
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i32
+; CHECK: imull %esi, %edi
+; CHECK-NEXT: jo
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i64
+; CHECK: imulq %rsi, %rdi
+; CHECK-NEXT: jo
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i32
+; CHECK: mull %esi
+; CHECK-NEXT: jo
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: jo
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone
+declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8, i1} @llvm.smul.with.overflow.i8 (i8, i8 ) nounwind readnone
+declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8, i1} @llvm.umul.with.overflow.i8 (i8, i8 ) nounwind readnone
+declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
new file mode 100644
index 0000000..2f4b231
--- /dev/null
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=xcore-unknown-unknown -O0 | FileCheck %s
+
+; target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32"
+; target triple = "xcore"
+
+; CHECK-LABEL: f
+; CHECK: entsp 2
+; ...the prologue...
+; CHECK: .loc 1 2 0 prologue_end # :2:0
+; CHECK: add r0, r0, 1
+; CHECK: retsp 2
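+; ('prologue_end' sets the DWARF line-table prologue_end flag on this row;
+; debuggers use it to place function-entry breakpoints past the prologue.)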
+define i32 @f(i32 %a) {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11), !dbg !12
+ %0 = load i32* %a.addr, align 4, !dbg !12
+ %add = add nsw i32 %0, 1, !dbg !12
+ ret i32 %add, !dbg !12
+}
+
+declare void @llvm.dbg.declare(metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1}
+!1 = metadata !{metadata !"", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @f, null, null, metadata !2, i32 2}
+!5 = metadata !{i32 786473, metadata !1}
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777218, metadata !8, i32 0, i32 0}
+!12 = metadata !{i32 2, i32 0, metadata !4, null}
+
diff --git a/test/CodeGen/XCore/lit.local.cfg b/test/CodeGen/XCore/lit.local.cfg
index 3e84c1b..0b947bb 100644
--- a/test/CodeGen/XCore/lit.local.cfg
+++ b/test/CodeGen/XCore/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'XCore' in targets:
+if not 'XCore' in config.root.targets:
config.unsupported = True
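
The same two-line simplification is applied to every target-specific
lit.local.cfg in this patch: rather than re-splitting targets_to_build into a
set in each file, the root config is assumed to expose a precomputed set-like
`targets` attribute. A minimal sketch of the idea, assuming the set is built
once in the root lit.cfg from the existing targets_to_build string:

    # root lit.cfg (sketch): compute the target set once
    config.targets = frozenset(config.targets_to_build.split())

    # each per-directory lit.local.cfg then reduces to a membership test
    if not 'XCore' in config.root.targets:
        config.unsupported = True
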
diff --git a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
index c78b8b8..65907d6 100644
--- a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
@@ -1,5 +1,5 @@
; RUN: llc %s -o /dev/null
-; Here variable bar is optimzied away. Do not trip over while trying to generate debug info.
+; Here, variable bar is optimized away. Do not trip over it while trying to generate debug info.
define i32 @foo() nounwind uwtable readnone ssp {
diff --git a/test/DebugInfo/2010-01-19-DbgScope.ll b/test/DebugInfo/2010-01-19-DbgScope.ll
deleted file mode 100644
index 1a7e378..0000000
--- a/test/DebugInfo/2010-01-19-DbgScope.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -O0 < %s -o /dev/null
-; Ignore unreachable scopes.
-declare void @foo(i32) noreturn
-
-define i32 @bar() nounwind ssp {
-entry:
- br i1 undef, label %bb, label %bb11, !dbg !0
-
-bb: ; preds = %entry
- call void @foo(i32 0) noreturn nounwind, !dbg !7
- unreachable, !dbg !7
-
-bb11: ; preds = %entry
- ret i32 1, !dbg !11
-}
-
-!llvm.dbg.cu = !{!3}
-!llvm.module.flags = !{!15}
-
-!0 = metadata !{i32 8647, i32 0, metadata !1, null}
-!1 = metadata !{i32 458763, metadata !12, metadata !2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, null, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 8639, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !12, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6}
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 8648, i32 0, metadata !8, null}
-!8 = metadata !{i32 458763, metadata !12, metadata !9, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{i32 458763, metadata !12, metadata !10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 458798, null, metadata !3, metadata !"bar2", metadata !"bar2", metadata !"bar2", i32 8639, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 8652, i32 0, metadata !1, null}
-!12 = metadata !{metadata !"c-parser.c", metadata !"llvmgcc"}
-!13 = metadata !{i32 0}
-!14 = metadata !{metadata !2}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/AArch64/eh_frame.s b/test/DebugInfo/AArch64/eh_frame.s
index d8d6b6d..12a5896 100644
--- a/test/DebugInfo/AArch64/eh_frame.s
+++ b/test/DebugInfo/AArch64/eh_frame.s
@@ -17,7 +17,7 @@ foo:
// Output is:
// CHECK: Contents of section .eh_frame:
-// CHECK-NEXT: 0000 10000000 00000000 017a5200 017c1e01 .........zR..|..
+// CHECK-NEXT: 0000 10000000 00000000 037a5200 017c1e01 .........zR..|..
// CHECK-NEXT: 0010 1b0c1f00 10000000 18000000 00000000 ................
@@ -30,7 +30,7 @@ foo:
// -------------------
// 10000000: length of first CIE = 0x10
// 00000000: This is a CIE
-// 01: version = 0x1
+// 03: version = 0x3
// 7a 52 00: augmentation string "zR" -- pointer format is specified
// 01: code alignment factor 1
// 7c: data alignment factor -4
diff --git a/test/DebugInfo/AArch64/eh_frame_personality.ll b/test/DebugInfo/AArch64/eh_frame_personality.ll
index d35f2a2..51d6bf8 100644
--- a/test/DebugInfo/AArch64/eh_frame_personality.ll
+++ b/test/DebugInfo/AArch64/eh_frame_personality.ll
@@ -16,7 +16,7 @@ clean:
}
; CHECK: Contents of section .eh_frame:
-; CHECK: 0000 1c000000 00000000 017a504c 5200017c .........zPLR..|
+; CHECK: 0000 1c000000 00000000 037a504c 5200017c .........zPLR..|
; CHECK: 0010 1e0b0000 00000000 00000000 1b0c1f00 ................
; Don't really care about the rest:
@@ -33,7 +33,7 @@ clean:
; ----------
; 1c000000: Length = 0x1c
; 00000000: This is a CIE
-; 01: Version 1
+; 03: Version 3
; 7a 50 4c 52 00: Augmentation string "zPLR" (personality routine, language-specific data, pointer format)
; 01: Code alignment factor 1
; 78: Data alignment factor: -8
diff --git a/test/DebugInfo/AArch64/lit.local.cfg b/test/DebugInfo/AArch64/lit.local.cfg
index a75a42b..cec29af 100644
--- a/test/DebugInfo/AArch64/lit.local.cfg
+++ b/test/DebugInfo/AArch64/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/AArch64/struct_by_value.ll b/test/DebugInfo/AArch64/struct_by_value.ll
index 0023c3d..0e336f7 100644
--- a/test/DebugInfo/AArch64/struct_by_value.ll
+++ b/test/DebugInfo/AArch64/struct_by_value.ll
@@ -1,7 +1,9 @@
; A by-value struct is a register-indirect value (breg).
; RUN: llc %s -filetype=asm -o - | FileCheck %s
-; CHECK: DW_OP_breg0
+; CHECK: DW_AT_location
+; CHECK-NEXT: .byte 112
+; 112 = 0x70 = DW_OP_breg0
; rdar://problem/13658587
;
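
DW_OP_breg0 is opcode 0x70 (decimal 112); the check changes from dwarfdump's
symbolic `DW_OP_breg0` to a raw `.byte 112`, presumably because the test now
matches llc's assembler output, where the location expression under
DW_AT_location is emitted as raw bytes rather than disassembled operators.
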
diff --git a/test/DebugInfo/ARM/lit.local.cfg b/test/DebugInfo/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/DebugInfo/ARM/lit.local.cfg
+++ b/test/DebugInfo/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/COFF/lit.local.cfg b/test/DebugInfo/COFF/lit.local.cfg
index 19840aa..c8625f4 100644
--- a/test/DebugInfo/COFF/lit.local.cfg
+++ b/test/DebugInfo/COFF/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/Inputs/arange-overlap.cc b/test/DebugInfo/Inputs/arange-overlap.cc
new file mode 100644
index 0000000..82e3f12
--- /dev/null
+++ b/test/DebugInfo/Inputs/arange-overlap.cc
@@ -0,0 +1,26 @@
+void call();
+
+struct S {
+ static void foo() { call(); call(); }
+ static void bar() { call(); call(); }
+ static void baz() {}
+};
+
+#ifdef FILE1
+# define FUNC_NAME func1
+# define FUNC_BODY \
+ S::foo(); S::bar(); S::baz();
+#else
+# define FUNC_NAME func2
+# define FUNC_BODY \
+ S::bar();
+#endif
+
+void FUNC_NAME() {
+ FUNC_BODY
+}
+
+// Build instructions:
+// $ clang -g -fPIC -c -DFILE1 arange-overlap.cc -o obj1.o
+// $ clang -g -fPIC -c arange-overlap.cc -o obj2.o
+// $ clang -shared obj1.o obj2.o -o <output>
diff --git a/test/DebugInfo/Inputs/arange-overlap.elf-x86_64 b/test/DebugInfo/Inputs/arange-overlap.elf-x86_64
new file mode 100755
index 0000000..075e9c2
--- /dev/null
+++ b/test/DebugInfo/Inputs/arange-overlap.elf-x86_64
Binary files differ
diff --git a/test/DebugInfo/Inputs/fission-ranges.cc b/test/DebugInfo/Inputs/fission-ranges.cc
new file mode 100644
index 0000000..a585bf9
--- /dev/null
+++ b/test/DebugInfo/Inputs/fission-ranges.cc
@@ -0,0 +1,17 @@
+static inline int inlined_f() {
+ volatile int x = 2;
+ return x;
+}
+
+int main() {
+ return inlined_f();
+}
+
+// Build instructions:
+// $ mkdir /tmp/dbginfo
+// $ cp fission-ranges.cc /tmp/dbginfo/
+// $ cd /tmp/dbginfo
+// $ gcc -gsplit-dwarf -O2 -fPIC fission-ranges.cc -c -o obj2.o
+// $ clang -gsplit-dwarf -O2 -fsanitize=address -fPIC -Dmain=foo fission-ranges.cc -c -o obj1.o
+// $ gcc obj1.o obj2.o -shared -o <output>
+// $ objcopy --remove-section=.debug_aranges <output>
diff --git a/test/DebugInfo/Inputs/fission-ranges.elf-x86_64 b/test/DebugInfo/Inputs/fission-ranges.elf-x86_64
new file mode 100755
index 0000000..3d2fd79
--- /dev/null
+++ b/test/DebugInfo/Inputs/fission-ranges.elf-x86_64
Binary files differ
diff --git a/test/DebugInfo/Mips/lit.local.cfg b/test/DebugInfo/Mips/lit.local.cfg
index 88262fb..7d12f7a 100644
--- a/test/DebugInfo/Mips/lit.local.cfg
+++ b/test/DebugInfo/Mips/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Mips' in targets:
+if not 'Mips' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/PR20038.ll b/test/DebugInfo/PR20038.ll
new file mode 100644
index 0000000..61145e5
--- /dev/null
+++ b/test/DebugInfo/PR20038.ll
@@ -0,0 +1,168 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; IR generated from clang -O0 with:
+; struct C {
+; ~C();
+; };
+; extern bool b;
+; void fun4() { b && (C(), 1); }
+; __attribute__((always_inline)) C::~C() { }
+
+; CHECK: DW_TAG_structure_type
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "C"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[C_DTOR_DECL:.*]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "~C"
+
+; CHECK: [[D1_ABS:.*]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_MIPS_linkage_name {{.*}} "_ZN1CD1Ev"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[D1_THIS_ABS:.*]]: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "this"
+
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "fun4"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_lexical_block
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[D1_ABS]]}
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[D1_THIS_ABS]]}
+
+; FIXME: D2 is actually inlined into D1 but doesn't show up here, possibly
+; because there is no work in D2 (calling another member function from the
+; dtor makes D2 show up; calling a free function doesn't).
+
+; CHECK-NOT: DW_TAG
+; CHECK: NULL
+; CHECK-NOT: DW_TAG
+; CHECK: NULL
+; CHECK-NOT: DW_TAG
+; CHECK: NULL
+
+%struct.C = type { i8 }
+
+@b = external global i8
+
+; Function Attrs: nounwind
+define void @_Z4fun4v() #0 {
+entry:
+ %this.addr.i.i = alloca %struct.C*, align 8, !dbg !21
+ %this.addr.i = alloca %struct.C*, align 8, !dbg !22
+ %agg.tmp.ensured = alloca %struct.C, align 1
+ %cleanup.cond = alloca i1
+ %0 = load i8* @b, align 1, !dbg !24
+ %tobool = trunc i8 %0 to i1, !dbg !24
+ store i1 false, i1* %cleanup.cond
+ br i1 %tobool, label %land.rhs, label %land.end, !dbg !24
+
+land.rhs: ; preds = %entry
+ store i1 true, i1* %cleanup.cond, !dbg !25
+ br label %land.end
+
+land.end: ; preds = %land.rhs, %entry
+ %1 = phi i1 [ false, %entry ], [ true, %land.rhs ]
+ %cleanup.is_active = load i1* %cleanup.cond, !dbg !27
+ br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done, !dbg !27
+
+cleanup.action: ; preds = %land.end
+ store %struct.C* %agg.tmp.ensured, %struct.C** %this.addr.i, align 8, !dbg !22
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !29), !dbg !31
+ %this1.i = load %struct.C** %this.addr.i, !dbg !22
+ store %struct.C* %this1.i, %struct.C** %this.addr.i.i, align 8, !dbg !21
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i.i}, metadata !32), !dbg !33
+ %this1.i.i = load %struct.C** %this.addr.i.i, !dbg !21
+ br label %cleanup.done, !dbg !22
+
+cleanup.done: ; preds = %cleanup.action, %land.end
+ ret void, !dbg !34
+}
+
+; Function Attrs: alwaysinline nounwind
+define void @_ZN1CD1Ev(%struct.C* %this) unnamed_addr #1 align 2 {
+entry:
+ %this.addr.i = alloca %struct.C*, align 8, !dbg !37
+ %this.addr = alloca %struct.C*, align 8
+ store %struct.C* %this, %struct.C** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !29), !dbg !38
+ %this1 = load %struct.C** %this.addr
+ store %struct.C* %this1, %struct.C** %this.addr.i, align 8, !dbg !37
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !32), !dbg !39
+ %this1.i = load %struct.C** %this.addr.i, !dbg !37
+ ret void, !dbg !37
+}
+
+; Function Attrs: alwaysinline nounwind
+define void @_ZN1CD2Ev(%struct.C* %this) unnamed_addr #1 align 2 {
+entry:
+ %this.addr = alloca %struct.C*, align 8
+ store %struct.C* %this, %struct.C** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !32), !dbg !40
+ %this1 = load %struct.C** %this.addr
+ ret void, !dbg !41
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18, !19}
+!llvm.ident = !{!20}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !"PR20038.cpp", metadata !"/tmp/dbginfo"}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"", i32 2, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [~C]
+!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{null, metadata !10}
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!11 = metadata !{metadata !12, metadata !16, metadata !17}
+!12 = metadata !{i32 786478, metadata !5, metadata !13, metadata !"fun4", metadata !"fun4", metadata !"_Z4fun4v", i32 5, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4fun4v, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [fun4]
+!13 = metadata !{i32 786473, metadata !5} ; [ DW_TAG_file_type ] [/tmp/dbginfo/PR20038.cpp]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{null}
+!16 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"_ZN1CD2Ev", i32 6, metadata !8, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1CD2Ev, null, metadata !7, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!17 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"_ZN1CD1Ev", i32 6, metadata !8, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1CD1Ev, null, metadata !7, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{metadata !"clang version 3.5.0 "}
+!21 = metadata !{i32 6, i32 0, metadata !17, metadata !22}
+!22 = metadata !{i32 5, i32 0, metadata !23, null}
+!23 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 3, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!24 = metadata !{i32 5, i32 0, metadata !12, null}
+!25 = metadata !{i32 5, i32 0, metadata !26, null}
+!26 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!27 = metadata !{i32 5, i32 0, metadata !28, null}
+!28 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 2, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!29 = metadata !{i32 786689, metadata !17, metadata !"this", null, i32 16777216, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!31 = metadata !{i32 0, i32 0, metadata !17, metadata !22}
+!32 = metadata !{i32 786689, metadata !16, metadata !"this", null, i32 16777216, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!33 = metadata !{i32 0, i32 0, metadata !16, metadata !21}
+!34 = metadata !{i32 5, i32 0, metadata !35, null}
+!35 = metadata !{i32 786443, metadata !5, metadata !36, i32 5, i32 0, i32 5, i32 5} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!36 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 4, i32 4} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!37 = metadata !{i32 6, i32 0, metadata !17, null}
+!38 = metadata !{i32 0, i32 0, metadata !17, null}
+!39 = metadata !{i32 0, i32 0, metadata !16, metadata !37}
+!40 = metadata !{i32 0, i32 0, metadata !16, null}
+!41 = metadata !{i32 6, i32 0, metadata !16, null}
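
For context on the FIXME above: in the Itanium C++ ABI, _ZN1CD1Ev is the
complete-object destructor and _ZN1CD2Ev the base-object destructor, which is
why the checks key the abstract definition off the D1 symbol while the inlined
D2 instance is the one that fails to appear.
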
diff --git a/test/DebugInfo/PowerPC/lit.local.cfg b/test/DebugInfo/PowerPC/lit.local.cfg
index 193ebeb..0913324 100644
--- a/test/DebugInfo/PowerPC/lit.local.cfg
+++ b/test/DebugInfo/PowerPC/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/Sparc/lit.local.cfg b/test/DebugInfo/Sparc/lit.local.cfg
index e4cee97..d86c9e6 100644
--- a/test/DebugInfo/Sparc/lit.local.cfg
+++ b/test/DebugInfo/Sparc/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Sparc' in targets:
+if not 'Sparc' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/SystemZ/eh_frame.s b/test/DebugInfo/SystemZ/eh_frame.s
index 4e7afd5..6189b90 100644
--- a/test/DebugInfo/SystemZ/eh_frame.s
+++ b/test/DebugInfo/SystemZ/eh_frame.s
@@ -11,9 +11,25 @@ check_largest_class:
.cfi_offset %r13, -56
.cfi_offset %r14, -48
.cfi_offset %r15, -40
- aghi %r15, -160
- .cfi_def_cfa_offset 320
- lmg %r13, %r15, 264(%r15)
+ aghi %r15, -224
+ .cfi_def_cfa_offset 384
+ std %f8, 160(%r15)
+ std %f9, 168(%r15)
+ std %f10, 176(%r15)
+ std %f11, 184(%r15)
+ std %f12, 192(%r15)
+ std %f13, 200(%r15)
+ std %f14, 208(%r15)
+ std %f15, 216(%r15)
+ .cfi_offset %f8, -224
+ .cfi_offset %f9, -216
+ .cfi_offset %f10, -208
+ .cfi_offset %f11, -200
+ .cfi_offset %f12, -192
+ .cfi_offset %f13, -184
+ .cfi_offset %f14, -176
+ .cfi_offset %f15, -168
+ lmg %r13, %r15, 328(%r15)
br %r14
.size check_largest_class, .-check_largest_class
.cfi_endproc
@@ -22,8 +38,8 @@ check_largest_class:
#
# Contents of the .eh_frame section:
#
-# 00000000 0000001c 00000000 CIE
-# Version: 1
+# 00000000 0000000000000014 00000000 CIE
+# Version: 3
# Augmentation: "zR"
# Code alignment factor: 1
# Data alignment factor: -8
@@ -35,20 +51,29 @@ check_largest_class:
# DW_CFA_nop
# DW_CFA_nop
#
-# 00000020 0000001c 00000024 FDE cie=00000000 pc=00000000..00000012
-# DW_CFA_advance_loc: 6 to 00000006
+# 000000.. 000000000000002c 0000001c FDE cie=00000000 pc=0000000000000000..0000000000000032
+# DW_CFA_advance_loc: 6 to 0000000000000006
# DW_CFA_offset: r13 at cfa-56
# DW_CFA_offset: r14 at cfa-48
# DW_CFA_offset: r15 at cfa-40
-# DW_CFA_advance_loc: 4 to 0000000a
-# DW_CFA_def_cfa_offset: 320
-# DW_CFA_nop
+# DW_CFA_advance_loc: 4 to 000000000000000a
+# DW_CFA_def_cfa_offset: 384
+# DW_CFA_advance_loc: 32 to 000000000000002a
+# DW_CFA_offset: r24 at cfa-224
+# DW_CFA_offset: r28 at cfa-216
+# DW_CFA_offset: r25 at cfa-208
+# DW_CFA_offset: r29 at cfa-200
+# DW_CFA_offset: r26 at cfa-192
+# DW_CFA_offset: r30 at cfa-184
+# DW_CFA_offset: r27 at cfa-176
+# DW_CFA_offset: r31 at cfa-168
# DW_CFA_nop
# DW_CFA_nop
# DW_CFA_nop
#
# CHECK: Contents of section .eh_frame:
-# CHECK-NEXT: 0000 00000014 00000000 017a5200 01780e01 .........zR..x..
-# CHECK-NEXT: 0010 1b0c0fa0 01000000 0000001c 0000001c ................
-# CHECK-NEXT: 0020 00000000 00000012 00468d07 8e068f05 .........F......
-# CHECK-NEXT: 0030 440ec002 00000000 D.......
+# CHECK-NEXT: 0000 00000014 00000000 037a5200 01780e01 {{.*}}
+# CHECK-NEXT: 0010 1b0c0fa0 01000000 0000002c 0000001c {{.*}}
+# CHECK-NEXT: 0020 00000000 00000032 00468d07 8e068f05 {{.*}}
+# CHECK-NEXT: 0030 440e8003 60981c9c 1b991a9d 199a189e {{.*}}
+# CHECK-NEXT: 0040 179b169f 15000000 {{.*}}
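
The interleaved register numbers in the decoded FDE are expected: SystemZ
DWARF numbering assigns the call-saved FPRs non-contiguously, consistent with
the pairing the dump above implies:

    # %f8 -> r24    %f9 -> r28    %f10 -> r25    %f11 -> r29
    # %f12 -> r26   %f13 -> r30   %f14 -> r27    %f15 -> r31
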
diff --git a/test/DebugInfo/SystemZ/eh_frame_personality.s b/test/DebugInfo/SystemZ/eh_frame_personality.s
index 46b46db..456e0a6 100644
--- a/test/DebugInfo/SystemZ/eh_frame_personality.s
+++ b/test/DebugInfo/SystemZ/eh_frame_personality.s
@@ -37,7 +37,7 @@ DW.ref.__gxx_personality_v0:
# Contents of the .eh_frame section:
#
# 00000000 0000001c 00000000 CIE
-# Version: 1
+# Version: 3
# Augmentation: "zPLR"
# Code alignment factor: 1
# Data alignment factor: -8
@@ -61,7 +61,7 @@ DW.ref.__gxx_personality_v0:
# DW_CFA_nop
#
# CHECK: Contents of section .eh_frame:
-# CHECK-NEXT: 0000 0000001c 00000000 017a504c 52000178 .........zPLR..x
+# CHECK-NEXT: 0000 0000001c 00000000 037a504c 52000178 .........zPLR..x
# CHECK-NEXT: 0010 0e079b00 0000001b 1b0c0fa0 01000000 ................
# CHECK-NEXT: 0020 0000001c 00000024 00000000 00000012 .......$........
# CHECK-NEXT: 0030 04000000 00468e06 8f05440e c0020000 .....F....D.....
diff --git a/test/DebugInfo/SystemZ/lit.local.cfg b/test/DebugInfo/SystemZ/lit.local.cfg
index b12af09..5c02dd3 100644
--- a/test/DebugInfo/SystemZ/lit.local.cfg
+++ b/test/DebugInfo/SystemZ/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'SystemZ' in targets:
+if not 'SystemZ' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/SystemZ/variable-loc.ll b/test/DebugInfo/SystemZ/variable-loc.ll
index e0e4156..23df1cb 100644
--- a/test/DebugInfo/SystemZ/variable-loc.ll
+++ b/test/DebugInfo/SystemZ/variable-loc.ll
@@ -14,11 +14,11 @@
; CHECK: brasl %r14, populate_array@PLT
; DEBUG: DW_TAG_variable
-; DEBUG-NOT: DW_TAG
-; DEBUG: DW_AT_name {{.*}} "main_arr"
; Rather hard-coded, but 0x91 => DW_OP_fbreg and 0xa401 is SLEB128 encoded 164.
; DEBUG-NOT: DW_TAG
; DEBUG: DW_AT_location {{.*}}(<0x3> 91 a4 01 )
+; DEBUG-NOT: DW_TAG
+; DEBUG: DW_AT_name {{.*}} "main_arr"
@.str = private unnamed_addr constant [13 x i8] c"Total is %d\0A\00", align 2
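
The variable-loc.ll change above is the template for most of the DebugInfo
test churn that follows: variable and parameter DIEs now emit DW_AT_location
ahead of DW_AT_name, so checks that pinned the old attribute order with
CHECK-NEXT are relaxed to order-tolerant CHECK lines fenced by
CHECK-NOT: DW_TAG, which only assert that both attributes land on the same
DIE. The idiom, sketched:

    ; CHECK: DW_TAG_variable
    ; CHECK-NOT: DW_TAG
    ; CHECK: DW_AT_location
    ; CHECK-NOT: DW_TAG
    ; CHECK: DW_AT_name {{.*}} "main_arr"
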
diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
index 1bbfbf4..4dc747f 100644
--- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
+++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
@@ -37,13 +37,19 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
!20 = metadata !{metadata !"test.c", metadata !"/work/llvm/vanilla/test/DebugInfo"}
; CHECK: DW_TAG_variable
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "GLB")
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "GLB")
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
; CHECK: DW_TAG_variable
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "LOC")
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "LOC")
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x04)
!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_location-reference.ll b/test/DebugInfo/X86/DW_AT_location-reference.ll
index 6c5e32c0..f31b0ad 100644
--- a/test/DebugInfo/X86/DW_AT_location-reference.ll
+++ b/test/DebugInfo/X86/DW_AT_location-reference.ll
@@ -31,11 +31,11 @@
; // The 'x' variable and its symbol reference location
; CHECK: .debug_info contents:
; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x00000000)
; CHECK-NEXT: DW_AT_name {{.*}} "x"
; CHECK-NEXT: DW_AT_decl_file
; CHECK-NEXT: DW_AT_decl_line
; CHECK-NEXT: DW_AT_type
-; CHECK-NEXT: DW_AT_location [DW_FORM_sec_offset] (0x00000000)
; Check that the location contains only 4 ranges - this verifies that the 4th
; and 5th ranges were successfully merged into a single range.
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
index 5fa9699..4b9fae8 100644
--- a/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -7,7 +7,8 @@
; CHECK: DW_TAG_class_type
; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
; CHECK: [[PARAM]]: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "this")
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "this")
%class.A = type { i32 }
diff --git a/test/DebugInfo/X86/DW_AT_specification.ll b/test/DebugInfo/X86/DW_AT_specification.ll
index b93cdf0..4f45f36 100644
--- a/test/DebugInfo/X86/DW_AT_specification.ll
+++ b/test/DebugInfo/X86/DW_AT_specification.ll
@@ -6,7 +6,8 @@
; CHECK: [[BAR_DECL:0x[0-9a-f]*]]: DW_TAG_subprogram
; CHECK-NEXT: DW_AT_MIPS_linkage_name {{.*}} "_ZN3foo3barEv"
; CHECK: DW_TAG_subprogram
-; CHECK-NEXT: DW_AT_specification {{.*}} {[[BAR_DECL]]}
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_specification {{.*}} {[[BAR_DECL]]}
@_ZZN3foo3barEvE1x = constant i32 0, align 4
@@ -36,6 +37,6 @@ entry:
!21 = metadata !{i32 720934, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_const_type ]
!22 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!25 = metadata !{i32 6, i32 1, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !5, i32 4, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 786443, metadata !6, metadata !5, i32 4, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
!27 = metadata !{metadata !"nsNativeAppSupportBase.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library"}
!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
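
The !26 update above recurs throughout this patch (dbg-value-isel.ll,
block-capture.ll): DW_TAG_lexical_block descriptors move the file reference in
front of the scope, turning {tag, scope, line, col, file, id} into
{tag, file, scope, line, col, id} with the operand values otherwise unchanged.
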
diff --git a/test/DebugInfo/X86/arguments.ll b/test/DebugInfo/X86/arguments.ll
index 3597b2c..989e4ff 100644
--- a/test/DebugInfo/X86/arguments.ll
+++ b/test/DebugInfo/X86/arguments.ll
@@ -19,10 +19,12 @@
; CHECK: DW_AT_MIPS_linkage_name{{.*}}"_Z4func3fooS_"
; CHECK-NOT: NULL
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name{{.*}}"f"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"f"
; CHECK-NOT: NULL
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name{{.*}}"g"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"g"
%struct.foo = type { i32 }
diff --git a/test/DebugInfo/X86/block-capture.ll b/test/DebugInfo/X86/block-capture.ll
index 31b4fa9..e842afe 100644
--- a/test/DebugInfo/X86/block-capture.ll
+++ b/test/DebugInfo/X86/block-capture.ll
@@ -6,13 +6,13 @@
; Checks that we emit debug info for the block variable declare.
; CHECK: DW_TAG_subprogram
; CHECK: DW_TAG_variable
-; CHECK: DW_AT_name {{.*}} "block"
; CHECK: DW_AT_location [DW_FORM_sec_offset]
+; CHECK: DW_AT_name {{.*}} "block"
; DWARF3: DW_TAG_subprogram
; DWARF3: DW_TAG_variable
-; DWARF3: DW_AT_name {{.*}} "block"
; DWARF3: DW_AT_location [DW_FORM_data4]
+; DWARF3: DW_AT_name {{.*}} "block"
%struct.__block_descriptor = type { i64, i64 }
%struct.__block_literal_generic = type { i8*, i32, i32, i8*, %struct.__block_descriptor* }
@@ -118,15 +118,16 @@ declare i32 @__objc_personality_v0(...)
!50 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"block", i32 7, i64 64, i64 64, i64 256, i32 0, metadata !9} ; [ DW_TAG_member ]
!51 = metadata !{i32 7, i32 18, metadata !28, null}
!52 = metadata !{i32 7, i32 19, metadata !28, null}
-!53 = metadata !{i32 786688, metadata !28, metadata !"block", metadata !6, i32 5, metadata !9, i32 0, i32 0, i64 1, i64 32} ; [ DW_TAG_auto_variable ]
+!53 = metadata !{i32 786688, metadata !28, metadata !"block", metadata !6, i32 5, metadata !9, i32 0, i32 0, metadata !65} ; [ DW_TAG_auto_variable ]
!54 = metadata !{i32 5, i32 27, metadata !28, null}
!55 = metadata !{i32 8, i32 22, metadata !56, null}
-!56 = metadata !{i32 786443, metadata !57, i32 7, i32 26, metadata !6, i32 2} ; [ DW_TAG_lexical_block ]
-!57 = metadata !{i32 786443, metadata !28, i32 7, i32 19, metadata !6, i32 1} ; [ DW_TAG_lexical_block ]
+!56 = metadata !{i32 786443, metadata !6, metadata !57, i32 7, i32 26, i32 2} ; [ DW_TAG_lexical_block ]
+!57 = metadata !{i32 786443, metadata !6, metadata !28, i32 7, i32 19, i32 1} ; [ DW_TAG_lexical_block ]
!58 = metadata !{i32 10, i32 20, metadata !59, null}
-!59 = metadata !{i32 786443, metadata !60, i32 9, i32 35, metadata !6, i32 4} ; [ DW_TAG_lexical_block ]
-!60 = metadata !{i32 786443, metadata !57, i32 9, i32 35, metadata !6, i32 3} ; [ DW_TAG_lexical_block ]
+!59 = metadata !{i32 786443, metadata !6, metadata !60, i32 9, i32 35, i32 4} ; [ DW_TAG_lexical_block ]
+!60 = metadata !{i32 786443, metadata !6, metadata !57, i32 9, i32 35, i32 3} ; [ DW_TAG_lexical_block ]
!61 = metadata !{i32 10, i32 21, metadata !28, null}
!62 = metadata !{i32 9, i32 20, metadata !56, null}
!63 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
!64 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!65 = metadata !{i64 1, i64 32}
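
block-capture.ll, dbg_value_direct.ll, debug-info-block-captured-self.ll and
debug-info-blocks.ll all receive the same mechanical change: complex-address
elements that used to trail the variable descriptor inline (e.g. i64 1, i64 32)
move, unchanged, into a dedicated metadata node (!65 here) that the variable
references. Judging by the DW_AT_location bytes checked elsewhere in this
patch (91 .. 06 23 ..), element 2 appears to encode a deref and the pair
1, N an add of N, so !{i64 2, i64 1, i64 32} would read "deref, then add 32".
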
diff --git a/test/DebugInfo/X86/byvalstruct.ll b/test/DebugInfo/X86/byvalstruct.ll
index 731f8db..d787ef3 100644
--- a/test/DebugInfo/X86/byvalstruct.ll
+++ b/test/DebugInfo/X86/byvalstruct.ll
@@ -6,7 +6,8 @@
; CHECK: DW_TAG_formal_parameter
; CHECK: DW_TAG_formal_parameter
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}} "info"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "info"
;
; generated from
;
diff --git a/test/DebugInfo/X86/coff_debug_info_type.ll b/test/DebugInfo/X86/coff_debug_info_type.ll
index a0b8ccc..a1051c3 100644
--- a/test/DebugInfo/X86/coff_debug_info_type.ll
+++ b/test/DebugInfo/X86/coff_debug_info_type.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=i686-pc-win32 -filetype=asm -O0 < %s | FileCheck -check-prefix=WIN32 %s
; WIN32: .section .debug$S,"rnd"
+; RUN: llc -mtriple=i686-pc-win32 -filetype=null -O0 < %s
+
; generated from:
; clang -g -S -emit-llvm test.c -o test.ll
; int main()
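
The added RUN line is a smoke test: -filetype=null runs the full code
generator without writing any output, so it catches crashes or asserts in the
win32 COFF debug-info path without needing a FileCheck pattern.
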
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index 40300de..ac038f3 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -79,7 +79,7 @@ declare void @_Z8moz_freePv(i8*)
!0 = metadata !{i32 786449, metadata !59, i32 4, metadata !"clang version 3.1 ()", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !47, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
!1 = metadata !{}
!3 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31}
-!5 = metadata !{i32 720942, metadata !6, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !12, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
+!5 = metadata !{i32 720942, metadata !6, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_ZN17nsAutoRefCnt7ReleaseEv, null, metadata !12, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
!6 = metadata !{i32 720937, metadata !59} ; [ DW_TAG_file_type ]
!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!8 = metadata !{metadata !9, metadata !10}
@@ -95,7 +95,7 @@ declare void @_Z8moz_freePv(i8*)
!18 = metadata !{}
!20 = metadata !{metadata !22}
!22 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777230, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !15, metadata !24, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
+!23 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_ZN17nsAutoRefCntD1Ev, null, metadata !15, metadata !24, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
!24 = metadata !{metadata !26}
!26 = metadata !{i32 786689, metadata !23, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
!27 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD2Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !15, metadata !28, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
diff --git a/test/DebugInfo/X86/dbg-const-int.ll b/test/DebugInfo/X86/dbg-const-int.ll
index f2f51c9..bf7ee08 100644
--- a/test/DebugInfo/X86/dbg-const-int.ll
+++ b/test/DebugInfo/X86/dbg-const-int.ll
@@ -1,12 +1,14 @@
-; RUN: llc -mtriple=x86_64-apple-darwin12 -filetype=obj %s -o %t
-; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin12 -filetype=obj < %s \
+; RUN: | llvm-dwarfdump -debug-dump=info - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-macosx10.6.7"
; Radar 9511391
; CHECK: DW_TAG_variable
-; CHECK: "i"
-; CHECK: DW_AT_const_value [DW_FORM_sdata] (42)
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_const_value [DW_FORM_sdata] (42)
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "i"
define i32 @foo() nounwind uwtable readnone optsize ssp {
entry:
diff --git a/test/DebugInfo/X86/dbg-value-const-byref.ll b/test/DebugInfo/X86/dbg-value-const-byref.ll
index baba0cd..23fa352 100644
--- a/test/DebugInfo/X86/dbg-value-const-byref.ll
+++ b/test/DebugInfo/X86/dbg-value-const-byref.ll
@@ -20,9 +20,10 @@
;
; CHECK: .debug_info contents:
; CHECK: DW_TAG_variable
-; CHECK-NEXT: DW_AT_name{{.*}}"i"
; CHECK-NOT: DW_TAG
; CHECK: DW_AT_location [DW_FORM_data4] ([[LOC:.*]])
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"i"
; CHECK: .debug_loc contents:
; CHECK: [[LOC]]:
; consts 0x00000003
diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index 1922272..4d18f7d 100644
--- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
@@ -7,19 +7,22 @@
; CHECK: DW_TAG_subprogram
; CHECK: DW_AT_abstract_origin {{.*}}{[[ABS:.*]]}
-; FIXME: An out of line definition preceeding an inline usage doesn't properly
-; reference abstract variables.
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}} "sp"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}}{[[ABS_SP:.*]]}
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}} "nums"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}}{[[ABS_NUMS:.*]]}
; CHECK: [[ABS]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_name {{.*}} "foo"
-; CHECK: [[ABS_SP:.*]]: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}} "sp"
-; CHECK: [[ABS_NUMS:.*]]: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}} "nums"
+; CHECK: [[ABS_SP]]: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "sp"
+; CHECK: [[ABS_NUMS]]: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "nums"
;CHECK: DW_TAG_inlined_subroutine
;CHECK-NEXT: DW_AT_abstract_origin {{.*}}{[[ABS]]}
@@ -30,9 +33,10 @@
;CHECK: DW_TAG_formal_parameter
;FIXME: Linux shouldn't drop this parameter either...
-;DARWIN-NEXT: DW_AT_abstract_origin {{.*}}{[[ABS_SP]]}
+;CHECK-NOT: DW_TAG
+;DARWIN: DW_AT_abstract_origin {{.*}}{[[ABS_SP]]}
;DARWIN: DW_TAG_formal_parameter
-;CHECK-NEXT: DW_AT_abstract_origin {{.*}}{[[ABS_NUMS]]}
+;CHECK: DW_AT_abstract_origin {{.*}}{[[ABS_NUMS]]}
;CHECK-NOT: DW_TAG_formal_parameter
%struct.S1 = type { float*, i32 }
diff --git a/test/DebugInfo/X86/dbg-value-isel.ll b/test/DebugInfo/X86/dbg-value-isel.ll
index f899f48..155f76f 100644
--- a/test/DebugInfo/X86/dbg-value-isel.ll
+++ b/test/DebugInfo/X86/dbg-value-isel.ll
@@ -92,7 +92,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!8 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
!9 = metadata !{i32 1, i32 32, metadata !0, null}
!10 = metadata !{i32 786688, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 786443, metadata !1, metadata !0, i32 2, i32 1, i32 1} ; [ DW_TAG_lexical_block ]
!12 = metadata !{i32 5, i32 24, metadata !11, null}
!13 = metadata !{i32 786688, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
!14 = metadata !{i32 6, i32 25, metadata !11, null}
diff --git a/test/DebugInfo/X86/dbg-value-location.ll b/test/DebugInfo/X86/dbg-value-location.ll
index 9184217..55d1ae6 100644
--- a/test/DebugInfo/X86/dbg-value-location.ll
+++ b/test/DebugInfo/X86/dbg-value-location.ll
@@ -6,11 +6,11 @@ target triple = "x86_64-apple-darwin10.0.0"
; rdar://8950491
;CHECK: DW_TAG_formal_parameter
+;CHECK-NEXT: DW_AT_location
;CHECK-NEXT: DW_AT_name {{.*}} "var"
;CHECK-NEXT: DW_AT_decl_file
;CHECK-NEXT: DW_AT_decl_line
;CHECK-NEXT: DW_AT_type
-;CHECK-NEXT: DW_AT_location
@dfm = external global i32, align 4
diff --git a/test/DebugInfo/X86/dbg-value-terminator.ll b/test/DebugInfo/X86/dbg-value-terminator.ll
index f08f281..974e0ad 100644
--- a/test/DebugInfo/X86/dbg-value-terminator.ll
+++ b/test/DebugInfo/X86/dbg-value-terminator.ll
@@ -11,84 +11,84 @@
define hidden fastcc %a* @test() #1 {
entry:
- %0 = icmp eq %a* undef, null, !dbg !1
- br i1 %0, label %"14", label %return, !dbg !1
+ %0 = icmp eq %a* undef, null, !dbg !12
+ br i1 %0, label %"14", label %return, !dbg !12
"14": ; preds = %"8"
- br i1 undef, label %"25", label %"21", !dbg !1
+ br i1 undef, label %"25", label %"21", !dbg !12
"21": ; preds = %"14"
- br i1 undef, label %may_unswitch_on.exit, label %"6.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"6.i", !dbg !12
"6.i": ; preds = %"21"
- br i1 undef, label %"10.i", label %may_unswitch_on.exit, !dbg !1
+ br i1 undef, label %"10.i", label %may_unswitch_on.exit, !dbg !12
"10.i": ; preds = %"6.i"
- br i1 undef, label %may_unswitch_on.exit, label %"12.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"12.i", !dbg !12
"12.i": ; preds = %"10.i"
- br i1 undef, label %"4.i.i", label %"3.i.i", !dbg !1
+ br i1 undef, label %"4.i.i", label %"3.i.i", !dbg !12
"3.i.i": ; preds = %"12.i"
- br i1 undef, label %"4.i.i", label %VEC_edge_base_index.exit.i, !dbg !1
+ br i1 undef, label %"4.i.i", label %VEC_edge_base_index.exit.i, !dbg !12
"4.i.i": ; preds = %"3.i.i", %"12.i"
- unreachable, !dbg !1
+ unreachable, !dbg !12
VEC_edge_base_index.exit.i: ; preds = %"3.i.i"
- br i1 undef, label %may_unswitch_on.exit, label %"16.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"16.i", !dbg !12
"16.i": ; preds = %VEC_edge_base_index.exit.i
- br i1 undef, label %"4.i6.i", label %"3.i5.i", !dbg !1
+ br i1 undef, label %"4.i6.i", label %"3.i5.i", !dbg !12
"3.i5.i": ; preds = %"16.i"
- br i1 undef, label %VEC_edge_base_index.exit7.i, label %"4.i6.i", !dbg !1
+ br i1 undef, label %VEC_edge_base_index.exit7.i, label %"4.i6.i", !dbg !12
"4.i6.i": ; preds = %"3.i5.i", %"16.i"
- unreachable, !dbg !1
+ unreachable, !dbg !12
VEC_edge_base_index.exit7.i: ; preds = %"3.i5.i"
- br i1 undef, label %may_unswitch_on.exit, label %"21.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"21.i", !dbg !12
"21.i": ; preds = %VEC_edge_base_index.exit7.i
- br i1 undef, label %may_unswitch_on.exit, label %"23.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"23.i", !dbg !12
"23.i": ; preds = %"21.i"
- br i1 undef, label %may_unswitch_on.exit, label %"26.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"26.i", !dbg !12
"26.i": ; preds = %"34.i", %"23.i"
- %1 = icmp eq i32 undef, 9, !dbg !1
- br i1 %1, label %"34.i", label %"28.i", !dbg !1
+ %1 = icmp eq i32 undef, 9, !dbg !12
+ br i1 %1, label %"34.i", label %"28.i", !dbg !12
"28.i": ; preds = %"26.i"
unreachable
"34.i": ; preds = %"26.i"
- br i1 undef, label %"26.i", label %"36.i", !dbg !1
+ br i1 undef, label %"26.i", label %"36.i", !dbg !12
"36.i": ; preds = %"34.i"
- br i1 undef, label %"37.i", label %"38.i", !dbg !1
+ br i1 undef, label %"37.i", label %"38.i", !dbg !12
"37.i": ; preds = %"36.i"
- br label %"38.i", !dbg !1
+ br label %"38.i", !dbg !12
"38.i": ; preds = %"37.i", %"36.i"
- br i1 undef, label %"39.i", label %"45.i", !dbg !1
+ br i1 undef, label %"39.i", label %"45.i", !dbg !12
"39.i": ; preds = %"38.i"
- br i1 undef, label %"41.i", label %may_unswitch_on.exit, !dbg !1
+ br i1 undef, label %"41.i", label %may_unswitch_on.exit, !dbg !12
"41.i": ; preds = %"39.i"
- br i1 undef, label %may_unswitch_on.exit, label %"42.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"42.i", !dbg !12
"42.i": ; preds = %"41.i"
- br i1 undef, label %may_unswitch_on.exit, label %"44.i", !dbg !1
+ br i1 undef, label %may_unswitch_on.exit, label %"44.i", !dbg !12
"44.i": ; preds = %"42.i"
- %2 = load %a** undef, align 8, !dbg !1
- %3 = bitcast %a* %2 to %a*, !dbg !1
+ %2 = load %a** undef, align 8, !dbg !12
+ %3 = bitcast %a* %2 to %a*, !dbg !12
call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6), !dbg !12
- br label %may_unswitch_on.exit, !dbg !1
+ br label %may_unswitch_on.exit, !dbg !12
"45.i": ; preds = %"38.i"
unreachable
@@ -102,7 +102,7 @@ may_unswitch_on.exit: ; preds = %"44.i", %"42.i", %"
"return":
%result = phi %a* [ null, %entry ], [ %4, %may_unswitch_on.exit ]
- ret %a* %result, !dbg !1
+ ret %a* %result, !dbg !12
}
attributes #0 = { nounwind readnone }
diff --git a/test/DebugInfo/X86/dbg_value_direct.ll b/test/DebugInfo/X86/dbg_value_direct.ll
index 28b7dc6..db947ac 100644
--- a/test/DebugInfo/X86/dbg_value_direct.ll
+++ b/test/DebugInfo/X86/dbg_value_direct.ll
@@ -170,8 +170,9 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
!20 = metadata !{i32 786468}
!21 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!23 = metadata !{i32 786689, metadata !4, metadata !"", metadata !5, i32 16777222, metadata !21, i32 0, i32 0, i64 2} ; [ DW_TAG_arg_variable ] [line 6]
+!23 = metadata !{i32 786689, metadata !4, metadata !"", metadata !5, i32 16777222, metadata !21, i32 0, i32 0, metadata !28} ; [ DW_TAG_arg_variable ] [line 6]
!24 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 7, metadata !8, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 7]
!25 = metadata !{i32 7, i32 0, metadata !4, null}
!26 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i64 2}
diff --git a/test/DebugInfo/X86/debug-info-block-captured-self.ll b/test/DebugInfo/X86/debug-info-block-captured-self.ll
index 87e8f03..95eda60 100644
--- a/test/DebugInfo/X86/debug-info-block-captured-self.ll
+++ b/test/DebugInfo/X86/debug-info-block-captured-self.ll
@@ -7,17 +7,19 @@
; This test is split into two parts, the frontend part can be found at
; llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m
;
-; CHECK: {{.*}}DW_AT_name{{.*}}_block_invoke{{.*}}
-; CHECK: DW_TAG_variable
-; CHECK: {{.*}}DW_AT_name{{.*}}"self"{{.*}}
+; CHECK: {{.*}}DW_AT_name{{.*}}_block_invoke{{.*}}
+; CHECK: DW_TAG_variable
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_location
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"self"{{.*}}
;
-; CHECK: {{.*}}DW_AT_name{{.*}}_block_invoke{{.*}}
-; CHECK: DW_TAG_variable
-; CHECK: {{.*}}DW_AT_name{{.*}}"self"{{.*}}
+; CHECK: {{.*}}DW_AT_name{{.*}}_block_invoke{{.*}}
+; CHECK: DW_TAG_variable
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_location
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"self"{{.*}}
;
; Generated (and then reduced) from
; ----------------------------------------------------------------------
@@ -99,10 +101,12 @@ define internal void @"__24-[Main initWithContext:]_block_invoke_2"(i8* %.block_
!41 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
!42 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"", i32 35, metadata !39, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, metadata !15, i32 35} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
!84 = metadata !{i32 33, i32 0, metadata !38, null}
-!86 = metadata !{i32 786688, metadata !38, metadata !"self", metadata !1, i32 41, metadata !34, i32 0, i32 0, i64 1, i64 32} ; [ DW_TAG_auto_variable ] [self] [line 41]
+!86 = metadata !{i32 786688, metadata !38, metadata !"self", metadata !1, i32 41, metadata !34, i32 0, i32 0, metadata !110} ; [ DW_TAG_auto_variable ] [self] [line 41]
!87 = metadata !{i32 41, i32 0, metadata !38, null}
!103 = metadata !{i32 35, i32 0, metadata !42, null}
-!105 = metadata !{i32 786688, metadata !42, metadata !"self", metadata !1, i32 40, metadata !34, i32 0, i32 0, i64 1, i64 32} ; [ DW_TAG_auto_variable ] [self] [line 40]
+!105 = metadata !{i32 786688, metadata !42, metadata !"self", metadata !1, i32 40, metadata !34, i32 0, i32 0, metadata !109} ; [ DW_TAG_auto_variable ] [self] [line 40]
!106 = metadata !{i32 40, i32 0, metadata !42, null}
!107 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m", metadata !""}
!108 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!109 = metadata !{i64 1, i64 32}
+!110 = metadata !{i64 1, i64 32}
diff --git a/test/DebugInfo/X86/debug-info-blocks.ll b/test/DebugInfo/X86/debug-info-blocks.ll
index 430c157..8a1a125 100644
--- a/test/DebugInfo/X86/debug-info-blocks.ll
+++ b/test/DebugInfo/X86/debug-info-blocks.ll
@@ -20,22 +20,23 @@
; CHECK-NOT: {{DW_TAG|NULL}}
; CHECK: DW_TAG_formal_parameter
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_name{{.*}}.block_descriptor
-; CHECK-NOT: DW_TAG
; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}.block_descriptor
; CHECK-NOT: {{DW_TAG|NULL}}
; CHECK: DW_TAG_variable
-; CHECK-NEXT: DW_AT_name{{.*}}"self"
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_type{{.*}}{[[APTR:.*]]}
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_artificial
; CHECK-NOT: DW_TAG
; 0x06 = DW_OP_deref
; 0x23 = DW_OP_uconst
; 0x91 = DW_OP_fbreg
; CHECK: DW_AT_location{{.*}}91 {{[0-9]+}} 06 23 {{[0-9]+}} )
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"self"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_type{{.*}}{[[APTR:.*]]}
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_artificial
; CHECK: [[APTR]]: DW_TAG_pointer_type
; CHECK-NEXT: {[[A]]}
@@ -358,7 +359,7 @@ attributes #3 = { nounwind }
!86 = metadata !{i32 786451, metadata !1, null, metadata !"__block_descriptor_withcopydispose", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
!87 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"self", i32 49, i64 64, i64 64, i64 256, i32 0, metadata !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
!88 = metadata !{i32 49, i32 0, metadata !27, null}
-!89 = metadata !{i32 786688, metadata !27, metadata !"self", metadata !32, i32 52, metadata !23, i32 0, i32 0, i64 2, i64 1, i64 32} ; [ DW_TAG_auto_variable ] [self] [line 52]
+!89 = metadata !{i32 786688, metadata !27, metadata !"self", metadata !32, i32 52, metadata !23, i32 0, i32 0, metadata !111} ; [ DW_TAG_auto_variable ] [self] [line 52]
!90 = metadata !{i32 52, i32 0, metadata !27, null}
!91 = metadata !{i32 786688, metadata !92, metadata !"d", metadata !6, i32 50, metadata !93, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 50]
!92 = metadata !{i32 786443, metadata !5, metadata !27, i32 49, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
@@ -380,3 +381,4 @@ attributes #3 = { nounwind }
!108 = metadata !{i32 61, i32 0, metadata !36, null}
!109 = metadata !{i32 62, i32 0, metadata !36, null}
!110 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!111 = metadata !{i64 2, i64 1, i64 32}
diff --git a/test/DebugInfo/X86/debug-loc-asan.ll b/test/DebugInfo/X86/debug-loc-asan.ll
new file mode 100644
index 0000000..b1980ec
--- /dev/null
+++ b/test/DebugInfo/X86/debug-loc-asan.ll
@@ -0,0 +1,186 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; Verify that we have correct debug info for local variables in code
+; instrumented with AddressSanitizer.
+
+; Generated from the source file test.cc:
+; int bar(int y) {
+; return y + 2;
+; }
+; with "clang++ -S -emit-llvm -fsanitize=address -O0 -g test.cc"
+
+; First, argument variable "y" resides in %rdi:
+; CHECK: DEBUG_VALUE: bar:y <- RDI
+
+; Then its address is stored in a location on the stack:
+; CHECK: movq %rdi, [[OFFSET:[0-9]+]](%rsp)
+; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
+; CHECK-NEXT: DEBUG_VALUE: bar:y <- [RSP+[[OFFSET]]]
+; This location should be valid until the end of the function.
+
+; CHECK: .Ldebug_loc{{[0-9]+}}:
+; We expect two location ranges for the variable.
+
+; First, it is stored in %rdi:
+; CHECK: .Lset{{[0-9]+}} = .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK-NEXT: .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK: DW_OP_reg5
+
+; Then it's addressed via %rsp:
+; CHECK: .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK-NEXT: .Lset{{[0-9]+}} = .Lfunc_end0-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK: DW_OP_breg7
+; CHECK-NEXT: [[OFFSET]]
+; CHECK: DW_OP_deref
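+; (x86-64 DWARF numbering: DW_OP_reg5 is %rdi and DW_OP_breg7 is %rsp-based;
+; the breg7/deref combination recovers the variable's address from the spill
+; slot described above.)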
+
+; ModuleID = 'test.cc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 1, void ()* @asan.module_ctor }]
+@__asan_option_detect_stack_use_after_return = external global i32
+@__asan_gen_ = private unnamed_addr constant [16 x i8] c"1 32 4 6 y.addr\00", align 1
+
+; Function Attrs: nounwind sanitize_address uwtable
+define i32 @_Z3bari(i32 %y) #0 {
+entry:
+ %MyAlloca = alloca [64 x i8], align 32
+ %0 = ptrtoint [64 x i8]* %MyAlloca to i64
+ %1 = load i32* @__asan_option_detect_stack_use_after_return
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %5
+
+; <label>:3 ; preds = %entry
+ %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0)
+ br label %5
+
+; <label>:5 ; preds = %entry, %3
+ %6 = phi i64 [ %0, %entry ], [ %4, %3 ]
+ %7 = add i64 %6, 32
+ %8 = inttoptr i64 %7 to i32*
+ %9 = inttoptr i64 %6 to i64*
+ store i64 1102416563, i64* %9
+ %10 = add i64 %6, 8
+ %11 = inttoptr i64 %10 to i64*
+ store i64 ptrtoint ([16 x i8]* @__asan_gen_ to i64), i64* %11
+ %12 = add i64 %6, 16
+ %13 = inttoptr i64 %12 to i64*
+ store i64 ptrtoint (i32 (i32)* @_Z3bari to i64), i64* %13
+ %14 = lshr i64 %6, 3
+ %15 = add i64 %14, 2147450880
+ %16 = add i64 %15, 0
+ %17 = inttoptr i64 %16 to i64*
+ store i64 -868083100587789839, i64* %17
+ %18 = ptrtoint i32* %8 to i64
+ %19 = lshr i64 %18, 3
+ %20 = add i64 %19, 2147450880
+ %21 = inttoptr i64 %20 to i8*
+ %22 = load i8* %21
+ %23 = icmp ne i8 %22, 0
+ call void @llvm.dbg.declare(metadata !{i32* %8}, metadata !12)
+ br i1 %23, label %24, label %30
+
+; <label>:24 ; preds = %5
+ %25 = and i64 %18, 7
+ %26 = add i64 %25, 3
+ %27 = trunc i64 %26 to i8
+ %28 = icmp sge i8 %27, %22
+ br i1 %28, label %29, label %30
+
+; <label>:29 ; preds = %24
+ call void @__asan_report_store4(i64 %18)
+ call void asm sideeffect "", ""()
+ unreachable
+
+; <label>:30 ; preds = %24, %5
+ store i32 %y, i32* %8, align 4
+ %31 = ptrtoint i32* %8 to i64, !dbg !13
+ %32 = lshr i64 %31, 3, !dbg !13
+ %33 = add i64 %32, 2147450880, !dbg !13
+ %34 = inttoptr i64 %33 to i8*, !dbg !13
+ %35 = load i8* %34, !dbg !13
+ %36 = icmp ne i8 %35, 0, !dbg !13
+ br i1 %36, label %37, label %43, !dbg !13
+
+; <label>:37 ; preds = %30
+ %38 = and i64 %31, 7, !dbg !13
+ %39 = add i64 %38, 3, !dbg !13
+ %40 = trunc i64 %39 to i8, !dbg !13
+ %41 = icmp sge i8 %40, %35, !dbg !13
+ br i1 %41, label %42, label %43
+
+; <label>:42 ; preds = %37
+ call void @__asan_report_load4(i64 %31), !dbg !13
+ call void asm sideeffect "", ""()
+ unreachable
+
+; <label>:43 ; preds = %37, %30
+ %44 = load i32* %8, align 4, !dbg !13
+ %add = add nsw i32 %44, 2, !dbg !13
+ store i64 1172321806, i64* %9, !dbg !13
+ %45 = icmp ne i64 %6, %0, !dbg !13
+ br i1 %45, label %46, label %53, !dbg !13
+
+; <label>:46 ; preds = %43
+ %47 = add i64 %15, 0, !dbg !13
+ %48 = inttoptr i64 %47 to i64*, !dbg !13
+ store i64 -723401728380766731, i64* %48, !dbg !13
+ %49 = add i64 %6, 56, !dbg !13
+ %50 = inttoptr i64 %49 to i64*, !dbg !13
+ %51 = load i64* %50, !dbg !13
+ %52 = inttoptr i64 %51 to i8*, !dbg !13
+ store i8 0, i8* %52, !dbg !13
+ br label %56, !dbg !13
+
+; <label>:53 ; preds = %43
+ %54 = add i64 %15, 0, !dbg !13
+ %55 = inttoptr i64 %54 to i64*, !dbg !13
+ store i64 0, i64* %55, !dbg !13
+ br label %56, !dbg !13
+
+; <label>:56 ; preds = %53, %46
+ ret i32 %add, !dbg !13
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+define internal void @asan.module_ctor() {
+ call void @__asan_init_v3()
+ ret void
+}
+
+declare void @__asan_init_v3()
+
+declare void @__asan_report_load4(i64)
+
+declare void @__asan_report_store4(i64)
+
+declare i64 @__asan_stack_malloc_0(i64, i64)
+
+attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (209308)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/test.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"test.cc", metadata !"/llvm_cmake_gcc"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3bari", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3bari, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/test.cc]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5.0 (209308)"}
+!12 = metadata !{i32 786689, metadata !4, metadata !"y", metadata !5, i32 16777217, metadata !8, i32 0, i32 0, metadata !14} ; [ DW_TAG_arg_variable ] [y] [line 1]
+!13 = metadata !{i32 2, i32 0, metadata !4, null}
+!14 = metadata !{i64 2}
diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
index 3f4d39d..7866d0e 100644
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ b/test/DebugInfo/X86/debug-loc-offset.ll
@@ -3,20 +3,23 @@
; From the code:
-; bar.cpp
+; debug-loc-offset1.cc
; int bar (int b) {
; return b+4;
; }
-; foo.cpp
+; debug-loc-offset2.cc
; struct A {
-; int a;
-; int b;
-; int c;
+; int var;
+; virtual char foo();
; };
-; int a (struct A var) {
-; return var.a;
+; void baz(struct A a) {
+; int z = 2;
+; if (a.var > 2)
+; z++;
+; if (a.foo() == 'a')
+; z++;
; }
; Compiled separately for i386-pc-linux-gnu and linked together.
@@ -38,78 +41,113 @@
; CHECK: DW_TAG_subprogram
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_MIPS_linkage_name [DW_FORM_strp]{{.*}}"_Z1a1A"
+; CHECK: DW_AT_MIPS_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
; CHECK-NOT: {{DW_TAG|NULL}}
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]{{.*}}"var"
+; CHECK-NOT: DW_TAG
; CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000)
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name [DW_FORM_strp]{{.*}}"a"
+
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_location [DW_FORM_exprloc]
; CHECK-NOT: DW_AT_location
; CHECK: .debug_loc contents:
; CHECK: 0x00000000: Beginning address offset: 0x0000000000000000
-; CHECK: Ending address offset: 0x0000000000000009
+; CHECK: Ending address offset: 0x000000000000001a
-
-%struct.A = type { i32, i32, i32 }
+%struct.A = type { i32 (...)**, i32 }
; Function Attrs: nounwind
define i32 @_Z3bari(i32 %b) #0 {
entry:
%b.addr = alloca i32, align 4
store i32 %b, i32* %b.addr, align 4
- call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !25), !dbg !26
- %0 = load i32* %b.addr, align 4, !dbg !27
- %add = add nsw i32 %0, 4, !dbg !27
- ret i32 %add, !dbg !27
+ call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !21), !dbg !22
+ %0 = load i32* %b.addr, align 4, !dbg !23
+ %add = add nsw i32 %0, 4, !dbg !23
+ ret i32 %add, !dbg !23
}
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata) #1
-; Function Attrs: nounwind
-define i32 @_Z1a1A(%struct.A* byval align 4 %var) #0 {
+define void @_Z3baz1A(%struct.A* %a) #2 {
entry:
- call void @llvm.dbg.declare(metadata !{%struct.A* %var}, metadata !28), !dbg !29
- %a = getelementptr inbounds %struct.A* %var, i32 0, i32 0, !dbg !30
- %0 = load i32* %a, align 4, !dbg !30
- ret i32 %0, !dbg !30
+ %z = alloca i32, align 4
+ call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !24), !dbg !25
+ call void @llvm.dbg.declare(metadata !{i32* %z}, metadata !26), !dbg !27
+ store i32 2, i32* %z, align 4, !dbg !27
+ %var = getelementptr inbounds %struct.A* %a, i32 0, i32 1, !dbg !28
+ %0 = load i32* %var, align 4, !dbg !28
+ %cmp = icmp sgt i32 %0, 2, !dbg !28
+ br i1 %cmp, label %if.then, label %if.end, !dbg !28
+
+if.then: ; preds = %entry
+ %1 = load i32* %z, align 4, !dbg !30
+ %inc = add nsw i32 %1, 1, !dbg !30
+ store i32 %inc, i32* %z, align 4, !dbg !30
+ br label %if.end, !dbg !30
+
+if.end: ; preds = %if.then, %entry
+ %call = call signext i8 @_ZN1A3fooEv(%struct.A* %a), !dbg !31
+ %conv = sext i8 %call to i32, !dbg !31
+ %cmp1 = icmp eq i32 %conv, 97, !dbg !31
+ br i1 %cmp1, label %if.then2, label %if.end4, !dbg !31
+
+if.then2: ; preds = %if.end
+ %2 = load i32* %z, align 4, !dbg !33
+ %inc3 = add nsw i32 %2, 1, !dbg !33
+ store i32 %inc3, i32* %z, align 4, !dbg !33
+ br label %if.end4, !dbg !33
+
+if.end4: ; preds = %if.then2, %if.end
+ ret void, !dbg !34
}
+declare signext i8 @_ZN1A3fooEv(%struct.A*) #2
+
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.dbg.cu = !{!0, !9}
-!llvm.module.flags = !{!22, !23}
-!llvm.ident = !{!24, !24}
+!llvm.module.flags = !{!18, !19}
+!llvm.ident = !{!20, !20}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 204264) (llvm/trunk 204286)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1}
-!1 = metadata !{metadata !"bar.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (210479)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset1.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"debug-loc-offset1.cc", metadata !"/llvm_cmake_gcc"}
!2 = metadata !{}
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3bari", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3bari, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset1.cc]
!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{metadata !8, metadata !8}
!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.5.0 (trunk 204264) (llvm/trunk 204286)", i1 false, metadata !"", i32 0, metadata !2, metadata !11, metadata !17, metadata !2, metadata !2, metadata !"", i32 1}
-!10 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.5.0 (210479)", i1 false, metadata !"", i32 0, metadata !2, metadata !11, metadata !13, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset2.cc] [DW_LANG_C_plus_plus]
+!10 = metadata !{metadata !"debug-loc-offset2.cc", metadata !"/llvm_cmake_gcc"}
!11 = metadata !{metadata !12}
-!12 = metadata !{i32 786451, metadata !10, null, metadata !"A", i32 1, i64 96, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 96, align 32, offset 0] [def] [from ]
-!13 = metadata !{metadata !14, metadata !15, metadata !16}
-!14 = metadata !{i32 786445, metadata !10, metadata !"_ZTS1A", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!15 = metadata !{i32 786445, metadata !10, metadata !"_ZTS1A", metadata !"b", i32 3, i64 32, i64 32, i64 32, i32 0, metadata !8} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from int]
-!16 = metadata !{i32 786445, metadata !10, metadata !"_ZTS1A", metadata !"c", i32 4, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ] [c] [line 4, size 32, align 32, offset 64] [from int]
-!17 = metadata !{metadata !18}
-!18 = metadata !{i32 786478, metadata !10, metadata !19, metadata !"a", metadata !"a", metadata !"_Z1a1A", i32 7, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%struct.A*)* @_Z1a1A, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [a]
-!19 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cpp]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !8, metadata !12}
-!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!24 = metadata !{metadata !"clang version 3.5.0 (trunk 204264) (llvm/trunk 204286)"}
-!25 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
-!26 = metadata !{i32 1, i32 0, metadata !4, null}
-!27 = metadata !{i32 2, i32 0, metadata !4, null}
-!28 = metadata !{i32 786689, metadata !18, metadata !"var", metadata !19, i32 16777223, metadata !"_ZTS1A", i32 0, i32 0}
-!29 = metadata !{i32 7, i32 0, metadata !18, null}
-!30 = metadata !{i32 8, i32 0, metadata !18, null} ; [ DW_TAG_imported_declaration ]
+!12 = metadata !{i32 786451, metadata !10, null, metadata !"A", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!13 = metadata !{metadata !14}
+!14 = metadata !{i32 786478, metadata !10, metadata !15, metadata !"baz", metadata !"baz", metadata !"_Z3baz1A", i32 6, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*)* @_Z3baz1A, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [baz]
+!15 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{null, metadata !12}
+!18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{metadata !"clang version 3.5.0 (210479)"}
+!21 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!22 = metadata !{i32 1, i32 0, metadata !4, null}
+!23 = metadata !{i32 2, i32 0, metadata !4, null}
+!24 = metadata !{i32 786689, metadata !14, metadata !"a", metadata !15, i32 16777222, metadata !"_ZTS1A", i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 6]
+!25 = metadata !{i32 6, i32 0, metadata !14, null}
+!26 = metadata !{i32 786688, metadata !14, metadata !"z", metadata !15, i32 7, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [z] [line 7]
+!27 = metadata !{i32 7, i32 0, metadata !14, null}
+!28 = metadata !{i32 8, i32 0, metadata !29, null} ; [ DW_TAG_imported_declaration ]
+!29 = metadata !{i32 786443, metadata !10, metadata !14, i32 8, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!30 = metadata !{i32 9, i32 0, metadata !29, null}
+!31 = metadata !{i32 10, i32 0, metadata !32, null}
+!32 = metadata !{i32 786443, metadata !10, metadata !14, i32 10, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!33 = metadata !{i32 11, i32 0, metadata !32, null}
+!34 = metadata !{i32 12, i32 0, metadata !14, null}
diff --git a/test/DebugInfo/X86/dwarf-public-names.ll b/test/DebugInfo/X86/dwarf-public-names.ll
index d870ccb..793971a 100644
--- a/test/DebugInfo/X86/dwarf-public-names.ll
+++ b/test/DebugInfo/X86/dwarf-public-names.ll
@@ -43,12 +43,14 @@
; LINUX: debug_pubnames
; Check for each name in the output.
-; LINUX: global_namespace_variable
-; LINUX: global_namespace_function
-; LINUX: static_member_function
-; LINUX: global_variable
-; LINUX: global_function
-; LINUX: member_function
+; LINUX-DAG: "ns"
+; LINUX-DAG: "C::static_member_function"
+; LINUX-DAG: "global_variable"
+; LINUX-DAG: "ns::global_namespace_variable"
+; LINUX-DAG: "ns::global_namespace_function"
+; LINUX-DAG: "global_function"
+; LINUX-DAG: "C::static_member_variable"
+; LINUX-DAG: "C::member_function"
%struct.C = type { i8 }
@@ -112,7 +114,7 @@ attributes #1 = { nounwind readnone }
!18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
!19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
!20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{i32 786489, null, metadata !"ns", metadata !4, i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!21 = metadata !{i32 786489, metadata !4, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!23 = metadata !{null}
!24 = metadata !{metadata !25, metadata !26, metadata !27}
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
index 176c2af..36fd232 100644
--- a/test/DebugInfo/X86/elf-names.ll
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -96,14 +96,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19]
!36 = metadata !{i32 12, i32 0, metadata !5, null}
!37 = metadata !{i32 13, i32 0, metadata !38, null}
-!38 = metadata !{i32 786443, metadata !5, i32 12, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!38 = metadata !{i32 786443, metadata !6, metadata !5, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
!42 = metadata !{i32 14, i32 0, metadata !38, null}
!43 = metadata !{i32 15, i32 0, metadata !38, null}
!44 = metadata !{i32 16, i32 0, metadata !38, null}
!45 = metadata !{i32 17, i32 0, metadata !38, null}
!46 = metadata !{i32 19, i32 0, metadata !31, null}
!47 = metadata !{i32 20, i32 0, metadata !48, null}
-!48 = metadata !{i32 786443, metadata !31, i32 19, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!48 = metadata !{i32 786443, metadata !6, metadata !31, i32 19, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
!49 = metadata !{i32 21, i32 0, metadata !48, null}
!50 = metadata !{i32 22, i32 0, metadata !48, null}
!51 = metadata !{i32 23, i32 0, metadata !48, null}
diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll
index 974bd73..f5c37df 100644
--- a/test/DebugInfo/X86/empty-and-one-elem-array.ll
+++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll
@@ -28,6 +28,11 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
; An empty array should not have an AT_upper_bound attribute. But an array of 1
; should.
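+; (Concretely: with DWARF's zero-based C indexing, "int b[1]" gets a
+; DW_TAG_subrange_type with DW_AT_upper_bound 0, while the empty array's
+; subrange carries no upper bound at all.)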
+; CHECK: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "int")
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1] (0x05)
+; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1] (0x04)
+
; int foo::b[1]:
; CHECK: DW_TAG_structure_type
; CHECK: DW_AT_name{{.*}}"foo"
@@ -36,11 +41,6 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "b")
; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
-; CHECK: DW_TAG_base_type
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[{{.*}}] = "int")
-; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1] (0x05)
-; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1] (0x04)
-
; int[1]:
; CHECK: DW_TAG_array_type [{{.*}}] *
; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 057039c..1358375 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -44,6 +44,13 @@
; Make sure we don't produce any relocations in any .dwo section (though in particular, debug_info.dwo)
; HDR-NOT: .rela.{{.*}}.dwo
+; Make sure we have enough entries in debug_addr to cover the address indexes
+; (6 is the last index in debug_loc.dwo, making 7 entries of 8 bytes each, 7 * 8
+; == 56 base 10 == 38 base 16)
+
+; HDR: .debug_addr 00000038
+; HDR-NOT: .rela.{{.*}}.dwo
+
; From the code:
; extern int c;
diff --git a/test/DebugInfo/X86/formal_parameter.ll b/test/DebugInfo/X86/formal_parameter.ll
index 3445f46..2fdab7a 100644
--- a/test/DebugInfo/X86/formal_parameter.ll
+++ b/test/DebugInfo/X86/formal_parameter.ll
@@ -19,7 +19,8 @@ target triple = "x86_64-apple-macosx10.9.0"
; rdar://problem/14874886
;
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name {{.*}}map
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}}map
; CHECK-NOT: DW_AT_name {{.*}}map
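+; (The added CHECK-NOT: DW_TAG guard pins DW_AT_name to this DIE without
+; assuming it is the first attribute, since DW_AT_location may now be
+; emitted ahead of DW_AT_name.)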
; Function Attrs: nounwind ssp uwtable
diff --git a/test/DebugInfo/X86/generate-odr-hash.ll b/test/DebugInfo/X86/generate-odr-hash.ll
index e713f14..2256b3e 100644
--- a/test/DebugInfo/X86/generate-odr-hash.ll
+++ b/test/DebugInfo/X86/generate-odr-hash.ll
@@ -156,10 +156,10 @@
; Don't emit pubtype entries for type DIEs in the compile unit that just indirect to a type unit.
; CHECK-NEXT: unit_size = [[CU_SIZE]]
; CHECK-NEXT: Offset Name
-; CHECK-NEXT: [[BAR]] "bar"
-; CHECK-NEXT: [[WOMBAT]] "wombat"
-; CHECK-NEXT: [[FLUFFY]] "echidna::capybara::mongoose::fluffy"
-; CHECK-NEXT: [[WALRUS]] "walrus"
+; CHECK-DAG: [[BAR]] "bar"
+; CHECK-DAG: [[WALRUS]] "(anonymous namespace)::walrus"
+; CHECK-DAG: [[WOMBAT]] "wombat"
+; CHECK-DAG: [[FLUFFY]] "echidna::capybara::mongoose::fluffy"
%struct.bar = type { i8 }
%"class.echidna::capybara::mongoose::fluffy" = type { i32, i32 }
diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll
index 4e35dbe..96fa52b 100644
--- a/test/DebugInfo/X86/gnu-public-names.ll
+++ b/test/DebugInfo/X86/gnu-public-names.ll
@@ -1,6 +1,5 @@
; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections < %s | FileCheck -check-prefix=ASM %s
; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
-; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections -filetype=obj -dwarf-version=3 < %s | llvm-dwarfdump - | FileCheck %s -check-prefix=DWARF3
; ModuleID = 'dwarf-public-names.cpp'
;
; Generated from:
@@ -46,73 +45,135 @@
; ASM-NEXT: .asciz "C" # External Name
; CHECK: .debug_info contents:
-; CHECK: Compile Unit: length = [[UNIT_SIZE:[0-9a-f]+]]
+; CHECK: Compile Unit:
; CHECK: DW_AT_GNU_pubnames [DW_FORM_flag_present] (true)
; CHECK-NOT: DW_AT_GNU_pubtypes [
-; CHECK: [[C:[0-9a-f]+]]: DW_TAG_structure_type
+; CHECK: [[C:0x[0-9a-f]+]]: DW_TAG_structure_type
; CHECK-NEXT: DW_AT_name {{.*}} "C"
-; CHECK: [[STATIC_MEM_DECL:[0-9a-f]+]]: DW_TAG_member
+; CHECK: [[STATIC_MEM_DECL:0x[0-9a-f]+]]: DW_TAG_member
; CHECK-NEXT: DW_AT_name {{.*}} "static_member_variable"
-; CHECK: [[MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: [[MEM_FUNC_DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
; CHECK-NEXT: DW_AT_MIPS_linkage_name
; CHECK-NEXT: DW_AT_name {{.*}} "member_function"
-; CHECK: [[STATIC_MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: [[STATIC_MEM_FUNC_DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
; CHECK-NEXT: DW_AT_MIPS_linkage_name
; CHECK-NEXT: DW_AT_name {{.*}} "static_member_function"
-; CHECK: [[INT:[0-9a-f]+]]: DW_TAG_base_type
+; CHECK: [[INT:0x[0-9a-f]+]]: DW_TAG_base_type
; CHECK-NEXT: DW_AT_name {{.*}} "int"
-; CHECK: [[STATIC_MEM_VAR:[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}}[[STATIC_MEM_DECL]]
+; CHECK: [[STATIC_MEM_VAR:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[STATIC_MEM_DECL]]}
-; CHECK: [[GLOB_VAR:[0-9a-f]+]]: DW_TAG_variable
+; CHECK: [[GLOB_VAR:0x[0-9a-f]+]]: DW_TAG_variable
; CHECK-NEXT: DW_AT_name {{.*}} "global_variable"
-; CHECK: [[NS:[0-9a-f]+]]: DW_TAG_namespace
+; CHECK: [[NS:0x[0-9a-f]+]]: DW_TAG_namespace
; CHECK-NEXT: DW_AT_name {{.*}} "ns"
-; CHECK: [[GLOB_NS_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
+; CHECK: [[GLOB_NS_VAR_DECL:0x[0-9a-f]+]]: DW_TAG_variable
; CHECK-NEXT: DW_AT_name {{.*}} "global_namespace_variable"
-; CHECK: [[D_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
+; CHECK: [[D_VAR_DECL:0x[0-9a-f]+]]: DW_TAG_variable
; CHECK-NEXT: DW_AT_name {{.*}} "d"
-; CHECK: [[D:[0-9a-f]+]]: DW_TAG_structure_type
+; CHECK: [[D:0x[0-9a-f]+]]: DW_TAG_structure_type
; CHECK-NEXT: DW_AT_name {{.*}} "D"
-; CHECK: [[GLOB_NS_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: [[GLOB_NS_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
; CHECK-NOT: DW_TAG
; CHECK: DW_AT_MIPS_linkage_name
; CHECK-NOT: DW_TAG
; CHECK: DW_AT_name {{.*}} "global_namespace_function"
-; CHECK: [[GLOB_NS_VAR:[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}}[[GLOB_NS_VAR_DECL]]
+; CHECK: [[GLOB_NS_VAR:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[GLOB_NS_VAR_DECL]]}
-; CHECK: [[D_VAR:[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}}[[D_VAR_DECL]]
+; CHECK: [[D_VAR:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[D_VAR_DECL]]}
-; CHECK: [[MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}}[[MEM_FUNC_DECL]]
+; CHECK: DW_AT_name {{.*}} "f3"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[F3_Z:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "z"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_AT_location
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|NULL}}
+
+; CHECK: [[OUTER:.*]]: DW_TAG_namespace
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "outer"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[OUTER_ANON:.*]]: DW_TAG_namespace
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK-NOT: DW_AT_name
+; CHECK: [[OUTER_ANON_C_DECL:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "c"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|NULL}}
+; FIXME: We probably shouldn't bother describing the implicit
+; import of the preceding anonymous namespace. This should be fixed
+; in clang.
+; CHECK: DW_TAG_imported_module
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[OUTER_ANON_C:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[OUTER_ANON_C_DECL]]}
+
+; CHECK: [[ANON:.*]]: DW_TAG_namespace
+; CHECK-NOT: DW_AT_name
+; CHECK: [[ANON_INNER:.*]]: DW_TAG_namespace
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "inner"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[ANON_INNER_B_DECL:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "b"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[ANON_I_DECL:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "i"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[ANON_INNER_B:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[ANON_INNER_B_DECL]]}
+; CHECK: [[ANON_I:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK-NEXT: DW_AT_specification {{.*}} {[[ANON_I_DECL]]}
+
+; CHECK: [[MEM_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_specification {{.*}} {[[MEM_FUNC_DECL]]}
-; CHECK: [[STATIC_MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: [[STATIC_MEM_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}}[[STATIC_MEM_FUNC_DECL]]
+; CHECK: DW_AT_specification {{.*}} {[[STATIC_MEM_FUNC_DECL]]}
-; CHECK: [[GLOBAL_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: [[GLOBAL_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
; CHECK-NOT: DW_TAG
; CHECK: DW_AT_MIPS_linkage_name
; CHECK-NOT: DW_TAG
; CHECK: DW_AT_name {{.*}} "global_function"
; CHECK-LABEL: .debug_gnu_pubnames contents:
-; CHECK-NEXT: length = 0x000000e7 version = 0x0002 unit_offset = 0x00000000 unit_size = [[UNIT_SIZE]]
+; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
; CHECK-NEXT: Offset Linkage Kind Name
; CHECK-DAG: [[GLOBAL_FUNC]] EXTERNAL FUNCTION "global_function"
; CHECK-DAG: [[NS]] EXTERNAL TYPE "ns"
@@ -123,6 +184,20 @@
; CHECK-DAG: [[D_VAR]] EXTERNAL VARIABLE "ns::d"
; CHECK-DAG: [[STATIC_MEM_VAR]] EXTERNAL VARIABLE "C::static_member_variable"
; CHECK-DAG: [[STATIC_MEM_FUNC]] EXTERNAL FUNCTION "C::static_member_function"
+; CHECK-DAG: [[ANON]] EXTERNAL TYPE "(anonymous namespace)"
+; CHECK-DAG: [[ANON_INNER]] EXTERNAL TYPE "(anonymous namespace)::inner"
+; CHECK-DAG: [[OUTER]] EXTERNAL TYPE "outer"
+; CHECK-DAG: [[OUTER_ANON]] EXTERNAL TYPE "outer::(anonymous namespace)"
+; CHECK-DAG: [[ANON_I]] STATIC VARIABLE "(anonymous namespace)::i"
+; CHECK-DAG: [[ANON_INNER_B]] STATIC VARIABLE "(anonymous namespace)::inner::b"
+; CHECK-DAG: [[OUTER_ANON_C]] STATIC VARIABLE "outer::(anonymous namespace)::c"
+
+; GCC doesn't put local statics in pubnames, but it seems not unreasonable and
+; comes out naturally from LLVM's implementation, so I'm OK with it for now. If
+; it's demonstrated that this is a major size concern or degrades debug info
+; consumer behavior, feel free to change it.
+
+; CHECK-DAG: [[F3_Z]] STATIC VARIABLE "f3::z"
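+; (Illustration: this entry comes from the IR global @_ZZ2f3vE1z, the local
+; static "z" inside "f3"; its pubnames name is formed by qualifying the
+; variable with its enclosing function.)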
; CHECK-LABEL: debug_gnu_pubtypes contents:
@@ -131,92 +206,6 @@
; CHECK-DAG: [[D]] EXTERNAL TYPE "ns::D"
; CHECK-DAG: [[INT]] STATIC TYPE "int"
-; DWARF3: .debug_info contents:
-; DWARF3: Compile Unit: length = [[UNIT_SIZE:[0-9a-f]+]]
-; DWARF3: DW_AT_GNU_pubnames [DW_FORM_flag] (0x01)
-; DWARF3-NOT: DW_AT_GNU_pubtypes [
-
-; DWARF3: [[C:[0-9a-f]+]]: DW_TAG_structure_type
-; DWARF3-NEXT: DW_AT_name {{.*}} "C"
-
-; DWARF3: [[STATIC_MEM_DECL:[0-9a-f]+]]: DW_TAG_member
-; DWARF3-NEXT: DW_AT_name {{.*}} "static_member_variable"
-
-; DWARF3: [[MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NEXT: DW_AT_MIPS_linkage_name
-; DWARF3-NEXT: DW_AT_name {{.*}} "member_function"
-
-; DWARF3: [[STATIC_MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NEXT: DW_AT_MIPS_linkage_name
-; DWARF3-NEXT: DW_AT_name {{.*}} "static_member_function"
-
-; DWARF3: [[INT:[0-9a-f]+]]: DW_TAG_base_type
-; DWARF3-NEXT: DW_AT_name {{.*}} "int"
-
-; DWARF3: [[STATIC_MEM_VAR:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_specification {{.*}}[[STATIC_MEM_DECL]]
-
-; DWARF3: [[GLOB_VAR:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_name {{.*}} "global_variable"
-
-; DWARF3: [[NS:[0-9a-f]+]]: DW_TAG_namespace
-; DWARF3-NEXT: DW_AT_name {{.*}} "ns"
-
-; DWARF3: [[GLOB_NS_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_name {{.*}} "global_namespace_variable"
-
-; DWARF3: [[D_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_name {{.*}} "d"
-
-; DWARF3: [[D:[0-9a-f]+]]: DW_TAG_structure_type
-; DWARF3-NEXT: DW_AT_name {{.*}} "D"
-
-; DWARF3: [[GLOB_NS_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_MIPS_linkage_name
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_name {{.*}} "global_namespace_function"
-
-; DWARF3: [[GLOB_NS_VAR:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_specification {{.*}}[[GLOB_NS_VAR_DECL]]
-
-; DWARF3: [[D_VAR:[0-9a-f]+]]: DW_TAG_variable
-; DWARF3-NEXT: DW_AT_specification {{.*}}[[D_VAR_DECL]]
-
-; DWARF3: [[MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_specification {{.*}}[[MEM_FUNC_DECL]]
-
-; DWARF3: [[STATIC_MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_specification {{.*}}[[STATIC_MEM_FUNC_DECL]]
-
-; DWARF3: [[GLOBAL_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_MIPS_linkage_name
-; DWARF3-NOT: DW_TAG
-; DWARF3: DW_AT_name {{.*}} "global_function"
-
-; DWARF3-LABEL: .debug_gnu_pubnames contents:
-; DWARF3-NEXT: length = 0x000000e7 version = 0x0002 unit_offset = 0x00000000 unit_size = [[UNIT_SIZE]]
-; DWARF3-NEXT: Offset Linkage Kind Name
-; DWARF3-DAG: [[GLOBAL_FUNC]] EXTERNAL FUNCTION "global_function"
-; DWARF3-DAG: [[NS]] EXTERNAL TYPE "ns"
-; DWARF3-DAG: [[MEM_FUNC]] EXTERNAL FUNCTION "C::member_function"
-; DWARF3-DAG: [[GLOB_VAR]] EXTERNAL VARIABLE "global_variable"
-; DWARF3-DAG: [[GLOB_NS_VAR]] EXTERNAL VARIABLE "ns::global_namespace_variable"
-; DWARF3-DAG: [[GLOB_NS_FUNC]] EXTERNAL FUNCTION "ns::global_namespace_function"
-; DWARF3-DAG: [[D_VAR]] EXTERNAL VARIABLE "ns::d"
-; DWARF3-DAG: [[STATIC_MEM_VAR]] EXTERNAL VARIABLE "C::static_member_variable"
-; DWARF3-DAG: [[STATIC_MEM_FUNC]] EXTERNAL FUNCTION "C::static_member_function"
-
-
-; DWARF3-LABEL: debug_gnu_pubtypes contents:
-; DWARF3: Offset Linkage Kind Name
-; DWARF3-DAG: [[C]] EXTERNAL TYPE "C"
-; DWARF3-DAG: [[D]] EXTERNAL TYPE "ns::D"
-; DWARF3-DAG: [[INT]] STATIC TYPE "int"
-
%struct.C = type { i8 }
%"struct.ns::D" = type { i32 }
@@ -224,16 +213,20 @@
@global_variable = global %struct.C zeroinitializer, align 1
@_ZN2ns25global_namespace_variableE = global i32 1, align 4
@_ZN2ns1dE = global %"struct.ns::D" zeroinitializer, align 4
+@_ZZ2f3vE1z = internal global i32 0, align 4
+@_ZN12_GLOBAL__N_11iE = internal global i32 0, align 4
+@_ZN12_GLOBAL__N_15inner1bE = internal global i32 0, align 4
+@_ZN5outer12_GLOBAL__N_11cE = internal global i32 0, align 4
; Function Attrs: nounwind uwtable
define void @_ZN1C15member_functionEv(%struct.C* %this) #0 align 2 {
entry:
%this.addr = alloca %struct.C*, align 8
store %struct.C* %this, %struct.C** %this.addr, align 8
- call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36), !dbg !38
+ call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !50), !dbg !52
%this1 = load %struct.C** %this.addr
- store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !39
- ret void, !dbg !39
+ store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !53
+ ret void, !dbg !54
}
; Function Attrs: nounwind readnone
@@ -242,72 +235,108 @@ declare void @llvm.dbg.declare(metadata, metadata) #1
; Function Attrs: nounwind uwtable
define i32 @_ZN1C22static_member_functionEv() #0 align 2 {
entry:
- %0 = load i32* @_ZN1C22static_member_variableE, align 4, !dbg !40
- ret i32 %0, !dbg !40
+ %0 = load i32* @_ZN1C22static_member_variableE, align 4, !dbg !55
+ ret i32 %0, !dbg !55
}
; Function Attrs: nounwind uwtable
define i32 @_Z15global_functionv() #0 {
entry:
- ret i32 -1, !dbg !41
+ ret i32 -1, !dbg !56
}
; Function Attrs: nounwind uwtable
define void @_ZN2ns25global_namespace_functionEv() #0 {
entry:
- call void @_ZN1C15member_functionEv(%struct.C* @global_variable), !dbg !42
- ret void, !dbg !42
+ call void @_ZN1C15member_functionEv(%struct.C* @global_variable), !dbg !57
+ ret void, !dbg !58
+}
+
+; Function Attrs: nounwind uwtable
+define i32* @_Z2f3v() #0 {
+entry:
+ ret i32* @_ZZ2f3vE1z, !dbg !59
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z2f7v() #0 {
+entry:
+ %0 = load i32* @_ZN12_GLOBAL__N_11iE, align 4, !dbg !60
+ %call = call i32* @_Z2f3v(), !dbg !60
+ %1 = load i32* %call, align 4, !dbg !60
+ %add = add nsw i32 %0, %1, !dbg !60
+ %2 = load i32* @_ZN12_GLOBAL__N_15inner1bE, align 4, !dbg !60
+ %add1 = add nsw i32 %add, %2, !dbg !60
+ %3 = load i32* @_ZN5outer12_GLOBAL__N_11cE, align 4, !dbg !60
+ %add2 = add nsw i32 %add1, %3, !dbg !60
+ ret i32 %add2, !dbg !60
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!34, !43}
-!llvm.ident = !{!35}
+!llvm.module.flags = !{!47, !48}
+!llvm.ident = !{!49}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192862) (llvm/trunk 192861)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !21, metadata !29, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/pubnames.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"pubnames.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !19, metadata !32, metadata !45, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/pubnames.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"pubnames.cpp", metadata !"/tmp/dbginfo"}
!2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !17}
+!3 = metadata !{metadata !4, metadata !15}
!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8, metadata !13}
+!5 = metadata !{metadata !6, metadata !8, metadata !12}
!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1C", metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!10 = metadata !{null, metadata !11}
!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!12 = metadata !{i32 786468}
-!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{metadata !7}
-!16 = metadata !{i32 786468}
-!17 = metadata !{i32 786451, metadata !1, metadata !18, metadata !"D", i32 21, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 21, size 32, align 32, offset 0] [def] [from ]
-!18 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 17} ; [ DW_TAG_namespace ] [ns] [line 17]
-!19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN2ns1DE", metadata !"A", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [A] [line 22, size 32, align 32, offset 0] [from int]
-!21 = metadata !{metadata !22, metadata !23, metadata !24, metadata !26}
-!22 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!23 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !13, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [static_member_function]
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 15, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !2, i32 15} ; [ DW_TAG_subprogram ] [line 15] [def] [global_function]
-!25 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/pubnames.cpp]
-!26 = metadata !{i32 786478, metadata !1, metadata !18, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 18, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [global_namespace_function]
-!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{null}
-!29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33}
-!30 = metadata !{i32 786484, i32 0, metadata !4, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !25, i32 7, metadata !7, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!31 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !25, i32 13, metadata !4, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 13] [def]
-!32 = metadata !{i32 786484, i32 0, metadata !18, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !25, i32 19, metadata !7, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 19] [def]
-!33 = metadata !{i32 786484, i32 0, metadata !18, metadata !"d", metadata !"d", metadata !"_ZN2ns1dE", metadata !25, i32 23, metadata !17, i32 0, i32 1, %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 23] [def]
-!34 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!35 = metadata !{metadata !"clang version 3.4 (trunk 192862) (llvm/trunk 192861)"}
-!36 = metadata !{i32 786689, metadata !22, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!38 = metadata !{i32 0, i32 0, metadata !22, null}
-!39 = metadata !{i32 9, i32 0, metadata !22, null}
-!40 = metadata !{i32 11, i32 0, metadata !23, null}
-!41 = metadata !{i32 15, i32 0, metadata !24, null}
-!42 = metadata !{i32 18, i32 0, metadata !26, null}
-
-!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !7}
+!15 = metadata !{i32 786451, metadata !1, metadata !16, metadata !"D", i32 28, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 28, size 32, align 32, offset 0] [def] [from ]
+!16 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [ns] [line 23]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN2ns1DE", metadata !"A", i32 29, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [A] [line 29, size 32, align 32, offset 0] [from int]
+!19 = metadata !{metadata !20, metadata !21, metadata !22, metadata !24, metadata !27, metadata !31}
+!20 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!21 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !12, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!22 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!23 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/pubnames.cpp]
+!24 = metadata !{i32 786478, metadata !1, metadata !16, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !25, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!25 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = metadata !{null}
+!27 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"f3", metadata !"f3", metadata !"_Z2f3v", i32 37, metadata !28, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32* ()* @_Z2f3v, null, null, metadata !2, i32 37} ; [ DW_TAG_subprogram ] [line 37] [def] [f3]
+!28 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !30}
+!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!31 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"f7", metadata !"f7", metadata !"_Z2f7v", i32 54, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z2f7v, null, null, metadata !2, i32 54} ; [ DW_TAG_subprogram ] [line 54] [def] [f7]
+!32 = metadata !{metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !41, metadata !44}
+!33 = metadata !{i32 786484, i32 0, metadata !4, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !23, i32 7, metadata !7, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!34 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !23, i32 17, metadata !"_ZTS1C", i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!35 = metadata !{i32 786484, i32 0, metadata !16, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !23, i32 27, metadata !7, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!36 = metadata !{i32 786484, i32 0, metadata !16, metadata !"d", metadata !"d", metadata !"_ZN2ns1dE", metadata !23, i32 30, metadata !"_ZTSN2ns1DE", i32 0, i32 1, %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 30] [def]
+!37 = metadata !{i32 786484, i32 0, metadata !27, metadata !"z", metadata !"z", metadata !"", metadata !23, i32 38, metadata !7, i32 1, i32 1, i32* @_ZZ2f3vE1z, null} ; [ DW_TAG_variable ] [z] [line 38] [local] [def]
+!38 = metadata !{i32 786484, i32 0, metadata !39, metadata !"c", metadata !"c", metadata !"_ZN5outer12_GLOBAL__N_11cE", metadata !23, i32 50, metadata !7, i32 1, i32 1, i32* @_ZN5outer12_GLOBAL__N_11cE, null} ; [ DW_TAG_variable ] [c] [line 50] [local] [def]
+!39 = metadata !{i32 786489, metadata !1, metadata !40, metadata !"", i32 49} ; [ DW_TAG_namespace ] [line 49]
+!40 = metadata !{i32 786489, metadata !1, null, metadata !"outer", i32 48} ; [ DW_TAG_namespace ] [outer] [line 48]
+!41 = metadata !{i32 786484, i32 0, metadata !42, metadata !"b", metadata !"b", metadata !"_ZN12_GLOBAL__N_15inner1bE", metadata !23, i32 44, metadata !7, i32 1, i32 1, i32* @_ZN12_GLOBAL__N_15inner1bE, null} ; [ DW_TAG_variable ] [b] [line 44] [local] [def]
+!42 = metadata !{i32 786489, metadata !1, metadata !43, metadata !"inner", i32 43} ; [ DW_TAG_namespace ] [inner] [line 43]
+!43 = metadata !{i32 786489, metadata !1, null, metadata !"", i32 33} ; [ DW_TAG_namespace ] [line 33]
+!44 = metadata !{i32 786484, i32 0, metadata !43, metadata !"i", metadata !"i", metadata !"_ZN12_GLOBAL__N_11iE", metadata !23, i32 34, metadata !7, i32 1, i32 1, i32* @_ZN12_GLOBAL__N_11iE, null} ; [ DW_TAG_variable ] [i] [line 34] [local] [def]
+!45 = metadata !{metadata !46}
+!46 = metadata !{i32 786490, metadata !40, metadata !39, i32 40} ; [ DW_TAG_imported_module ]
+!47 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!48 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{metadata !"clang version 3.5.0 "}
+!50 = metadata !{i32 786689, metadata !20, metadata !"this", null, i32 16777216, metadata !51, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!51 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!52 = metadata !{i32 0, i32 0, metadata !20, null}
+!53 = metadata !{i32 10, i32 0, metadata !20, null}
+!54 = metadata !{i32 11, i32 0, metadata !20, null}
+!55 = metadata !{i32 14, i32 0, metadata !21, null}
+!56 = metadata !{i32 20, i32 0, metadata !22, null}
+!57 = metadata !{i32 25, i32 0, metadata !24, null}
+!58 = metadata !{i32 26, i32 0, metadata !24, null}
+!59 = metadata !{i32 39, i32 0, metadata !27, null}
+!60 = metadata !{i32 55, i32 0, metadata !31, null}
diff --git a/test/DebugInfo/X86/lit.local.cfg b/test/DebugInfo/X86/lit.local.cfg
index 19840aa..c8625f4 100644
--- a/test/DebugInfo/X86/lit.local.cfg
+++ b/test/DebugInfo/X86/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/DebugInfo/X86/misched-dbg-value.ll b/test/DebugInfo/X86/misched-dbg-value.ll
index 478f221..c713e65 100644
--- a/test/DebugInfo/X86/misched-dbg-value.ll
+++ b/test/DebugInfo/X86/misched-dbg-value.ll
@@ -6,20 +6,33 @@
; function parameters.
; CHECK: .debug_info contents:
; CHECK: DW_TAG_compile_unit
-; CHECK: DW_TAG_subprogram
-; CHECK: Proc8
-; CHECK: DW_TAG_formal_parameter
-; CHECK: Array1Par
-; CHECK: DW_AT_location
-; CHECK: DW_TAG_formal_parameter
-; CHECK: Array2Par
-; CHECK: DW_AT_location
-; CHECK: DW_TAG_formal_parameter
-; CHECK: IntParI1
-; CHECK: DW_AT_location
-; CHECK: DW_TAG_formal_parameter
-; CHECK: IntParI2
-; CHECK: DW_AT_location
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "Proc8"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "Array1Par"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "Array2Par"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "IntParI1"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "IntParI2"
%struct.Record = type { %struct.Record*, i32, i32, i32, [31 x i8] }
diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll
index 810ebbc..31003ee 100644
--- a/test/DebugInfo/X86/op_deref.ll
+++ b/test/DebugInfo/X86/op_deref.ll
@@ -1,18 +1,21 @@
-; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=DW-CHECK
-; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -dwarf-version=3
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=DWARF3
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin < %s -filetype=obj \
+; RUN: | llvm-dwarfdump -debug-dump=info - \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF4
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin < %s -filetype=obj -dwarf-version=3 \
+; RUN: | llvm-dwarfdump -debug-dump=info - \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF3
-; DW-CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla")
; FIXME: The location here needs to be fixed, but llvm-dwarfdump doesn't handle
; DW_AT_location lists yet.
-; DW-CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000)
+; DWARF4: DW_AT_location [DW_FORM_sec_offset] (0x00000000)
-; DWARF3: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla")
; FIXME: The location here needs to be fixed, but llvm-dwarfdump doesn't handle
; DW_AT_location lists yet.
; DWARF3: DW_AT_location [DW_FORM_data4] (0x00000000)
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla")
+
; Unfortunately llvm-dwarfdump can't unparse a list of DW_AT_locations
; right now, so we check the asm output:
; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o - -filetype=asm | FileCheck %s -check-prefix=ASM-CHECK
@@ -86,7 +89,7 @@ declare void @llvm.stackrestore(i8*) nounwind
!11 = metadata !{i32 1, i32 26, metadata !5, null}
!12 = metadata !{i32 3, i32 13, metadata !13, null}
!13 = metadata !{i32 786443, metadata !28, metadata !5, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 8192, i32 0, i64 2} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 8192, i32 0, metadata !30} ; [ DW_TAG_auto_variable ]
!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
!16 = metadata !{metadata !17}
!17 = metadata !{i32 786465, i64 0, i64 -1} ; [ DW_TAG_subrange_type ]
@@ -102,3 +105,4 @@ declare void @llvm.stackrestore(i8*) nounwind
!27 = metadata !{i32 8, i32 1, metadata !13, null}
!28 = metadata !{metadata !"bar.c", metadata !"/Users/echristo/tmp"}
!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i64 2}
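
Folding the two runs onto shared assertions works because FileCheck accepts multiple -check-prefix flags per invocation: lines common to both DWARF versions are tagged CHECK, and version-specific expectations keep their own prefix. A minimal sketch of the pattern (llc arguments abbreviated):

  ; RUN: llc ... < %s | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF4
  ; RUN: llc ... -dwarf-version=3 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF3
  ; CHECK: matched by both runs
  ; DWARF3: matched only when the DWARF3 prefix is enabled
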
diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index 8248cf6..4215c21 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll
@@ -23,13 +23,15 @@
; }
; CHECK: debug_info contents
-; CHECK: DW_AT_name{{.*}} = "f"
; 0x74 is DW_OP_breg4, showing that the parameter is accessed indirectly
; (with a zero offset) from the register parameter
; CHECK: DW_AT_location{{.*}}(<0x0{{.}}> 74 00
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}} = "f"
-; CHECK: DW_AT_name{{.*}} = "g"
; CHECK: DW_AT_location{{.*}}([[G_LOC:0x[0-9]*]])
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}} = "g"
; CHECK: debug_loc contents
; CHECK-NEXT: [[G_LOC]]: Beginning
; CHECK-NEXT: Ending
diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll
index 117e426..79d00ed 100644
--- a/test/DebugInfo/X86/pr12831.ll
+++ b/test/DebugInfo/X86/pr12831.ll
@@ -212,7 +212,7 @@ entry:
!134 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
!135 = metadata !{i32 19, i32 39, metadata !5, null}
!136 = metadata !{i32 20, i32 17, metadata !137, null}
-!137 = metadata !{i32 786443, metadata !5, i32 19, i32 51, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
+!137 = metadata !{i32 786443, metadata !6, metadata !5, i32 19, i32 51, i32 0} ; [ DW_TAG_lexical_block ]
!138 = metadata !{i32 23, i32 17, metadata !137, null}
!139 = metadata !{i32 26, i32 15, metadata !137, null}
!140 = metadata !{i32 786689, metadata !106, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
@@ -221,19 +221,19 @@ entry:
!143 = metadata !{i32 786689, metadata !106, metadata !"__f", metadata !6, i32 33554440, metadata !61, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
!144 = metadata !{i32 8, i32 63, metadata !106, null}
!145 = metadata !{i32 9, i32 9, metadata !146, null}
-!146 = metadata !{i32 786443, metadata !106, i32 8, i32 81, metadata !6, i32 1} ; [ DW_TAG_lexical_block ]
+!146 = metadata !{i32 786443, metadata !6, metadata !106, i32 8, i32 81, i32 1} ; [ DW_TAG_lexical_block ]
!147 = metadata !{i32 10, i32 13, metadata !146, null}
!148 = metadata !{i32 4, i32 5, metadata !149, null}
-!149 = metadata !{i32 786443, metadata !107, i32 3, i32 105, metadata !6, i32 2} ; [ DW_TAG_lexical_block ]
+!149 = metadata !{i32 786443, metadata !6, metadata !107, i32 3, i32 105, i32 2} ; [ DW_TAG_lexical_block ]
!150 = metadata !{i32 786689, metadata !126, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
!151 = metadata !{i32 8, i32 45, metadata !126, null}
!152 = metadata !{i32 786689, metadata !126, metadata !"__f", metadata !6, i32 33554440, metadata !26, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
!153 = metadata !{i32 8, i32 63, metadata !126, null}
!154 = metadata !{i32 9, i32 9, metadata !155, null}
-!155 = metadata !{i32 786443, metadata !126, i32 8, i32 81, metadata !6, i32 3} ; [ DW_TAG_lexical_block ]
+!155 = metadata !{i32 786443, metadata !6, metadata !126, i32 8, i32 81, i32 3} ; [ DW_TAG_lexical_block ]
!156 = metadata !{i32 10, i32 13, metadata !155, null}
!157 = metadata !{i32 4, i32 5, metadata !158, null}
-!158 = metadata !{i32 786443, metadata !127, i32 3, i32 105, metadata !6, i32 4} ; [ DW_TAG_lexical_block ]
+!158 = metadata !{i32 786443, metadata !6, metadata !127, i32 3, i32 105, i32 4} ; [ DW_TAG_lexical_block ]
!159 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
!160 = metadata !{metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta"}
!161 = metadata !{metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta"}
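
All four updates here make the same mechanical change: the DW_TAG_lexical_block node now carries its file reference immediately after the tag, ahead of the parent scope. Reading the fields off the updated nodes gives, roughly:

  ; !{i32 786443, file, scope, line, column, unique-id} ; [ DW_TAG_lexical_block ]
  !137 = metadata !{i32 786443, metadata !6, metadata !5, i32 19, i32 51, i32 0}
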
diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll
new file mode 100644
index 0000000..07e3a42
--- /dev/null
+++ b/test/DebugInfo/X86/pr19307.ll
@@ -0,0 +1,147 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; Generated from the source file pr19307.cc:
+; #include <string>
+; void parse_range(unsigned long long &offset, unsigned long long &limit,
+; std::string range) {
+; if (range.compare(0, 6, "items=") != 0 || range[6] == '-')
+; offset = 1;
+; range.erase(0, 6);
+; limit = 2;
+; }
+; with "clang++ -S -emit-llvm -O0 -g pr19307.cc"
+
+; The "range" argument is spilled from %rdx to the stack, and its location
+; is then addressed via %rbp.
+; CHECK: movq %rdx, {{[-0-9]+}}(%rbp)
+; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
+; This location should be valid until the end of the function.
+
+; Verify that we have a proper range in the debug_loc section:
+; CHECK: .Ldebug_loc{{[0-9]+}}:
+; CHECK: DW_OP_breg1
+; CHECK: .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK-NEXT: .Lset{{[0-9]+}} = .Lfunc_end0-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK: DW_OP_breg6
+; CHECK: DW_OP_deref
+
+; ModuleID = 'pr19307.cc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+
+@.str = private unnamed_addr constant [7 x i8] c"items=\00", align 1
+
+; Function Attrs: uwtable
+define void @_Z11parse_rangeRyS_Ss(i64* %offset, i64* %limit, %"class.std::basic_string"* %range) #0 {
+entry:
+ %offset.addr = alloca i64*, align 8
+ %limit.addr = alloca i64*, align 8
+ store i64* %offset, i64** %offset.addr, align 8
+ call void @llvm.dbg.declare(metadata !{i64** %offset.addr}, metadata !45), !dbg !46
+ store i64* %limit, i64** %limit.addr, align 8
+ call void @llvm.dbg.declare(metadata !{i64** %limit.addr}, metadata !47), !dbg !46
+ call void @llvm.dbg.declare(metadata !{%"class.std::basic_string"* %range}, metadata !48), !dbg !49
+ %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)), !dbg !50
+ %cmp = icmp ne i32 %call, 0, !dbg !50
+ br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !50
+
+lor.lhs.false: ; preds = %entry
+ %call1 = call i8* @_ZNSsixEm(%"class.std::basic_string"* %range, i64 6), !dbg !52
+ %0 = load i8* %call1, !dbg !52
+ %conv = sext i8 %0 to i32, !dbg !52
+ %cmp2 = icmp eq i32 %conv, 45, !dbg !52
+ br i1 %cmp2, label %if.then, label %if.end, !dbg !52
+
+if.then: ; preds = %lor.lhs.false, %entry
+ %1 = load i64** %offset.addr, align 8, !dbg !54
+ store i64 1, i64* %1, align 8, !dbg !54
+ br label %if.end, !dbg !54
+
+if.end: ; preds = %if.then, %lor.lhs.false
+ %call3 = call %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"* %range, i64 0, i64 6), !dbg !55
+ %2 = load i64** %limit.addr, align 8, !dbg !56
+ store i64 2, i64* %2, align 8, !dbg !56
+ ret void, !dbg !57
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
+
+declare i8* @_ZNSsixEm(%"class.std::basic_string"*, i64) #2
+
+declare %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"*, i64, i64) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!42, !43}
+!llvm.ident = !{!44}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (209308)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !2, metadata !21, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/pr19307.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"pr19307.cc", metadata !"/llvm_cmake_gcc"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !6, metadata !8}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"", i32 83, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_structure_type ] [line 83, size 0, align 0, offset 0] [decl] [from ]
+!5 = metadata !{metadata !"/usr/include/wchar.h", metadata !"/llvm_cmake_gcc"}
+!6 = metadata !{i32 786451, metadata !7, null, metadata !"lconv", i32 54, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS5lconv"} ; [ DW_TAG_structure_type ] [lconv] [line 54, size 0, align 0, offset 0] [decl] [from ]
+!7 = metadata !{metadata !"/usr/include/locale.h", metadata !"/llvm_cmake_gcc"}
+!8 = metadata !{i32 786434, metadata !9, metadata !10, metadata !"basic_string<char, std::char_traits<char>, std::allocator<char> >", i32 1134, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTSSs"} ; [ DW_TAG_class_type ] [basic_string<char, std::char_traits<char>, std::allocator<char> >] [line 1134, size 0, align 0, offset 0] [decl] [from ]
+!9 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", metadata !"/llvm_cmake_gcc"}
+!10 = metadata !{i32 786489, metadata !11, null, metadata !"std", i32 153} ; [ DW_TAG_namespace ] [std] [line 153]
+!11 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", metadata !"/llvm_cmake_gcc"}
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"parse_range", metadata !"parse_range", metadata !"_Z11parse_rangeRyS_Ss", i32 3, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i64*, i64*, %"class.std::basic_string"*)* @_Z11parse_rangeRyS_Ss, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [parse_range]
+!14 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/pr19307.cc]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null, metadata !17, metadata !17, metadata !19}
+!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!18 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!19 = metadata !{i32 786454, metadata !20, metadata !10, metadata !"string", i32 65, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTSSs"} ; [ DW_TAG_typedef ] [string] [line 65, size 0, align 0, offset 0] [from _ZTSSs]
+!20 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", metadata !"/llvm_cmake_gcc"}
+!21 = metadata !{metadata !22, metadata !26, metadata !29, metadata !33, metadata !38, metadata !41}
+!22 = metadata !{i32 786490, metadata !23, metadata !25, i32 57} ; [ DW_TAG_imported_module ]
+!23 = metadata !{i32 786489, metadata !24, null, metadata !"__gnu_debug", i32 55} ; [ DW_TAG_namespace ] [__gnu_debug] [line 55]
+!24 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", metadata !"/llvm_cmake_gcc"}
+!25 = metadata !{i32 786489, metadata !24, metadata !10, metadata !"__debug", i32 49} ; [ DW_TAG_namespace ] [__debug] [line 49]
+!26 = metadata !{i32 786440, metadata !10, metadata !27, i32 66} ; [ DW_TAG_imported_declaration ]
+!27 = metadata !{i32 786454, metadata !5, null, metadata !"mbstate_t", i32 106, i64 0, i64 0, i64 0, i32 0, metadata !28} ; [ DW_TAG_typedef ] [mbstate_t] [line 106, size 0, align 0, offset 0] [from __mbstate_t]
+!28 = metadata !{i32 786454, metadata !5, null, metadata !"__mbstate_t", i32 95, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_typedef ] [__mbstate_t] [line 95, size 0, align 0, offset 0] [from _ZTS11__mbstate_t]
+!29 = metadata !{i32 786440, metadata !10, metadata !30, i32 141} ; [ DW_TAG_imported_declaration ]
+!30 = metadata !{i32 786454, metadata !31, null, metadata !"wint_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_typedef ] [wint_t] [line 141, size 0, align 0, offset 0] [from unsigned int]
+!31 = metadata !{metadata !"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", metadata !"/llvm_cmake_gcc"}
+!32 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!33 = metadata !{i32 786440, metadata !34, metadata !36, i32 42} ; [ DW_TAG_imported_declaration ]
+!34 = metadata !{i32 786489, metadata !35, null, metadata !"__gnu_cxx", i32 69} ; [ DW_TAG_namespace ] [__gnu_cxx] [line 69]
+!35 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", metadata !"/llvm_cmake_gcc"}
+!36 = metadata !{i32 786454, metadata !11, metadata !10, metadata !"size_t", i32 155, i64 0, i64 0, i64 0, i32 0, metadata !37} ; [ DW_TAG_typedef ] [size_t] [line 155, size 0, align 0, offset 0] [from long unsigned int]
+!37 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!38 = metadata !{i32 786440, metadata !34, metadata !39, i32 43} ; [ DW_TAG_imported_declaration ]
+!39 = metadata !{i32 786454, metadata !11, metadata !10, metadata !"ptrdiff_t", i32 156, i64 0, i64 0, i64 0, i32 0, metadata !40} ; [ DW_TAG_typedef ] [ptrdiff_t] [line 156, size 0, align 0, offset 0] [from long int]
+!40 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!41 = metadata !{i32 786440, metadata !10, metadata !"_ZTS5lconv", i32 55} ; [ DW_TAG_imported_declaration ]
+!42 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!43 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!44 = metadata !{metadata !"clang version 3.5.0 (209308)"}
+!45 = metadata !{i32 786689, metadata !13, metadata !"offset", metadata !14, i32 16777219, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [offset] [line 3]
+!46 = metadata !{i32 3, i32 0, metadata !13, null}
+!47 = metadata !{i32 786689, metadata !13, metadata !"limit", metadata !14, i32 33554435, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [limit] [line 3]
+!48 = metadata !{i32 786689, metadata !13, metadata !"range", metadata !14, i32 50331652, metadata !19, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [range] [line 4]
+!49 = metadata !{i32 4, i32 0, metadata !13, null}
+!50 = metadata !{i32 5, i32 0, metadata !51, null}
+!51 = metadata !{i32 786443, metadata !1, metadata !13, i32 5, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!52 = metadata !{i32 5, i32 0, metadata !53, null}
+!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 5, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!54 = metadata !{i32 6, i32 0, metadata !51, null}
+!55 = metadata !{i32 7, i32 0, metadata !13, null}
+!56 = metadata !{i32 8, i32 0, metadata !13, null}
+!57 = metadata !{i32 9, i32 0, metadata !13, null}
+
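
The .Lset checks above pin down the bounds of the location-list entry: each bound is a label difference against .Lfunc_begin0, emitted as an assembler assignment followed by a .quad. The assembly being matched looks roughly like this (label numbers hypothetical):

  .Ldebug_loc1:
  .Lset2 = .Ltmp3-.Lfunc_begin0
          .quad   .Lset2
  .Lset3 = .Lfunc_end0-.Lfunc_begin0
          .quad   .Lset3
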
diff --git a/test/DebugInfo/X86/sret.ll b/test/DebugInfo/X86/sret.ll
index fed4334..faf5158 100644
--- a/test/DebugInfo/X86/sret.ll
+++ b/test/DebugInfo/X86/sret.ll
@@ -3,8 +3,8 @@
; Based on the debuginfo-tests/sret.cpp code.
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x72aabf538392d298)
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x72aabf538392d298)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x5b59949640ec1580)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x5b59949640ec1580)
%class.A = type { i32 (...)**, i32 }
%class.B = type { i8 }
diff --git a/test/DebugInfo/X86/subregisters.ll b/test/DebugInfo/X86/subregisters.ll
index 738ab02..d46a95f 100644
--- a/test/DebugInfo/X86/subregisters.ll
+++ b/test/DebugInfo/X86/subregisters.ll
@@ -6,8 +6,8 @@
;
; rdar://problem/16015314
;
+; CHECK: DW_AT_location [DW_FORM_block1] (<0x03> 54 93 04 )
; CHECK: DW_AT_name [DW_FORM_strp]{{.*}} "a"
-; CHECK: DW_AT_location [DW_FORM_block1] (<0x03> 54 93 04 )
;
; struct bar {
; int a;
diff --git a/test/DebugInfo/cross-cu-inlining.ll b/test/DebugInfo/cross-cu-inlining.ll
index 266a24d..899558a 100644
--- a/test/DebugInfo/cross-cu-inlining.ll
+++ b/test/DebugInfo/cross-cu-inlining.ll
@@ -56,8 +56,9 @@
; CHECK: DW_AT_abstract_origin {{.*}} {0x[[ABS_FUNC]]}
; CHECK: DW_TAG_formal_parameter
; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_abstract_origin {{.*}} {0x[[ABS_VAR]]}
; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {0x[[ABS_VAR]]}
@i = external global i32
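
The swap matches the new attribute order, but the structure under test is unchanged: a concrete inlined parameter carries only its location plus a DW_AT_abstract_origin pointing back at the abstract DIE that owns the name. A sketch of the DIE tree the checks walk:

  ; DW_TAG_subprogram           (abstract; owns DW_AT_name)
  ;   DW_TAG_formal_parameter   (abstract; owns DW_AT_name)  <- ABS_VAR
  ; DW_TAG_subprogram           (concrete; DW_AT_abstract_origin -> abstract subprogram)
  ;   DW_TAG_formal_parameter   (concrete; DW_AT_location +
  ;                              DW_AT_abstract_origin -> ABS_VAR)
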
diff --git a/test/DebugInfo/cross-cu-linkonce-distinct.ll b/test/DebugInfo/cross-cu-linkonce-distinct.ll
new file mode 100644
index 0000000..67eb6c0
--- /dev/null
+++ b/test/DebugInfo/cross-cu-linkonce-distinct.ll
@@ -0,0 +1,95 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Testing that two distinct (distinct by writing them in separate files, while
+; still fulfilling C++'s ODR by having identical token sequences) functions,
+; linked under LTO, get plausible debug info (and don't crash).
+
+; Built from source:
+; $ clang++ a.cpp b.cpp -g -c -emit-llvm
+; $ llvm-link a.bc b.bc -o ab.bc
+
+; This change is intended to tickle a case where the subprogram MDNode
+; associated with the llvm::Function will differ from the subprogram
+; referenced by the DbgLocs in the function.
+
+; $ sed -ie "s/!12, !0/!0, !12/" ab.ll
+; $ cat a.cpp
+; inline int func(int i) {
+; return i * 2;
+; }
+; int (*x)(int) = &func;
+; $ cat b.cpp
+; inline int func(int i) {
+; return i * 2;
+; }
+; int (*y)(int) = &func;
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "func"
+; CHECK: DW_TAG_compile_unit
+
+; FIXME: Maybe we should drop the subprogram here, since the function was
+; emitted in one CU due to linkonce_odr uniquing. We certainly don't emit the
+; subprogram here if the source location for this definition is the same (see
+; test/DebugInfo/cross-cu-linkonce.ll). It is, however, very easy to tickle
+; that into failing even without duplicating the source as has been done in
+; this case: given two cpp files in different directories that include the
+; same header containing an inline function, clang will produce distinct
+; subprogram metadata that won't deduplicate, owing to the file location
+; information containing the directory of the source file even though the
+; file name is absolute, not relative.
+
+; CHECK: DW_TAG_subprogram
+
+@x = global i32 (i32)* @_Z4funci, align 8
+@y = global i32 (i32)* @_Z4funci, align 8
+
+; Function Attrs: inlinehint nounwind uwtable
+define linkonce_odr i32 @_Z4funci(i32 %i) #0 {
+ %1 = alloca i32, align 4
+ store i32 %i, i32* %1, align 4
+ call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !22), !dbg !23
+ %2 = load i32* %1, align 4, !dbg !24
+ %3 = mul nsw i32 %2, 2, !dbg !24
+ ret i32 %3, !dbg !24
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!12, !0}
+!llvm.module.flags = !{!19, !20}
+!llvm.ident = !{!21, !21}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !5, i32 4, metadata !11, i32 0, i32 1, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!12 = metadata !{i32 786449, metadata !13, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !14, metadata !17, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!13 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786478, metadata !13, metadata !16, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!16 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !16, i32 4, metadata !11, i32 0, i32 1, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
+!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{metadata !"clang version 3.5.0 "}
+!22 = metadata !{i32 786689, metadata !4, metadata !"i", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!23 = metadata !{i32 1, i32 0, metadata !4, null}
+!24 = metadata !{i32 2, i32 0, metadata !4, null}
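
Two test-suite conventions do the heavy lifting in the RUN line above: REQUIRES: object-emission skips configurations that can't produce object files at all, and %llc_dwarf is a lit substitution for llc that pins a DWARF-capable triple on hosts (notably Windows) whose default object format wouldn't carry DWARF. On a Linux host the expansion is essentially the explicit form below (triple assumed for illustration):

  ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj < %s \
  ; RUN:   | llvm-dwarfdump -debug-dump=info - | FileCheck %s
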
diff --git a/test/DebugInfo/dead-argument-order.ll b/test/DebugInfo/dead-argument-order.ll
new file mode 100644
index 0000000..ea805a4
--- /dev/null
+++ b/test/DebugInfo/dead-argument-order.ll
@@ -0,0 +1,81 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Built from the following source with clang -O1
+; struct S { int i; };
+; int function(struct S s, int i) { return s.i + i; }
+
+; Due to the X86_64 ABI, 's' is passed in registers and once optimized, the
+; entirety of 's' is never reconstituted, since only the int is required, and
+; thus the variable's location is unknown/dead to debug info.
+
+; Future/current work should enable us to describe a partial variable, which,
+; in this case, happens to cover the entire variable.
+
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "function"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "s"
+; CHECK-NOT: DW_TAG
+; FIXME: Even though 's' is never reconstituted into a struct, the one member
+; variable is still live and used, and so we should be able to describe 's's
+; location as the location of that int.
+; CHECK-NOT: DW_AT_location
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "i"
+
+
+%struct.S = type { i32 }
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @_Z8function1Si(i32 %s.coerce, i32 %i) #0 {
+entry:
+ tail call void @llvm.dbg.declare(metadata !19, metadata !14), !dbg !20
+ tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15), !dbg !20
+ %add = add nsw i32 %i, %s.coerce, !dbg !20
+ ret i32 %add, !dbg !20
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!16, !17}
+!llvm.ident = !{!18}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !8, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dead-argument-order.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"dead-argument-order.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"S", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1S"} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1S", metadata !"i", i32 1, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from int]
+!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"function", metadata !"function", metadata !"_Z8function1Si", i32 2, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32)* @_Z8function1Si, null, null, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [function]
+!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dead-argument-order.cpp]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !7, metadata !4, metadata !7}
+!13 = metadata !{metadata !14, metadata !15}
+!14 = metadata !{i32 786689, metadata !9, metadata !"s", metadata !10, i32 16777218, metadata !"_ZTS1S", i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 2]
+!15 = metadata !{i32 786689, metadata !9, metadata !"i", metadata !10, i32 33554434, metadata !7, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 2]
+!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{metadata !"clang version 3.5.0 "}
+!19 = metadata !{%struct.S* undef}
+!20 = metadata !{i32 2, i32 0, metadata !9, null}
+
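
The checks for 's' and 'i' above encode opposite assertions. Because consecutive CHECK-NOT directives all constrain the same gap before the next positive match, proving that a variable has no location reduces to the following shape (hypothetical variable name):

  ; CHECK: DW_TAG_formal_parameter
  ; CHECK-NOT: DW_TAG
  ; CHECK: DW_AT_name {{.*}} "dead"
  ; Nothing between here and the next DIE may be a location:
  ; CHECK-NOT: DW_AT_location
  ; CHECK-NOT: {{DW_TAG|NULL}}
  ; CHECK: DW_TAG_formal_parameter
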
diff --git a/test/DebugInfo/dwarf-public-names.ll b/test/DebugInfo/dwarf-public-names.ll
index ca0d721..7218964 100644
--- a/test/DebugInfo/dwarf-public-names.ll
+++ b/test/DebugInfo/dwarf-public-names.ll
@@ -40,12 +40,14 @@
; CHECK: version = 0x0002
; Check for each name in the output.
-; CHECK: global_namespace_variable
-; CHECK: global_namespace_function
-; CHECK: static_member_function
-; CHECK: global_variable
-; CHECK: global_function
-; CHECK: member_function
+; CHECK-DAG: "ns"
+; CHECK-DAG: "C::static_member_function"
+; CHECK-DAG: "global_variable"
+; CHECK-DAG: "ns::global_namespace_variable"
+; CHECK-DAG: "ns::global_namespace_function"
+; CHECK-DAG: "global_function"
+; CHECK-DAG: "C::static_member_variable"
+; CHECK-DAG: "C::member_function"
%struct.C = type { i8 }
@@ -109,7 +111,7 @@ attributes #1 = { nounwind readnone }
!18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
!19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
!20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{i32 786489, null, metadata !"ns", metadata !4, i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!21 = metadata !{i32 786489, metadata !4, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!23 = metadata !{null}
!24 = metadata !{metadata !25, metadata !26, metadata !27}
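
Switching to CHECK-DAG decouples the test from the emission order of the pubnames entries: a run of CHECK-DAG lines may match in any order, provided all of them land between the surrounding plain CHECK matches. A minimal sketch:

  ; CHECK: version = 0x0002
  ; CHECK-DAG: "global_variable"
  ; CHECK-DAG: "global_function"

The quoted, fully-qualified names also make the checks stricter than the old bare substring matches.
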
diff --git a/test/DebugInfo/global.ll b/test/DebugInfo/global.ll
index c515114..3c97f0c 100644
--- a/test/DebugInfo/global.ll
+++ b/test/DebugInfo/global.ll
@@ -3,6 +3,9 @@
; RUN: %llc_dwarf -O0 -filetype=obj < %s > %t
; RUN: llvm-dwarfdump %t | FileCheck %s
+; Also test that the null streamer doesn't crash with debug info.
+; RUN: %llc_dwarf -O0 -filetype=null < %s
+
; generated from the following source compiled to bitcode with clang -g -O1
; static int i;
; int main() {
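
The added RUN line is a crash-only check: -filetype=null drives the complete code generation pipeline into MC's null streamer, which discards all output, so the line passes as long as llc exits cleanly. The same trick works for any .ll input; a sketch:

  ; RUN: llc -O0 -filetype=null < %s
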
diff --git a/test/DebugInfo/incorrect-variable-debugloc.ll b/test/DebugInfo/incorrect-variable-debugloc.ll
new file mode 100644
index 0000000..284704c
--- /dev/null
+++ b/test/DebugInfo/incorrect-variable-debugloc.ll
@@ -0,0 +1,391 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -O2 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; This is a test case that's as reduced as I can get it, though I haven't fully
+; understood the mechanisms by which this bug occurs, so perhaps there's further
+; simplification to be had (it's certainly a bit non-obvious what's going on). I
+; hesitate to hand-craft or otherwise simplify the IR compared to what Clang
+; generates, as this is a particular tickling of optimizations and debug
+; location propagation that I want a realistic example of.
+
+; Generated with clang-tot -cc1 -g -O2 -w -std=c++11 -fsanitize=address,use-after-return -fcxx-exceptions -fexceptions -x c++ incorrect-variable-debug-loc.cpp -emit-llvm
+
+; struct A {
+; int m_fn1();
+; };
+;
+; struct B {
+; void __attribute__((always_inline)) m_fn2() { i = 0; }
+; int i;
+; };
+;
+; struct C {
+; void m_fn3();
+; int j;
+; B b;
+; };
+;
+; int fn1() {
+; C A;
+; A.b.m_fn2();
+; A.m_fn3();
+; }
+; void C::m_fn3() {
+; A().m_fn1();
+; b.m_fn2();
+; }
+
+; CHECK: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} "C"
+; CHECK: [[FN3_DECL:.*]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "m_fn3"
+
+; CHECK: DW_AT_specification {{.*}} {[[FN3_DECL]]}
+; CHECK-NOT: DW_TAG
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "this"
+
+%struct.C = type { i32, %struct.B }
+%struct.B = type { i32 }
+%struct.A = type { i8 }
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 1, void ()* @asan.module_ctor }]
+@__asan_option_detect_stack_use_after_return = external global i32
+@__asan_gen_ = private unnamed_addr constant [11 x i8] c"1 32 8 1 A\00", align 1
+@__asan_gen_1 = private unnamed_addr constant [13 x i8] c"1 32 1 3 tmp\00", align 1
+
+; Function Attrs: noreturn sanitize_address
+define i32 @_Z3fn1v() #0 {
+entry:
+ %MyAlloca = alloca [64 x i8], align 32, !dbg !39
+ %0 = ptrtoint [64 x i8]* %MyAlloca to i64, !dbg !39
+ %1 = load i32* @__asan_option_detect_stack_use_after_return, !dbg !39
+ %2 = icmp ne i32 %1, 0, !dbg !39
+ br i1 %2, label %3, label %5
+
+; <label>:3 ; preds = %entry
+ %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0), !dbg !39
+ br label %5
+
+; <label>:5 ; preds = %entry, %3
+ %6 = phi i64 [ %0, %entry ], [ %4, %3 ], !dbg !39
+ %7 = add i64 %6, 32, !dbg !39
+ %8 = inttoptr i64 %7 to %struct.C*, !dbg !39
+ %9 = inttoptr i64 %6 to i64*, !dbg !39
+ store i64 1102416563, i64* %9, !dbg !39
+ %10 = add i64 %6, 8, !dbg !39
+ %11 = inttoptr i64 %10 to i64*, !dbg !39
+ store i64 ptrtoint ([11 x i8]* @__asan_gen_ to i64), i64* %11, !dbg !39
+ %12 = add i64 %6, 16, !dbg !39
+ %13 = inttoptr i64 %12 to i64*, !dbg !39
+ store i64 ptrtoint (i32 ()* @_Z3fn1v to i64), i64* %13, !dbg !39
+ %14 = lshr i64 %6, 3, !dbg !39
+ %15 = add i64 %14, 2147450880, !dbg !39
+ %16 = add i64 %15, 0, !dbg !39
+ %17 = inttoptr i64 %16 to i64*, !dbg !39
+ store i64 -868083117767659023, i64* %17, !dbg !39
+ %i.i = getelementptr inbounds %struct.C* %8, i64 0, i32 1, i32 0, !dbg !39
+ %18 = ptrtoint i32* %i.i to i64, !dbg !39
+ %19 = lshr i64 %18, 3, !dbg !39
+ %20 = add i64 %19, 2147450880, !dbg !39
+ %21 = inttoptr i64 %20 to i8*, !dbg !39
+ %22 = load i8* %21, !dbg !39
+ %23 = icmp ne i8 %22, 0, !dbg !39
+ br i1 %23, label %24, label %30, !dbg !39
+
+; <label>:24 ; preds = %5
+ %25 = and i64 %18, 7, !dbg !39
+ %26 = add i64 %25, 3, !dbg !39
+ %27 = trunc i64 %26 to i8, !dbg !39
+ %28 = icmp sge i8 %27, %22, !dbg !39
+ br i1 %28, label %29, label %30
+
+; <label>:29 ; preds = %24
+ call void @__asan_report_store4(i64 %18), !dbg !39
+ call void asm sideeffect "", ""()
+ unreachable
+
+; <label>:30 ; preds = %24, %5
+ store i32 0, i32* %i.i, align 4, !dbg !39, !tbaa !41
+ tail call void @llvm.dbg.value(metadata !{%struct.C* %8}, i64 0, metadata !27), !dbg !46
+ call void @_ZN1C5m_fn3Ev(%struct.C* %8), !dbg !47
+ unreachable, !dbg !47
+}
+
+; Function Attrs: sanitize_address
+define void @_ZN1C5m_fn3Ev(%struct.C* nocapture %this) #1 align 2 {
+entry:
+ %MyAlloca = alloca [64 x i8], align 32, !dbg !48
+ %0 = ptrtoint [64 x i8]* %MyAlloca to i64, !dbg !48
+ %1 = load i32* @__asan_option_detect_stack_use_after_return, !dbg !48
+ %2 = icmp ne i32 %1, 0, !dbg !48
+ br i1 %2, label %3, label %5
+
+; <label>:3 ; preds = %entry
+ %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0), !dbg !48
+ br label %5
+
+; <label>:5 ; preds = %entry, %3
+ %6 = phi i64 [ %0, %entry ], [ %4, %3 ], !dbg !48
+ %7 = add i64 %6, 32, !dbg !48
+ %8 = inttoptr i64 %7 to %struct.A*, !dbg !48
+ %9 = inttoptr i64 %6 to i64*, !dbg !48
+ store i64 1102416563, i64* %9, !dbg !48
+ %10 = add i64 %6, 8, !dbg !48
+ %11 = inttoptr i64 %10 to i64*, !dbg !48
+ store i64 ptrtoint ([13 x i8]* @__asan_gen_1 to i64), i64* %11, !dbg !48
+ %12 = add i64 %6, 16, !dbg !48
+ %13 = inttoptr i64 %12 to i64*, !dbg !48
+ store i64 ptrtoint (void (%struct.C*)* @_ZN1C5m_fn3Ev to i64), i64* %13, !dbg !48
+ %14 = lshr i64 %6, 3, !dbg !48
+ %15 = add i64 %14, 2147450880, !dbg !48
+ %16 = add i64 %15, 0, !dbg !48
+ %17 = inttoptr i64 %16 to i64*, !dbg !48
+ store i64 -868083113472691727, i64* %17, !dbg !48
+ tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !30), !dbg !48
+ %call = call i32 @_ZN1A5m_fn1Ev(%struct.A* %8), !dbg !49
+ %i.i = getelementptr inbounds %struct.C* %this, i64 0, i32 1, i32 0, !dbg !50
+ %18 = ptrtoint i32* %i.i to i64, !dbg !50
+ %19 = lshr i64 %18, 3, !dbg !50
+ %20 = add i64 %19, 2147450880, !dbg !50
+ %21 = inttoptr i64 %20 to i8*, !dbg !50
+ %22 = load i8* %21, !dbg !50
+ %23 = icmp ne i8 %22, 0, !dbg !50
+ br i1 %23, label %24, label %30, !dbg !50
+
+; <label>:24 ; preds = %5
+ %25 = and i64 %18, 7, !dbg !50
+ %26 = add i64 %25, 3, !dbg !50
+ %27 = trunc i64 %26 to i8, !dbg !50
+ %28 = icmp sge i8 %27, %22, !dbg !50
+ br i1 %28, label %29, label %30
+
+; <label>:29 ; preds = %24
+ call void @__asan_report_store4(i64 %18), !dbg !50
+ call void asm sideeffect "", ""()
+ unreachable
+
+; <label>:30 ; preds = %24, %5
+ store i32 0, i32* %i.i, align 4, !dbg !50, !tbaa !41
+ store i64 1172321806, i64* %9, !dbg !52
+ %31 = icmp ne i64 %6, %0, !dbg !52
+ br i1 %31, label %32, label %39, !dbg !52
+
+; <label>:32 ; preds = %30
+ %33 = add i64 %15, 0, !dbg !52
+ %34 = inttoptr i64 %33 to i64*, !dbg !52
+ store i64 -723401728380766731, i64* %34, !dbg !52
+ %35 = add i64 %6, 56, !dbg !52
+ %36 = inttoptr i64 %35 to i64*, !dbg !52
+ %37 = load i64* %36, !dbg !52
+ %38 = inttoptr i64 %37 to i8*, !dbg !52
+ store i8 0, i8* %38, !dbg !52
+ br label %42, !dbg !52
+
+; <label>:39 ; preds = %30
+ %40 = add i64 %15, 0, !dbg !52
+ %41 = inttoptr i64 %40 to i64*, !dbg !52
+ store i64 0, i64* %41, !dbg !52
+ br label %42, !dbg !52
+
+; <label>:42 ; preds = %39, %32
+ ret void, !dbg !52
+}
+
+declare i32 @_ZN1A5m_fn1Ev(%struct.A*) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #3
+
+define internal void @asan.module_ctor() {
+ tail call void @__asan_init_v3()
+ ret void
+}
+
+declare void @__asan_init_v3()
+
+declare void @__asan_report_load1(i64)
+
+declare void @__asan_load1(i64)
+
+declare void @__asan_report_load2(i64)
+
+declare void @__asan_load2(i64)
+
+declare void @__asan_report_load4(i64)
+
+declare void @__asan_load4(i64)
+
+declare void @__asan_report_load8(i64)
+
+declare void @__asan_load8(i64)
+
+declare void @__asan_report_load16(i64)
+
+declare void @__asan_load16(i64)
+
+declare void @__asan_report_store1(i64)
+
+declare void @__asan_store1(i64)
+
+declare void @__asan_report_store2(i64)
+
+declare void @__asan_store2(i64)
+
+declare void @__asan_report_store4(i64)
+
+declare void @__asan_store4(i64)
+
+declare void @__asan_report_store8(i64)
+
+declare void @__asan_store8(i64)
+
+declare void @__asan_report_store16(i64)
+
+declare void @__asan_store16(i64)
+
+declare void @__asan_report_load_n(i64, i64)
+
+declare void @__asan_report_store_n(i64, i64)
+
+declare void @__asan_loadN(i64, i64)
+
+declare void @__asan_storeN(i64, i64)
+
+declare i8* @__asan_memmove(i8*, i8*, i64)
+
+declare i8* @__asan_memcpy(i8*, i8*, i64)
+
+declare i8* @__asan_memset(i8*, i32, i64)
+
+declare void @__asan_handle_no_return()
+
+declare void @__sanitizer_cov()
+
+declare void @__sanitizer_ptr_cmp(i64, i64)
+
+declare void @__sanitizer_ptr_sub(i64, i64)
+
+declare i64 @__asan_stack_malloc_0(i64, i64)
+
+declare void @__asan_stack_free_0(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_1(i64, i64)
+
+declare void @__asan_stack_free_1(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_2(i64, i64)
+
+declare void @__asan_stack_free_2(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_3(i64, i64)
+
+declare void @__asan_stack_free_3(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_4(i64, i64)
+
+declare void @__asan_stack_free_4(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_5(i64, i64)
+
+declare void @__asan_stack_free_5(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_6(i64, i64)
+
+declare void @__asan_stack_free_6(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_7(i64, i64)
+
+declare void @__asan_stack_free_7(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_8(i64, i64)
+
+declare void @__asan_stack_free_8(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_9(i64, i64)
+
+declare void @__asan_stack_free_9(i64, i64, i64)
+
+declare i64 @__asan_stack_malloc_10(i64, i64)
+
+declare void @__asan_stack_free_10(i64, i64, i64)
+
+declare void @__asan_poison_stack_memory(i64, i64)
+
+declare void @__asan_unpoison_stack_memory(i64, i64)
+
+declare void @__asan_before_dynamic_init(i64)
+
+declare void @__asan_after_dynamic_init()
+
+declare void @__asan_register_globals(i64, i64)
+
+declare void @__asan_unregister_globals(i64, i64)
+
+declare void @__sanitizer_cov_module_init(i64)
+
+attributes #0 = { noreturn sanitize_address "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { sanitize_address "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+!llvm.ident = !{!38}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !21, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !14}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"C", i32 10, i64 64, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 10, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"incorrect-variable-debug-loc.cpp", metadata !"/tmp/dbginfo"}
+!6 = metadata !{metadata !7, metadata !9, metadata !10}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1C", metadata !"j", i32 12, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [j] [line 12, size 32, align 32, offset 0] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1C", metadata !"b", i32 13, i64 32, i64 32, i64 32, i32 0, metadata !"_ZTS1B"} ; [ DW_TAG_member ] [b] [line 13, size 32, align 32, offset 32] [from _ZTS1B]
+!10 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"m_fn3", metadata !"m_fn3", metadata !"_ZN1C5m_fn3Ev", i32 11, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 11} ; [ DW_TAG_subprogram ] [line 11] [m_fn3]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!14 = metadata !{i32 786451, metadata !5, null, metadata !"B", i32 5, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !16, metadata !17}
+!16 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"i", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [i] [line 7, size 32, align 32, offset 0] [from int]
+!17 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1B", metadata !"m_fn2", metadata !"m_fn2", metadata !"_ZN1B5m_fn2Ev", i32 6, metadata !18, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [m_fn2]
+!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{null, metadata !20}
+!20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!21 = metadata !{metadata !22, metadata !28, metadata !32}
+!22 = metadata !{i32 786478, metadata !5, metadata !23, metadata !"fn1", metadata !"fn1", metadata !"_Z3fn1v", i32 16, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z3fn1v, null, null, metadata !26, i32 16} ; [ DW_TAG_subprogram ] [line 16] [def] [fn1]
+!23 = metadata !{i32 786473, metadata !5} ; [ DW_TAG_file_type ] [/tmp/dbginfo/incorrect-variable-debug-loc.cpp]
+!24 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = metadata !{metadata !8}
+!26 = metadata !{metadata !27}
+!27 = metadata !{i32 786688, metadata !22, metadata !"A", metadata !23, i32 17, metadata !"_ZTS1C", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [A] [line 17]
+!28 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"m_fn3", metadata !"m_fn3", metadata !"_ZN1C5m_fn3Ev", i32 21, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.C*)* @_ZN1C5m_fn3Ev, null, metadata !10, metadata !29, i32 21} ; [ DW_TAG_subprogram ] [line 21] [def] [m_fn3]
+!29 = metadata !{metadata !30}
+!30 = metadata !{i32 786689, metadata !28, metadata !"this", null, i32 16777216, metadata !31, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!32 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1B", metadata !"m_fn2", metadata !"m_fn2", metadata !"_ZN1B5m_fn2Ev", i32 6, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !33, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [m_fn2]
+!33 = metadata !{metadata !34}
+!34 = metadata !{i32 786689, metadata !32, metadata !"this", null, i32 16777216, metadata !35, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{metadata !"clang version 3.5.0 "}
+!39 = metadata !{i32 6, i32 0, metadata !32, metadata !40}
+!40 = metadata !{i32 18, i32 0, metadata !22, null}
+!41 = metadata !{metadata !42, metadata !43, i64 0}
+!42 = metadata !{metadata !"_ZTS1B", metadata !43, i64 0}
+!43 = metadata !{metadata !"int", metadata !44, i64 0}
+!44 = metadata !{metadata !"omnipotent char", metadata !45, i64 0}
+!45 = metadata !{metadata !"Simple C/C++ TBAA"}
+!46 = metadata !{i32 17, i32 0, metadata !22, null}
+!47 = metadata !{i32 19, i32 0, metadata !22, null}
+!48 = metadata !{i32 0, i32 0, metadata !28, null}
+!49 = metadata !{i32 22, i32 0, metadata !28, null}
+!50 = metadata !{i32 6, i32 0, metadata !32, metadata !51}
+!51 = metadata !{i32 23, i32 0, metadata !28, null}
+!52 = metadata !{i32 24, i32 0, metadata !28, null}
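
Most of the bulk above is AddressSanitizer instrumentation rather than the debug-info point of the test. The recurring lshr/add pairs compute shadow addresses with the standard x86-64 mapping, shadow = (addr >> 3) + 0x7fff8000 (2147450880 above is 0x7fff8000), and the stores of 1102416563 (0x41b58ab3) write ASan's current-frame magic into the fake stack frame. Isolated, with comments added:

  %14 = lshr i64 %6, 3          ; addr >> 3
  %15 = add i64 %14, 2147450880 ; + 0x7fff8000 = shadow byte address
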
diff --git a/test/DebugInfo/inline-no-debug-info.ll b/test/DebugInfo/inline-no-debug-info.ll
new file mode 100644
index 0000000..2257b89
--- /dev/null
+++ b/test/DebugInfo/inline-no-debug-info.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; This was generated from the following source:
+; int a, b;
+; __attribute__((__always_inline__)) static void callee2() { b = 2; }
+; __attribute__((__nodebug__)) void callee() { a = 1; callee2(); }
+; void caller() { callee(); }
+; by running
+; clang -S test.c -emit-llvm -O1 -gline-tables-only -fno-strict-aliasing
+
+; CHECK-LABEL: @caller(
+
+; This instruction did not have !dbg metadata in the callee.
+; CHECK: store i32 1, {{.*}}, !dbg [[A:!.*]]
+
+; This instruction came from the callee with !dbg metadata attached.
+; CHECK: store i32 2, {{.*}}, !dbg [[B:!.*]]
+
+; The remaining instruction from the caller.
+; CHECK: ret void, !dbg [[A]]
+
+; Debug location of the code in caller() and of the inlined code that did not
+; have any debug location before.
+; CHECK-DAG: [[A]] = metadata !{i32 4, i32 0, metadata !{{[01-9]+}}, null}
+
+; Debug location of the inlined code.
+; CHECK-DAG: [[B]] = metadata !{i32 2, i32 0, metadata !{{[01-9]+}}, metadata [[A]]}
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define void @callee() #0 {
+entry:
+ store i32 1, i32* @a, align 4
+ store i32 2, i32* @b, align 4, !dbg !11
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @caller() #0 {
+entry:
+ tail call void @callee(), !dbg !12
+ ret void, !dbg !12
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (210174)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"/code/llvm/build0"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !7}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"caller", metadata !"caller", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @caller, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [caller]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/code/llvm/build0/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"callee2", metadata !"callee2", metadata !"", i32 2, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [callee2]
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{metadata !"clang version 3.5.0 (210174)"}
+!11 = metadata !{i32 2, i32 0, metadata !7, null}
+!12 = metadata !{i32 4, i32 0, metadata !4, null}
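
The two locations the test captures differ only in their fourth operand, the inlinedAt link: a !dbg node of the form {line, column, scope, inlinedAt} with a non-null fourth field marks an instruction as inlined from that call site. After inlining, the callee's store carries metadata shaped roughly like this (node numbering hypothetical for the output module):

  !11 = metadata !{i32 2, i32 0, metadata !7, metadata !12} ; line 2 in callee2, inlined at !12
  !12 = metadata !{i32 4, i32 0, metadata !4, null}         ; the call site, line 4 in caller
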
diff --git a/test/DebugInfo/inlined-arguments.ll b/test/DebugInfo/inlined-arguments.ll
index ebc81a6..6979862 100644
--- a/test/DebugInfo/inlined-arguments.ll
+++ b/test/DebugInfo/inlined-arguments.ll
@@ -16,9 +16,11 @@
; CHECK: DW_AT_name{{.*}}"f1"
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name{{.*}}"x"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"x"
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_name{{.*}}"y"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}"y"
; Function Attrs: uwtable
define void @_Z2f2v() #0 {
diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index 6aa1287..20d3dda 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test
@@ -17,6 +17,8 @@ RUN: echo "%p/Inputs/macho-universal 0x1f84" >> %t.input
RUN: echo "%p/Inputs/macho-universal:i386 0x1f67" >> %t.input
RUN: echo "%p/Inputs/macho-universal:x86_64 0x100000f05" >> %t.input
RUN: echo "%p/Inputs/llvm-symbolizer-dwo-test 0x400514" >> %t.input
+RUN: echo "%p/Inputs/fission-ranges.elf-x86_64 0x720" >> %t.input
+RUN: echo "%p/Inputs/arange-overlap.elf-x86_64 0x714" >> %t.input
RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
RUN: --default-arch=i386 < %t.input | FileCheck %s
@@ -90,6 +92,12 @@ CHECK: _Z3inci
CHECK: main
CHECK-NEXT: llvm-symbolizer-dwo-test.cc:11
+CHECK: main
+CHECK-NEXT: {{.*}}fission-ranges.cc:6
+
+CHECK: _ZN1S3bazEv
+CHECK-NEXT: {{.*}}arange-overlap.cc:6
+
RUN: echo "unexisting-file 0x1234" > %t.input2
RUN: llvm-symbolizer < %t.input2
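
Each input line hands llvm-symbolizer a binary path and an address; the tool replies with the function (linkage names, given --functions=linkage) and file:line, emitting extra frames when --inlining is on. Adding a case is therefore a two-part edit, sketched here with hypothetical names:

  RUN: echo "%p/Inputs/example.elf-x86_64 0x400500" >> %t.input

  CHECK: _Z7examplev
  CHECK-NEXT: {{.*}}example.cc:42
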
diff --git a/test/DebugInfo/missing-abstract-variable.ll b/test/DebugInfo/missing-abstract-variable.ll
new file mode 100644
index 0000000..59a38cf
--- /dev/null
+++ b/test/DebugInfo/missing-abstract-variable.ll
@@ -0,0 +1,191 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; The formal parameter 'b' for function 'x', when inlined within 'a', is lost
+; on mips and powerpc64 (and on x86_64 at -O2 and above). Presumably this is a
+; SelectionDAG issue (do mips/powerpc64 use FastISel?).
+; XFAIL: mips, powerpc64, s390x
+
+; Build from the following source with clang -O2.
+
+; The important details are that 'x's abstract definition is first built during
+; the definition of 'b', where the parameter to 'x' is constant and so 'x's 's'
+; variable is optimized away. No abstract definition DIE for 's' is constructed.
+; Then, during 'a' emission, the abstract DbgVariable for 's' is created, but
+; the abstract DIE isn't (since 'x's abstract definition was already built,
+; while emitting 'b'). So 's' inlined in 'a' is emitted with its name, line,
+; and file repeated there, rather than referencing an abstract definition.
+
+; extern int t;
+;
+; void f(int);
+;
+; inline void x(bool b) {
+; if (b) {
+; int s = t;
+; f(s);
+; }
+; f(0);
+; }
+;
+; void b() {
+; x(false);
+; }
+;
+; void a(bool u) {
+; x(u);
+; }
+
+; CHECK: [[ABS_X:.*]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "x"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[ABS_B:.*]]: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "b"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_lexical_block
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_lexical_block
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[ABS_S:.*]]: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "s"
+
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "b"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[ABS_X]]}
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[ABS_B]]}
+; Notice that 'x's local variable 's' is missing. This is not necessarily a bug,
+; since 's' has been optimized entirely away and should be described in the
+; abstract subprogram.
+; CHECK-NOT: DW_TAG
+; CHECK: NULL
+; CHECK-NOT: DW_TAG
+; CHECK: NULL
+
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "a"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[ABS_X]]}
+; CHECK-NOT: {{DW_TAG|NULL}}
+; FIXME: This formal parameter goes missing at least at -O2 (and on
+; mips/powerpc), maybe at lower optimization levels as well. Perhaps
+; SelectionDAG is to blame (while FastISel succeeds).
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[ABS_B]]}
+
+; The two lexical blocks here are caused by the scope of the 'if' that includes
+; the condition variable, and the scope within the if's compound statement. I'm
+; not sure we really need both of them, since no variable is declared in the
+; outer of the two.
+
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_lexical_block
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_lexical_block
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_abstract_origin {{.*}} {[[ABS_S]]}
+
+@t = external global i32
+
+; Function Attrs: uwtable
+define void @_Z1bv() #0 {
+entry:
+ tail call void @llvm.dbg.value(metadata !24, i64 0, metadata !25), !dbg !27
+ tail call void @_Z1fi(i32 0), !dbg !28
+ ret void, !dbg !29
+}
+
+; Function Attrs: uwtable
+define void @_Z1ab(i1 zeroext %u) #0 {
+entry:
+ tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !13), !dbg !30
+ tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !31), !dbg !33
+ br i1 %u, label %if.then.i, label %_Z1xb.exit, !dbg !34
+
+if.then.i: ; preds = %entry
+ %0 = load i32* @t, align 4, !dbg !35, !tbaa !36
+ tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !40), !dbg !35
+ tail call void @_Z1fi(i32 %0), !dbg !41
+ br label %_Z1xb.exit, !dbg !42
+
+_Z1xb.exit: ; preds = %entry, %if.then.i
+ tail call void @_Z1fi(i32 0), !dbg !43
+ ret void, !dbg !44
+}
+
+declare void @_Z1fi(i32) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/missing-abstract-variables.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"missing-abstract-variables.cc", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !8, metadata !14}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"b", metadata !"b", metadata !"_Z1bv", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z1bv, null, null, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"a", metadata !"a", metadata !"_Z1ab", i32 17, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i1)* @_Z1ab, null, null, metadata !12, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [a]
+!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !11}
+!11 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786689, metadata !8, metadata !"u", metadata !5, i32 16777233, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [u] [line 17]
+!14 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x", metadata !"x", metadata !"_Z1xb", i32 5, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !15, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [x]
+!15 = metadata !{metadata !16, metadata !17}
+!16 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!17 = metadata !{i32 786688, metadata !18, metadata !"s", metadata !5, i32 7, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!18 = metadata !{i32 786443, metadata !1, metadata !19, i32 6, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!19 = metadata !{i32 786443, metadata !1, metadata !14, i32 6, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!20 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!23 = metadata !{metadata !"clang version 3.5.0 "}
+!24 = metadata !{i1 false}
+!25 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, metadata !26} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!26 = metadata !{i32 14, i32 0, metadata !4, null}
+!27 = metadata !{i32 5, i32 0, metadata !14, metadata !26}
+!28 = metadata !{i32 10, i32 0, metadata !14, metadata !26}
+!29 = metadata !{i32 15, i32 0, metadata !4, null}
+!30 = metadata !{i32 17, i32 0, metadata !8, null}
+!31 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, metadata !32} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!32 = metadata !{i32 18, i32 0, metadata !8, null}
+!33 = metadata !{i32 5, i32 0, metadata !14, metadata !32}
+!34 = metadata !{i32 6, i32 0, metadata !19, metadata !32}
+!35 = metadata !{i32 7, i32 0, metadata !18, metadata !32}
+!36 = metadata !{metadata !37, metadata !37, i64 0}
+!37 = metadata !{metadata !"int", metadata !38, i64 0}
+!38 = metadata !{metadata !"omnipotent char", metadata !39, i64 0}
+!39 = metadata !{metadata !"Simple C/C++ TBAA"}
+!40 = metadata !{i32 786688, metadata !18, metadata !"s", metadata !5, i32 7, metadata !20, i32 0, metadata !32} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!41 = metadata !{i32 8, i32 0, metadata !18, metadata !32} ; [ DW_TAG_imported_declaration ]
+!42 = metadata !{i32 9, i32 0, metadata !18, metadata !32}
+!43 = metadata !{i32 10, i32 0, metadata !14, metadata !32}
+!44 = metadata !{i32 19, i32 0, metadata !8, null}
diff --git a/test/DebugInfo/nodebug.ll b/test/DebugInfo/nodebug.ll
new file mode 100644
index 0000000..4d86b24
--- /dev/null
+++ b/test/DebugInfo/nodebug.ll
@@ -0,0 +1,51 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Test that a nodebug function (a function not appearing in the debug info IR
+; metadata subprogram list) with DebugLocs on its IR doesn't cause crashes and
+; does the right thing.
+
+; Build with clang from the following:
+; extern int i;
+; inline __attribute__((always_inline)) void f1() {
+; i = 3;
+; }
+;
+; __attribute__((nodebug)) void f2() {
+; f1();
+; }
+
+; Check that there's only one DW_TAG_subprogram, nothing for the 'f2' function.
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "f1"
+; CHECK-NOT: DW_TAG_subprogram
+
+@i = external global i32
+
+; Function Attrs: uwtable
+define void @_Z2f2v() #0 {
+entry:
+ store i32 3, i32* @i, align 4, !dbg !11
+ ret void
+}
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/nodebug.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"nodebug.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"_Z2f1v", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/nodebug.cpp]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{metadata !"clang version 3.5.0 "}
+!11 = metadata !{i32 3, i32 0, metadata !4, null}
diff --git a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
index 7c0227d..539c890 100644
--- a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
+++ b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
@@ -1,5 +1,5 @@
; RUN: %lli_mcjit -relocation-model=pic -code-model=large %s
-; XFAIL: cygwin, win32, mingw, mips, powerpc64, i686, i386, aarch64, arm
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, aarch64, arm
declare i8* @__cxa_allocate_exception(i64)
declare void @__cxa_throw(i8*, i8*, i8*)
declare i32 @__gxx_personality_v0(...)
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
index 5dc749d..f981403 100644
--- a/test/ExecutionEngine/MCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -1,5 +1,5 @@
root = config.root
-targets = set(root.targets_to_build.split())
+targets = root.targets
if ('X86' in targets) | ('AArch64' in targets) | ('ARM' in targets) | \
('Mips' in targets) | ('PowerPC' in targets) | ('SystemZ' in targets):
config.unsupported = False
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
new file mode 100644
index 0000000..e87b449
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
@@ -0,0 +1,32 @@
+# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -relocation-model=pic -filetype=obj -o %t.o %s
+# RUN: llvm-rtdyld -triple=x86_64-apple-macosx10.9 -verify -check=%s %t.o
+# RUN: rm %t.o
+
+ .section __TEXT,__text,regular,pure_instructions
+ .globl foo
+ .align 4, 0x90
+foo:
+ retq
+
+ .globl main
+ .align 4, 0x90
+main:
+# Test PC-rel branch.
+# rtdyld-check: decode_operand(insn1, 0) = foo - next_pc(insn1)
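+# (decode_operand(insn1, 0) decodes operand 0 of the instruction at label
+# insn1, and next_pc(insn1) is the address just past that instruction, so this
+# verifies the relocated PC-relative call displacement reaches foo.)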
+insn1:
+ callq foo
+
+# Test PC-rel signed.
+# rtdyld-check: decode_operand(insn2, 4) = x - next_pc(insn2)
+insn2:
+ movl x(%rip), %eax
+ movl $0, %eax
+ retq
+
+ .section __DATA,__data
+ .globl x
+ .align 2
+x:
+ .long 5
+
+.subsections_via_symbols
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/lit.local.cfg b/test/ExecutionEngine/RuntimeDyld/X86/lit.local.cfg
new file mode 100644
index 0000000..e71f3cc
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg
index 7f0b69e..f6673df 100644
--- a/test/ExecutionEngine/lit.local.cfg
+++ b/test/ExecutionEngine/lit.local.cfg
@@ -1,7 +1,10 @@
-if config.root.host_arch in ['PowerPC', 'AArch64', 'ARM64', 'SystemZ']:
+if config.root.host_arch in ['PowerPC', 'AArch64', 'SystemZ']:
config.unsupported = True
# CMake and autoconf diverge in naming of host_arch
+if 'powerpc64' in config.root.target_triple:
+ config.unsupported = True
+
if 'aarch64' in config.root.target_triple \
or 'arm64' in config.root.target_triple:
config.unsupported = True
diff --git a/test/Feature/alias2.ll b/test/Feature/alias2.ll
index 693ef7c..73c874f 100644
--- a/test/Feature/alias2.ll
+++ b/test/Feature/alias2.ll
@@ -6,14 +6,23 @@
@v2 = global [1 x i32] zeroinitializer
; CHECK: @v2 = global [1 x i32] zeroinitializer
-@v3 = alias i16, i32* @v1
-; CHECK: @v3 = alias i16, i32* @v1
+@v3 = global [2 x i16] zeroinitializer
+; CHECK: @v3 = global [2 x i16] zeroinitializer
-@v4 = alias i32, [1 x i32]* @v2
-; CHECK: @v4 = alias i32, [1 x i32]* @v2
+@a1 = alias bitcast (i32* @v1 to i16*)
+; CHECK: @a1 = alias bitcast (i32* @v1 to i16*)
-@v5 = alias addrspace(2) i32, i32* @v1
-; CHECK: @v5 = alias addrspace(2) i32, i32* @v1
+@a2 = alias bitcast([1 x i32]* @v2 to i32*)
+; CHECK: @a2 = alias getelementptr inbounds ([1 x i32]* @v2, i32 0, i32 0)
-@v6 = alias i16, i32* @v1
-; CHECK: @v6 = alias i16, i32* @v1
+@a3 = alias addrspacecast (i32* @v1 to i32 addrspace(2)*)
+; CHECK: @a3 = alias addrspacecast (i32* @v1 to i32 addrspace(2)*)
+
+@a4 = alias bitcast (i32* @v1 to i16*)
+; CHECK: @a4 = alias bitcast (i32* @v1 to i16*)
+
+@a5 = thread_local(localdynamic) alias i32* @v1
+; CHECK: @a5 = thread_local(localdynamic) alias i32* @v1
+
+@a6 = alias getelementptr ([2 x i16]* @v3, i32 1, i32 1)
+; CHECK: @a6 = alias getelementptr ([2 x i16]* @v3, i32 1, i32 1)
diff --git a/test/Feature/aliases.ll b/test/Feature/aliases.ll
index b2ce82a..ad1d1b0 100644
--- a/test/Feature/aliases.ll
+++ b/test/Feature/aliases.ll
@@ -7,6 +7,14 @@
@bar = global i32 0
@foo1 = alias i32* @bar
@foo2 = alias i32* @bar
+@foo3 = alias i32* @foo2
+@foo4 = unnamed_addr alias i32* @foo2
+
+; Make sure the verifier does not complain about references to a global
+; declaration from an initializer.
+@decl = external global i32
+@ptr = global i32* @decl
+@ptr_a = alias i32** @ptr
%FunTy = type i32()
@@ -14,10 +22,11 @@ define i32 @foo_f() {
ret i32 0
}
@bar_f = alias weak_odr %FunTy* @foo_f
+@bar_ff = alias i32()* @bar_f
@bar_i = alias internal i32* @bar
-@A = alias i64, i32* @bar
+@A = alias bitcast (i32* @bar to i64*)
define i32 @test() {
entry:
diff --git a/test/Feature/comdat.ll b/test/Feature/comdat.ll
new file mode 100644
index 0000000..05fb87c
--- /dev/null
+++ b/test/Feature/comdat.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+$f = comdat any
+; CHECK: $f = comdat any
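+; ('any' is the comdat selection kind: the linker may choose any copy of the
+; group's members.)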
+
+$f2 = comdat any
+; CHECK-NOT: f2
+
+@v = global i32 0, comdat $f
+; CHECK: @v = global i32 0, comdat $f
+
+@a = alias i32* @v
+; CHECK: @a = alias i32* @v{{$}}
+
+define void @f() comdat $f {
+ ret void
+}
+; CHECK: define void @f() comdat $f
diff --git a/test/Feature/globalvars.ll b/test/Feature/globalvars.ll
index dad1cf3..84b4bdf 100644
--- a/test/Feature/globalvars.ll
+++ b/test/Feature/globalvars.ll
@@ -16,3 +16,5 @@ define i32 @foo(i32 %blah) {
ret i32 %blah
}
+hidden dllexport global i32 42
+dllexport global i32 42
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_attr.ll b/test/Instrumentation/AddressSanitizer/X86/asm_attr.ll
index b83a7e9..0667a14 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_attr.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_attr.ll
@@ -4,16 +4,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: mov_no_attr
-; CHECK-NOT: callq __sanitizer_sanitize_load8@PLT
-; CHECK-NOT: callq __sanitizer_sanitize_store8@PLT
+; CHECK-NOT: callq __asan_report_load@PLT
+; CHECK-NOT: callq __asan_report_store@PLT
define void @mov_no_attr(i64* %dst, i64* %src) {
tail call void asm sideeffect "movq ($1), %rax \0A\09movq %rax, ($0) \0A\09", "r,r,~{memory},~{rax},~{dirflag},~{fpsr},~{flags}"(i64* %dst, i64* %src)
ret void
}
; CHECK-LABEL: mov_sanitize
-; CHECK: callq __sanitizer_sanitize_load8@PLT
-; CHECK: callq __sanitizer_sanitize_store8@PLT
+; CHECK: callq __asan_report_load8@PLT
+; CHECK: callq __asan_report_store8@PLT
define void @mov_sanitize(i64* %dst, i64* %src) sanitize_address {
tail call void asm sideeffect "movq ($1), %rax \0A\09movq %rax, ($0) \0A\09", "r,r,~{memory},~{rax},~{dirflag},~{fpsr},~{flags}"(i64* %dst, i64* %src)
ret void
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
index 030af7e..ad5e02e 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
@@ -5,18 +5,35 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: mov1b
; CHECK: leaq -128(%rsp), %rsp
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %rcx
; CHECK-NEXT: pushq %rdi
+; CHECK-NEXT: pushfq
; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_load1@PLT
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq $3, %rax
+; CHECK-NEXT: movb 2147450880(%rax), %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je [[A:.*]]
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andl $7, %ecx
+; CHECK-NEXT: movsbl %al, %eax
+; CHECK-NEXT: cmpl %eax, %ecx
+; CHECK-NEXT: jl {{.*}}
+; CHECK-NEXT: cld
+; CHECK-NEXT: emms
+; CHECK-NEXT: andq $-16, %rsp
+; CHECK-NEXT: callq __asan_report_load1@PLT
+; CHECK-NEXT: [[A]]:
+; CHECK-NEXT: popfq
; CHECK-NEXT: popq %rdi
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: leaq 128(%rsp), %rsp
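+; (2147450880 is 0x7fff8000, the x86_64 ASan shadow offset: the shadow byte
+; for address A lives at (A >> 3) + 0x7fff8000, and for a 1-byte access the
+; low three bits of A are compared against the shadow value before reporting.)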
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_store1@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: callq __asan_report_store1@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: movb {{.*}}, {{.*}}
define void @mov1b(i8* %dst, i8* %src) #0 {
@@ -27,18 +44,14 @@ entry:
; CHECK-LABEL: mov2b
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_load2@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: leal 1(%ecx), %ecx
+; CHECK: callq __asan_report_load2@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_store2@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: leal 1(%ecx), %ecx
+; CHECK: callq __asan_report_store2@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: movw {{.*}}, {{.*}}
define void @mov2b(i16* %dst, i16* %src) #0 {
@@ -49,18 +62,14 @@ entry:
; CHECK-LABEL: mov4b
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_load4@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: addl $3, %ecx
+; CHECK: callq __asan_report_load4@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_store4@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: addl $3, %ecx
+; CHECK: callq __asan_report_store4@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: movl {{.*}}, {{.*}}
define void @mov4b(i32* %dst, i32* %src) #0 {
@@ -71,17 +80,35 @@ entry:
; CHECK-LABEL: mov8b
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_load8@PLT
-; CHECK-NEXT: popq %rdi
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushfq
+; CHECK-NEXT: leaq {{.*}}, %rax
+; CHECK-NEXT: shrq $3, %rax
+; CHECK-NEXT: cmpb $0, 2147450880(%rax)
+; CHECK-NEXT: je [[A:.*]]
+; CHECK-NEXT: cld
+; CHECK-NEXT: emms
+; CHECK-NEXT: andq $-16, %rsp
+; CHECK-NEXT: callq __asan_report_load8@PLT
+; CHECK-NEXT: [[A]]:
+; CHECK-NEXT: popfq
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: leaq 128(%rsp), %rsp
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_store8@PLT
-; CHECK-NEXT: popq %rdi
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushfq
+; CHECK-NEXT: leaq {{.*}}, %rax
+; CHECK-NEXT: shrq $3, %rax
+; CHECK-NEXT: cmpb $0, 2147450880(%rax)
+; CHECK-NEXT: je [[A:.*]]
+; CHECK-NEXT: cld
+; CHECK-NEXT: emms
+; CHECK-NEXT: andq $-16, %rsp
+; CHECK-NEXT: callq __asan_report_store8@PLT
+; CHECK-NEXT: [[A]]:
+; CHECK-NEXT: popfq
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: leaq 128(%rsp), %rsp
; CHECK: movq {{.*}}, {{.*}}
@@ -93,18 +120,14 @@ entry:
; CHECK-LABEL: mov16b
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_load16@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: cmpw $0, 2147450880(%rax)
+; CHECK: callq __asan_report_load16@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: leaq -128(%rsp), %rsp
-; CHECK-NEXT: pushq %rdi
-; CHECK-NEXT: leaq {{.*}}, %rdi
-; CHECK-NEXT: callq __sanitizer_sanitize_store16@PLT
-; CHECK-NEXT: popq %rdi
-; CHECK-NEXT: leaq 128(%rsp), %rsp
+; CHECK: cmpw $0, 2147450880(%rax)
+; CHECK: callq __asan_report_store16@PLT
+; CHECK: leaq 128(%rsp), %rsp
; CHECK: movaps {{.*}}, {{.*}}
define void @mov16b(<2 x i64>* %dst, <2 x i64>* %src) #0 {
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov.s b/test/Instrumentation/AddressSanitizer/X86/asm_mov.s
index df217c0..74a788c 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov.s
@@ -7,20 +7,14 @@
# CHECK-LABEL: mov1b:
#
# CHECK: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rsi), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_load1@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_load1@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movb (%rsi), %al
#
# CHECK-NEXT: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rdi), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_store1@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_store1@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movb %al, (%rdi)
mov1b: # @mov1b
@@ -42,20 +36,14 @@ mov1b: # @mov1b
# CHECK-LABEL: mov16b:
#
# CHECK: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rsi), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_load16@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_load16@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movaps (%rsi), %xmm0
#
# CHECK-NEXT: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rdi), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_store16@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_store16@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movaps %xmm0, (%rdi)
mov16b: # @mov16b
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s b/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s
index cc05527..e3a1541 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s
@@ -5,8 +5,8 @@
.align 16, 0x90
.type mov1b,@function
# CHECK-LABEL: mov1b
-# CHECK-NOT: callq __sanitizer_sanitize_load1@PLT
-# CHECK-NOT: callq __sanitizer_sanitize_store1@PLT
+# CHECK-NOT: callq __asan_report_load1@PLT
+# CHECK-NOT: callq __asan_report_store1@PLT
mov1b: # @mov1b
.cfi_startproc
# BB#0:
diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s b/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s
index 8a6a8d5..ca3c54c 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s
@@ -7,38 +7,26 @@
# CHECK-LABEL: swap:
#
# CHECK: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rcx), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_load8@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_load8@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movq (%rcx), %rax
#
# CHECK-NEXT: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rdx), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_load8@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_load8@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movq (%rdx), %rbx
#
-# CHECK: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rcx), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_store8@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: callq __asan_report_store8@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movq %rbx, (%rcx)
#
# CHECK-NEXT: leaq -128(%rsp), %rsp
-# CHECK-NEXT: pushq %rdi
-# CHECK-NEXT: leaq (%rdx), %rdi
-# CHECK-NEXT: callq __sanitizer_sanitize_store8@PLT
-# CHECK-NEXT: popq %rdi
-# CHECK-NEXT: leaq 128(%rsp), %rsp
+# CHECK: callq __asan_report_store8@PLT
+# CHECK: leaq 128(%rsp), %rsp
#
# CHECK-NEXT: movq %rax, (%rdx)
swap: # @swap
diff --git a/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg b/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
+++ b/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Instrumentation/AddressSanitizer/coverage-dbg.ll b/test/Instrumentation/AddressSanitizer/coverage-dbg.ll
index 77d7286..3f7998d 100644
--- a/test/Instrumentation/AddressSanitizer/coverage-dbg.ll
+++ b/test/Instrumentation/AddressSanitizer/coverage-dbg.ll
@@ -2,32 +2,66 @@
; RUN: opt < %s -asan -asan-module -asan-coverage=1 -S | FileCheck %s
+; C++ source:
+; 1: struct A {
+; 2: int f();
+; 3: int x;
+; 4: };
+; 5:
+; 6: int A::f() {
+; 7: return x;
+; 8: }
+; clang++ ../1.cc -O3 -g -S -emit-llvm -fno-strict-aliasing
+; and add sanitize_address to @_ZN1A1fEv
+
+; Test that the __sanitizer_cov call has !dbg pointing to the opening '{' of A::f().
+; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
+; CHECK: [[A]] = metadata !{i32 6, i32 0, metadata !{{.*}}, null}
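+; (The coverage call is inserted at function entry, so its !dbg location is the
+; function's scope line - line 6, column 0 - not any user statement.)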
+
+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; Function Attrs: nounwind readnone uwtable
-define void @_Z1fv() #0 {
+%struct.A = type { i32 }
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @_ZN1A1fEv(%struct.A* nocapture readonly %this) #0 align 2 {
entry:
- ret void, !dbg !11
+ tail call void @llvm.dbg.value(metadata !{%struct.A* %this}, i64 0, metadata !15), !dbg !20
+ %x = getelementptr inbounds %struct.A* %this, i64 0, i32 0, !dbg !21
+ %0 = load i32* %x, align 4, !dbg !21
+ ret i32 %0, !dbg !21
}
-; CHECK: call void @__sanitizer_cov(), !dbg !
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
-attributes #0 = { sanitize_address nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { sanitize_address nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8, !9}
-!llvm.ident = !{!10}
+!llvm.module.flags = !{!17, !18}
+!llvm.ident = !{!19}
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (208682)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp//tmp/1.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"/tmp/1.cc", metadata !"/tmp"}
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (210251)", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/../1.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"../1.cc", metadata !"/code/llvm/build0"}
!2 = metadata !{}
!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"_Z1fv", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z1fv, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp//tmp/1.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!10 = metadata !{metadata !"clang version 3.5.0 (208682)"}
-!11 = metadata !{i32 2, i32 0, metadata !4, null}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !8}
+!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1A", metadata !"x", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [x] [line 3, size 32, align 32, offset 0] [from int]
+!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"f", metadata !"f", metadata !"_ZN1A1fEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [f]
+!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !7, metadata !11}
+!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"f", metadata !"f", metadata !"_ZN1A1fEv", i32 6, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.A*)* @_ZN1A1fEv, null, metadata !8, metadata !14, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786689, metadata !13, metadata !"this", null, i32 16777216, metadata !16, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!18 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{metadata !"clang version 3.5.0 (210251)"}
+!20 = metadata !{i32 0, i32 0, metadata !13, null}
+!21 = metadata !{i32 7, i32 0, metadata !13, null}
diff --git a/test/Instrumentation/AddressSanitizer/coverage.ll b/test/Instrumentation/AddressSanitizer/coverage.ll
index 5bc5103..79bb5c1 100644
--- a/test/Instrumentation/AddressSanitizer/coverage.ll
+++ b/test/Instrumentation/AddressSanitizer/coverage.ll
@@ -1,7 +1,20 @@
+; RUN: opt < %s -asan -asan-module -asan-coverage=0 -S | FileCheck %s --check-prefix=CHECK0
; RUN: opt < %s -asan -asan-module -asan-coverage=1 -S | FileCheck %s --check-prefix=CHECK1
; RUN: opt < %s -asan -asan-module -asan-coverage=2 -S | FileCheck %s --check-prefix=CHECK2
; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2
; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=1 -S | FileCheck %s --check-prefix=CHECK1
+
+; RUN: opt < %s -asan -asan-module -asan-coverage=0 -asan-globals=0 -S | \
+; RUN: FileCheck %s --check-prefix=CHECK0
+; RUN: opt < %s -asan -asan-module -asan-coverage=1 -asan-globals=0 -S | \
+; RUN: FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-globals=0 -S | \
+; RUN: FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=10 \
+; RUN: -asan-globals=0 -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=1 \
+; RUN: -asan-globals=0 -S | FileCheck %s --check-prefix=CHECK1
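+; (-asan-coverage=1 instruments only function entry; -asan-coverage=2 also
+; instruments basic blocks, falling back to level-1 behavior when a function
+; has more blocks than -asan-coverage-block-threshold, as the RUN lines above
+; exercise.)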
+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
define void @foo(i32* %a) sanitize_address {
@@ -17,6 +30,9 @@ entry:
ret void
}
+; CHECK0-NOT: call void @__sanitizer_cov(
+; CHECK0-NOT: call void @__sanitizer_cov_module_init(
+
; CHECK1-LABEL: define void @foo
; CHECK1: %0 = load atomic i8* @__asan_gen_cov_foo monotonic, align 1
; CHECK1: %1 = icmp eq i8 0, %0
diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll
index daf2957..336b98b 100644
--- a/test/Instrumentation/AddressSanitizer/debug_info.ll
+++ b/test/Instrumentation/AddressSanitizer/debug_info.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+; RUN: opt < %s -asan -asan-module -asan-use-after-return=0 -S | FileCheck %s
; Checks that llvm.dbg.declare instructions are updated
; accordingly as we merge allocas.
@@ -47,8 +47,9 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
; Verify that debug descriptors for the argument and local variable will be replaced
; with descriptors that end with OpDeref (encoded as 2).
-; CHECK: ![[ARG_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_arg_variable ] [p] [line 1]
-; CHECK: ![[VAR_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_auto_variable ] [r] [line 2]
+; CHECK: ![[ARG_ID]] = {{.*}}metadata ![[OPDEREF:[0-9]+]]} ; [ DW_TAG_arg_variable ] [p] [line 1]
+; CHECK: ![[OPDEREF]] = metadata !{i64 2}
+; CHECK: ![[VAR_ID]] = {{.*}}metadata ![[OPDEREF]]} ; [ DW_TAG_auto_variable ] [r] [line 2]
; Verify that there are no more variable descriptors.
; CHECK-NOT: DW_TAG_arg_variable
; CHECK-NOT: DW_TAG_auto_variable
diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-llvm-metadata.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-llvm-metadata.ll
index fbfc096..d02f12a 100644
--- a/test/Instrumentation/AddressSanitizer/do-not-instrument-llvm-metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-llvm-metadata.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@.str_noinst = private unnamed_addr constant [4 x i8] c"aaa\00", section "llvm.metadata"
-@.str_inst = private unnamed_addr constant [4 x i8] c"aaa\00",
+@.str_inst = private unnamed_addr constant [4 x i8] c"aaa\00"
; CHECK-NOT: {{asan_gen.*str_noinst}}
; CHECK: {{asan_gen.*str_inst}}
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata.ll b/test/Instrumentation/AddressSanitizer/global_metadata.ll
new file mode 100644
index 0000000..9641c3e
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/global_metadata.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Globals:
+@global = global i32 0, align 4
+@dyn_init_global = global i32 0, align 4
+@blacklisted_global = global i32 0, align 4
+@_ZZ4funcvE10static_var = internal global i32 0, align 4
+@.str = private unnamed_addr constant [14 x i8] c"Hello, world!\00", align 1
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_asan_globals.cpp, i8* null }]
+
+; Sanitizer location descriptors:
+@.str1 = private unnamed_addr constant [22 x i8] c"/tmp/asan-globals.cpp\00", align 1
+@.asan_loc_descr = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 5, i32 5 }
+@.asan_loc_descr1 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 7, i32 5 }
+@.asan_loc_descr2 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 12, i32 14 }
+@.asan_loc_descr4 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 14, i32 25 }
+
+; Check that globals were instrumented, but sanitizer location descriptors weren't:
+; CHECK: @global = global { i32, [60 x i8] } zeroinitializer, align 32
+; CHECK: @.str = internal unnamed_addr constant { [14 x i8], [50 x i8] } { [14 x i8] c"Hello, world!\00", [50 x i8] zeroinitializer }, align 32
+; CHECK: @.asan_loc_descr = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 5, i32 5 }
+
+; Check that location descriptors were passed into __asan_register_globals:
+; CHECK: i64 ptrtoint ({ [22 x i8]*, i32, i32 }* @.asan_loc_descr to i64)
+
+; Function Attrs: nounwind sanitize_address
+define internal void @__cxx_global_var_init() #0 section ".text.startup" {
+entry:
+ %0 = load i32* @global, align 4
+ store i32 %0, i32* @dyn_init_global, align 4
+ ret void
+}
+
+; Function Attrs: nounwind sanitize_address
+define void @_Z4funcv() #1 {
+entry:
+ %literal = alloca i8*, align 8
+ store i8* getelementptr inbounds ([14 x i8]* @.str, i32 0, i32 0), i8** %literal, align 8
+ ret void
+}
+
+; Function Attrs: nounwind sanitize_address
+define internal void @_GLOBAL__sub_I_asan_globals.cpp() #0 section ".text.startup" {
+entry:
+ call void @__cxx_global_var_init()
+ ret void
+}
+
+attributes #0 = { nounwind sanitize_address }
+attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.asan.globals = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
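+; (Each !llvm.asan.globals entry appears to be: the global, its source-location
+; descriptor or null, i1 is-dynamically-initialized, i1 is-blacklisted.)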
+!0 = metadata !{i32* @global, { [22 x i8]*, i32, i32 }* @.asan_loc_descr, i1 false, i1 false}
+!1 = metadata !{i32* @dyn_init_global, { [22 x i8]*, i32, i32 }* @.asan_loc_descr1, i1 true, i1 false}
+!2 = metadata !{i32* @blacklisted_global, null, i1 false, i1 true}
+!3 = metadata !{i32* @_ZZ4funcvE10static_var, { [22 x i8]*, i32, i32 }* @.asan_loc_descr2, i1 false, i1 false}
+!4 = metadata !{[14 x i8]* @.str, { [22 x i8]*, i32, i32 }* @.asan_loc_descr4, i1 false, i1 false}
+!5 = metadata !{metadata !"clang version 3.5.0 (211282)"}
diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index 7945e81..816ab29 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll
@@ -68,8 +68,8 @@ entry:
}
-!llvm.asan.dynamically_initialized_globals = !{!0}
-!0 = metadata !{[10 x i32]* @GlobDy}
+!llvm.asan.globals = !{!0}
+!0 = metadata !{[10 x i32]* @GlobDy, null, i1 true, i1 false}
; CHECK-LABEL: define internal void @asan.module_ctor
; CHECK-NOT: ret
diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
index 1d00cfa..83ff53f 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
@@ -7,9 +7,11 @@ target triple = "x86_64-unknown-linux-gnu"
@YYY = global i32 0, align 4 ; W/o dynamic initializer.
; Clang will emit the following metadata identifying @xxx as dynamically
; initialized.
-!0 = metadata !{i32* @xxx}
-!1 = metadata !{i32* @XXX}
-!llvm.asan.dynamically_initialized_globals = !{!0, !1}
+!0 = metadata !{i32* @xxx, null, i1 true, i1 false}
+!1 = metadata !{i32* @XXX, null, i1 true, i1 false}
+!2 = metadata !{i32* @yyy, null, i1 false, i1 false}
+!3 = metadata !{i32* @YYY, null, i1 false, i1 false}
+!llvm.asan.globals = !{!0, !1, !2, !3}
define i32 @initializer() uwtable {
entry:
@@ -23,6 +25,8 @@ entry:
ret void
}
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+
define internal void @_GLOBAL__I_a() sanitize_address section ".text.startup" {
entry:
call void @__cxx_global_var_init()
diff --git a/test/Instrumentation/AddressSanitizer/lifetime.ll b/test/Instrumentation/AddressSanitizer/lifetime.ll
index 1961997..175a07d 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -1,5 +1,5 @@
; Test handling of llvm.lifetime intrinsics.
-; RUN: opt < %s -asan -asan-module -asan-check-lifetime -S | FileCheck %s
+; RUN: opt < %s -asan -asan-module -asan-check-lifetime -asan-use-after-return=0 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning.ll
index 6919e53..ace12d0 100644
--- a/test/Instrumentation/AddressSanitizer/stack-poisoning.ll
+++ b/test/Instrumentation/AddressSanitizer/stack-poisoning.ll
@@ -1,5 +1,5 @@
; RUN: opt < %s -asan -asan-module -asan-use-after-return -S | FileCheck --check-prefix=CHECK-UAR %s
-; RUN: opt < %s -asan -asan-module -S | FileCheck --check-prefix=CHECK-PLAIN %s
+; RUN: opt < %s -asan -asan-module -asan-use-after-return=0 -S | FileCheck --check-prefix=CHECK-PLAIN %s
target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/BoundsChecking/phi.ll b/test/Instrumentation/BoundsChecking/phi.ll
index 0f9d1b0..25a5ed1 100644
--- a/test/Instrumentation/BoundsChecking/phi.ll
+++ b/test/Instrumentation/BoundsChecking/phi.ll
@@ -52,7 +52,7 @@ fn.exit:
}
-@global_as1 = private addrspace(1) unnamed_addr constant [10 x i8] c"ola\00mundo\00", align 1
+@global_as1 = private unnamed_addr addrspace(1) constant [10 x i8] c"ola\00mundo\00", align 1
define void @f1_as1(i8 addrspace(1)* nocapture %c) {
; CHECK: @f1_as1
diff --git a/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
index 1a56460..f3c36b1 100644
--- a/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
+++ b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
@@ -8,7 +8,15 @@ module asm ".symver f1,f@@version1"
; CHECK: @"dfs$f2" = alias {{.*}} @"dfs$f1"
@f2 = alias void ()* @f1
+; CHECK: @"dfs$g2" = alias {{.*}} @"dfs$g1"
+@g2 = alias bitcast (void (i8*)* @g1 to void (i16*)*)
+
; CHECK: define void @"dfs$f1"
define void @f1() {
ret void
}
+
+; CHECK: define void @"dfs$g1"
+define void @g1(i8*) {
+ ret void
+}
diff --git a/test/Instrumentation/MemorySanitizer/atomics.ll b/test/Instrumentation/MemorySanitizer/atomics.ll
index 98697d7..c8f3b88 100644
--- a/test/Instrumentation/MemorySanitizer/atomics.ll
+++ b/test/Instrumentation/MemorySanitizer/atomics.ll
@@ -37,12 +37,13 @@ entry:
define i32 @Cmpxchg(i32* %p, i32 %a, i32 %b) sanitize_memory {
entry:
- %0 = cmpxchg i32* %p, i32 %a, i32 %b seq_cst seq_cst
+ %pair = cmpxchg i32* %p, i32 %a, i32 %b seq_cst seq_cst
+ %0 = extractvalue { i32, i1 } %pair, 0
ret i32 %0
}
; CHECK: @Cmpxchg
-; CHECK: store i32 0,
+; CHECK: store { i32, i1 } zeroinitializer,
; CHECK: icmp
; CHECK: br
; CHECK: @__msan_warning
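+; (cmpxchg now returns a { value, success } pair, hence the extractvalue above
+; and the aggregate zeroinitializer shadow store expected by the checks.)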
@@ -55,12 +56,13 @@ entry:
define i32 @CmpxchgMonotonic(i32* %p, i32 %a, i32 %b) sanitize_memory {
entry:
- %0 = cmpxchg i32* %p, i32 %a, i32 %b monotonic monotonic
+ %pair = cmpxchg i32* %p, i32 %a, i32 %b monotonic monotonic
+ %0 = extractvalue { i32, i1 } %pair, 0
ret i32 %0
}
; CHECK: @CmpxchgMonotonic
-; CHECK: store i32 0,
+; CHECK: store { i32, i1 } zeroinitializer,
; CHECK: icmp
; CHECK: br
; CHECK: @__msan_warning
diff --git a/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll b/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll
index 34988ef..beb3c5f 100644
--- a/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll
+++ b/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll
@@ -1,7 +1,10 @@
; Test -msan-instrumentation-with-call-threshold
+; Test that in with-calls mode there are no calls to __msan_chain_origin: the
+; chaining is done inside __msan_maybe_store_origin_*.
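+; (Presumably the runtime helper inspects the shadow itself and only chains
+; and stores the origin when the shadow is nonzero, so no inline call to
+; __msan_chain_origin is needed.)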
; RUN: opt < %s -msan -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -S | FileCheck %s
; RUN: opt < %s -msan -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=2 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -41,7 +44,10 @@ entry:
; CHECK: load {{.*}} @__msan_param_tls
; CHECK-ORIGINS: load {{.*}} @__msan_param_origin_tls
; CHECK: store
+; CHECK-ORIGINS-NOT: __msan_chain_origin
; CHECK-ORIGINS: bitcast i64* {{.*}} to i8*
+; CHECK-ORIGINS-NOT: __msan_chain_origin
; CHECK-ORIGINS: call void @__msan_maybe_store_origin_8(
+; CHECK-ORIGINS-NOT: __msan_chain_origin
; CHECK: store i64
; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/missing_origin.ll b/test/Instrumentation/MemorySanitizer/missing_origin.ll
new file mode 100644
index 0000000..673e853
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/missing_origin.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that the result origin is directly propagated from the argument,
+; and is not affected by all the literal undef operands.
+; https://code.google.com/p/memory-sanitizer/issues/detail?id=56
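+; (Undef operands carry a clean shadow and a zero origin, so the only origin
+; that can reach the result is the one loaded for %x.)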
+
+define <4 x i32> @Shuffle(<4 x i32> %x) nounwind uwtable sanitize_memory {
+entry:
+ %y = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ ret <4 x i32> %y
+}
+
+; CHECK-LABEL: @Shuffle(
+; CHECK: [[A:%.*]] = load i32* {{.*}}@__msan_param_origin_tls,
+; CHECK: store i32 [[A]], i32* @__msan_retval_origin_tls
+; CHECK: ret <4 x i32>
diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 6b71310..51693cd 100644
--- a/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -651,7 +651,7 @@ define void @VACopy(i8* %p1, i8* %p2) nounwind uwtable sanitize_memory {
declare void @llvm.va_start(i8*) nounwind
; Function Attrs: nounwind uwtable
-define void @VAStart(i32 %x, ...) {
+define void @VAStart(i32 %x, ...) sanitize_memory {
entry:
%x.addr = alloca i32, align 4
%va = alloca [1 x %struct.__va_list_tag], align 16
@@ -683,7 +683,7 @@ entry:
; CHECK: ret void
-; Test that checks are omitted but shadow propagation is kept if
+; Test that checks are omitted and the returned value is always initialized if
; sanitize_memory attribute is missing.
define i32 @NoSanitizeMemory(i32 %x) uwtable {
@@ -703,9 +703,7 @@ declare void @bar()
; CHECK: @NoSanitizeMemory
; CHECK-NOT: @__msan_warning
-; CHECK: load i32* {{.*}} @__msan_param_tls
-; CHECK-NOT: @__msan_warning
-; CHECK: store {{.*}} @__msan_retval_tls
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
; CHECK-NOT: @__msan_warning
; CHECK: ret i32
@@ -745,6 +743,29 @@ declare i32 @NoSanitizeMemoryUndefHelper(i32 %x)
; CHECK: ret i32
+; Test PHINode instrumentation in blacklisted functions
+
+define i32 @NoSanitizeMemoryPHI(i32 %x) {
+entry:
+ %tobool = icmp ne i32 %x, 0
+ br i1 %tobool, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i32 [ undef, %cond.true ], [ undef, %cond.false ]
+ ret i32 %cond
+}
+
+; CHECK: [[A:%.*]] = phi i32 [ undef, %cond.true ], [ undef, %cond.false ]
+; CHECK: store i32 0, i32* bitcast {{.*}} @__msan_retval_tls
+; CHECK: ret i32 [[A]]
+
+
; Test argument shadow alignment
define <2 x i64> @ArgumentShadowAlignment(i64 %a, <2 x i64> %b) sanitize_memory {
@@ -825,3 +846,17 @@ entry:
; CHECK: store i64 16, i64* @__msan_va_arg_overflow_size_tls
; CHECK: call void (i32, ...)* @VAArgStructFn
; CHECK: ret void
+
+declare i32 @InnerTailCall(i32 %a)
+
+define void @MismatchedReturnTypeTailCall(i32 %a) sanitize_memory {
+ %b = tail call i32 @InnerTailCall(i32 %a)
+ ret void
+}
+
+; We used to strip off the 'tail' modifier, but now that we unpoison the return
+; slot shadow before the call, we don't need to anymore.
+
+; CHECK-LABEL: define void @MismatchedReturnTypeTailCall
+; CHECK: tail call i32 @InnerTailCall
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/mul_by_constant.ll b/test/Instrumentation/MemorySanitizer/mul_by_constant.ll
new file mode 100644
index 0000000..e068f69
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/mul_by_constant.ll
@@ -0,0 +1,94 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check instrumentation of mul when one of the operands is a constant.
+
+define i64 @MulConst(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, 42949672960000
+ ret i64 %y
+}
+
+; 42949672960000 = 2**32 * 10000
+; 36 trailing zero bits
+; 68719476736 = 2**36
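+;
+; (The propagation rule these checks assume: for mul by a constant C, the
+; operand shadow is multiplied by 2**cttz(C), i.e. Shadow' = Shadow << cttz(C),
+; since the low cttz(C) bits of the product are always initialized and a
+; poisoned bit can only move towards higher bits. For odd C the multiplier is
+; 1; for C = 0 the result is fully initialized.)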
+
+; CHECK-LABEL: @MulConst(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], 68719476736
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+
+define i64 @MulZero(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, 0
+ ret i64 %y
+}
+
+; CHECK-LABEL: @MulZero(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], 0{{$}}
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+
+define i64 @MulNeg(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, -16
+ ret i64 %y
+}
+
+; CHECK-LABEL: @MulNeg(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], 16
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+
+define i64 @MulNeg2(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, -48
+ ret i64 %y
+}
+
+; CHECK-LABEL: @MulNeg2(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], 16
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+
+define i64 @MulOdd(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, 12345
+ ret i64 %y
+}
+
+; CHECK-LABEL: @MulOdd(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], 1
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+
+define i64 @MulLarge(i64 %x) sanitize_memory {
+entry:
+ %y = mul i64 %x, -9223372036854775808
+ ret i64 %y
+}
+
+; -9223372036854775808 = 0x8000000000000000
+
+; CHECK-LABEL: @MulLarge(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul i64 [[A]], -9223372036854775808
+; CHECK: store i64 [[B]], i64* {{.*}} @__msan_retval_tls
+
+define <4 x i32> @MulVectorConst(<4 x i32> %x) sanitize_memory {
+entry:
+ %y = mul <4 x i32> %x, <i32 3072, i32 0, i32 -16, i32 -48>
+ ret <4 x i32> %y
+}
+
+; CHECK-LABEL: @MulVectorConst(
+; CHECK: [[A:%.*]] = load {{.*}} @__msan_param_tls
+; CHECK: [[B:%.*]] = mul <4 x i32> [[A]], <i32 1024, i32 0, i32 16, i32 16>
+; CHECK: store <4 x i32> [[B]], <4 x i32>* {{.*}} @__msan_retval_tls
diff --git a/test/Instrumentation/MemorySanitizer/store-origin.ll b/test/Instrumentation/MemorySanitizer/store-origin.ll
index 024a10a..0bd9777 100644
--- a/test/Instrumentation/MemorySanitizer/store-origin.ll
+++ b/test/Instrumentation/MemorySanitizer/store-origin.ll
@@ -20,7 +20,7 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sanitize_memory "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/test/Instrumentation/MemorySanitizer/vector_arith.ll b/test/Instrumentation/MemorySanitizer/vector_arith.ll
new file mode 100644
index 0000000..6541a1c
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory {
+entry:
+ %c = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) nounwind
+ ret <4 x i32> %c
+}
+
+; CHECK-LABEL: @Test_sse2_pmadd_wd(
+; CHECK: or <8 x i16>
+; CHECK: bitcast <8 x i16> {{.*}} to <4 x i32>
+; CHECK: icmp ne <4 x i32> {{.*}}, zeroinitializer
+; CHECK: sext <4 x i1> {{.*}} to <4 x i32>
+; CHECK: ret <4 x i32>
+
+
+define x86_mmx @Test_ssse3_pmadd_ub_sw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+entry:
+ %c = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind
+ ret x86_mmx %c
+}
+
+; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
+; CHECK: or i64
+; CHECK: bitcast i64 {{.*}} to <4 x i16>
+; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer
+; CHECK: sext <4 x i1> {{.*}} to <4 x i16>
+; CHECK: bitcast <4 x i16> {{.*}} to i64
+; CHECK: ret x86_mmx
+
+
+define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
+ %c = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b)
+ ret <2 x i64> %c
+}
+
+; CHECK-LABEL: @Test_x86_sse2_psad_bw(
+; CHECK: or <16 x i8> {{.*}}, {{.*}}
+; CHECK: bitcast <16 x i8> {{.*}} to <2 x i64>
+; CHECK: icmp ne <2 x i64> {{.*}}, zeroinitializer
+; CHECK: sext <2 x i1> {{.*}} to <2 x i64>
+; CHECK: lshr <2 x i64> {{.*}}, <i64 48, i64 48>
+; CHECK: ret <2 x i64>
+
+
+define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+entry:
+ %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind
+ ret x86_mmx %c
+}
+
+; CHECK-LABEL: @Test_x86_mmx_psad_bw(
+; CHECK: or i64
+; CHECK: icmp ne i64
+; CHECK: sext i1 {{.*}} to i64
+; CHECK: lshr i64 {{.*}}, 48
+; CHECK: ret x86_mmx
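
vector_arith.ll encodes a shared shadow pattern for these multiply-add and sum-of-absolute-differences intrinsics: OR the two operand shadows, reinterpret the result in the output element type, and widen each nonzero element to all-ones, so a poisoned input lane poisons every output lane it feeds. For psad.bw there is one extra step: each 64-bit result lane holds only a 16-bit sum, so the shadow is shifted right by 48 to mark the top bits as defined. A hand-written sketch of that sequence for the SSE2 case, with illustrative %shadow_* names rather than real pass output:

  define <2 x i64> @sketch_psadbw_shadow(<16 x i8> %shadow_a, <16 x i8> %shadow_b) {
  entry:
    %s = or <16 x i8> %shadow_a, %shadow_b
    %wide = bitcast <16 x i8> %s to <2 x i64>
    %poisoned = icmp ne <2 x i64> %wide, zeroinitializer
    %lanes = sext <2 x i1> %poisoned to <2 x i64>
    ; the upper 48 bits of each psadbw lane are always zero, hence defined
    %res = lshr <2 x i64> %lanes, <i64 48, i64 48>
    ret <2 x i64> %res
  }
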
diff --git a/test/Instrumentation/MemorySanitizer/vector_pack.ll b/test/Instrumentation/MemorySanitizer/vector_pack.ll
new file mode 100644
index 0000000..31c0c62
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
+entry:
+ %c = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b) nounwind
+ ret <8 x i16> %c
+}
+
+; CHECK-LABEL: @Test_packssdw_128(
+; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
+; CHECK-DAG: icmp ne <4 x i32> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i32>
+; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
+; CHECK-DAG: call <8 x i16> @llvm.x86.sse2.packssdw.128(
+; CHECK: ret <8 x i16>
+
+
+define <32 x i8> @Test_avx_packuswb(<16 x i16> %a, <16 x i16> %b) sanitize_memory {
+entry:
+ %c = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind
+ ret <32 x i8> %c
+}
+
+; CHECK-LABEL: @Test_avx_packuswb(
+; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
+; CHECK-DAG: icmp ne <16 x i16> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <16 x i1> {{.*}} to <16 x i16>
+; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packsswb(
+; CHECK-DAG: call <32 x i8> @llvm.x86.avx2.packuswb(
+; CHECK: ret <32 x i8>
+
+
+define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
+entry:
+ %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
+ ret x86_mmx %c
+}
+
+; CHECK-LABEL: @Test_mmx_packuswb(
+; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
+; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
+; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
+; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
+; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret x86_mmx
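
For the pack intrinsics, OR-and-bitcast is not enough because packing saturates wide elements into narrow ones. The CHECK-DAG lines expect each operand shadow to be collapsed to 0 or -1 per element and then run through the signed saturating pack (packss*), which maps 0 to 0 and -1 to -1 in the narrow lanes and therefore works for the shadow of both the signed and the unsigned variants. A sketch for the AVX2 case, assuming the usual intrinsic declaration and hypothetical %shadow_* names:

  declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)

  define <32 x i8> @sketch_packuswb_shadow(<16 x i16> %shadow_a, <16 x i16> %shadow_b) {
  entry:
    %pa = icmp ne <16 x i16> %shadow_a, zeroinitializer
    %sa = sext <16 x i1> %pa to <16 x i16>
    %pb = icmp ne <16 x i16> %shadow_b, zeroinitializer
    %sb = sext <16 x i1> %pb to <16 x i16>
    ; packss preserves the 0 / -1 encoding through the width change
    %sc = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %sa, <16 x i16> %sb)
    ret <32 x i8> %sc
  }
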
diff --git a/test/Instrumentation/MemorySanitizer/vector_shift.ll b/test/Instrumentation/MemorySanitizer/vector_shift.ll
index d32f51b..91e4bd5 100644
--- a/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -13,7 +13,7 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32)
-define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) {
+define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) sanitize_memory {
entry:
%0 = bitcast i64 %x.coerce to <2 x i32>
%1 = bitcast <2 x i32> %0 to x86_mmx
@@ -35,7 +35,7 @@ entry:
; CHECK: ret i64
-define <8 x i16> @test_sse2_scalar(<8 x i16> %x, i32 %y) {
+define <8 x i16> @test_sse2_scalar(<8 x i16> %x, i32 %y) sanitize_memory {
entry:
%0 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 %y)
ret <8 x i16> %0
@@ -51,7 +51,7 @@ entry:
; CHECK: ret <8 x i16>
-define <8 x i16> @test_sse2(<8 x i16> %x, <8 x i16> %y) {
+define <8 x i16> @test_sse2(<8 x i16> %x, <8 x i16> %y) sanitize_memory {
entry:
%0 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %x, <8 x i16> %y)
ret <8 x i16> %0
@@ -71,7 +71,7 @@ entry:
; Test variable shift (i.e. vector by vector).
-define <4 x i32> @test_avx2(<4 x i32> %x, <4 x i32> %y) {
+define <4 x i32> @test_avx2(<4 x i32> %x, <4 x i32> %y) sanitize_memory {
entry:
%0 = tail call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %0
@@ -85,7 +85,7 @@ entry:
; CHECK: = tail call <4 x i32> @llvm.x86.avx2.psllv.d(
; CHECK: ret <4 x i32>
-define <8 x i32> @test_avx2_256(<8 x i32> %x, <8 x i32> %y) {
+define <8 x i32> @test_avx2_256(<8 x i32> %x, <8 x i32> %y) sanitize_memory {
entry:
%0 = tail call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %0
diff --git a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
index d449a97..dc6e43e 100644
--- a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
+++ b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
@@ -27,7 +27,7 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
; Check that tsan converts mem intrinsics back to function calls.
-define void @MemCpyTest(i8* nocapture %x, i8* nocapture %y) {
+define void @MemCpyTest(i8* nocapture %x, i8* nocapture %y) sanitize_thread {
entry:
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %x, i8* %y, i64 16, i32 4, i1 false)
ret void
@@ -36,7 +36,7 @@ entry:
; CHECK: ret void
}
-define void @MemMoveTest(i8* nocapture %x, i8* nocapture %y) {
+define void @MemMoveTest(i8* nocapture %x, i8* nocapture %y) sanitize_thread {
entry:
tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %x, i8* %y, i64 16, i32 4, i1 false)
ret void
@@ -45,7 +45,7 @@ entry:
; CHECK: ret void
}
-define void @MemSetTest(i8* nocapture %x) {
+define void @MemSetTest(i8* nocapture %x) sanitize_thread {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %x, i8 77, i64 16, i32 4, i1 false)
ret void
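
These functions are only instrumented once they carry sanitize_thread, which is what the attribute additions restore; the behavior under test is that tsan lowers the memory intrinsics back to plain libc calls so its runtime interceptors can see them. A rough sketch of the expected rewrite, with a hypothetical function name:

  declare i8* @memcpy(i8*, i8*, i64)

  define void @sketch_memcpy_lowering(i8* %x, i8* %y) sanitize_thread {
  entry:
    ; the llvm.memcpy intrinsic call becomes a call to the libc symbol,
    ; which the tsan runtime intercepts
    %call = call i8* @memcpy(i8* %x, i8* %y, i64 16)
    ret void
  }
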
diff --git a/test/LTO/jump-table-type.ll b/test/LTO/jump-table-type.ll
new file mode 100644
index 0000000..a39d3e9
--- /dev/null
+++ b/test/LTO/jump-table-type.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as <%s >%t1
+; RUN: llvm-lto -o %t2 %t1 -jump-table-type=arity
+; RUN: llvm-nm %t2 | FileCheck %s
+
+; CHECK: T __llvm_jump_instr_table_0_1
+; CHECK: T __llvm_jump_instr_table_1_1
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @g(i32 %a) unnamed_addr jumptable {
+ ret i32 %a
+}
+
+define i32 @f() unnamed_addr jumptable {
+ ret i32 0
+}
+
+define i32 @main() {
+ ret i32 0
+}
+
+@llvm.used = appending global [2 x i8*] [i8* bitcast (i32(i32)* @g to i8*),
+ i8* bitcast (i32()* @f to i8*)]
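
With -jump-table-type=arity, jumptable functions are bucketed by arity, so @g (one i32 argument) and @f (no arguments) each end up alone in their own table; the exported stubs are named __llvm_jump_instr_table_<table>_<entry>, which is exactly the pair of symbols the CHECK lines look for. Conceptually each entry just transfers control to its target; a hedged sketch of one entry follows, as an illustration rather than the verbatim pass output:

  declare i32 @g(i32)

  define i32 @__llvm_jump_instr_table_0_1(i32 %a) unnamed_addr {
  entry:
    %r = tail call i32 @g(i32 %a)
    ret i32 %r
  }
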
diff --git a/test/LTO/lit.local.cfg b/test/LTO/lit.local.cfg
index 6df0e03..afde89b 100644
--- a/test/LTO/lit.local.cfg
+++ b/test/LTO/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/LTO/no-undefined-puts-when-implemented.ll b/test/LTO/no-undefined-puts-when-implemented.ll
index 18f5d21..29db8a6 100644
--- a/test/LTO/no-undefined-puts-when-implemented.ll
+++ b/test/LTO/no-undefined-puts-when-implemented.ll
@@ -1,6 +1,6 @@
; RUN: llvm-as <%s >%t1
; RUN: llvm-lto -exported-symbol=_uses_puts -exported-symbol=_uses_printf -o - %t1 | \
-; RUN: llvm-nm | \
+; RUN: llvm-nm - | \
; RUN: FileCheck %s
; rdar://problem/16165191
; runtime library implementations should not be renamed
diff --git a/test/Linker/Inputs/PR8300.b.ll b/test/Linker/Inputs/PR8300.b.ll
index 362d309..9e538f5 100644
--- a/test/Linker/Inputs/PR8300.b.ll
+++ b/test/Linker/Inputs/PR8300.b.ll
@@ -1,7 +1,7 @@
%foo = type { [8 x i8] }
%bar = type { [9 x i8] }
-@zed = alias void (%foo*), void (%bar*)* @xyz
+@zed = alias bitcast (void (%bar*)* @xyz to void (%foo*)*)
define void @xyz(%bar* %this) {
entry:
diff --git a/test/Linker/Inputs/alias.ll b/test/Linker/Inputs/alias.ll
index b869cae..f379476 100644
--- a/test/Linker/Inputs/alias.ll
+++ b/test/Linker/Inputs/alias.ll
@@ -1,3 +1,3 @@
@zed = global i32 42
@foo = alias i32* @zed
-@foo2 = alias i16, i32* @zed
+@foo2 = alias bitcast (i32* @zed to i16*)
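
Both alias updates in these inputs track the same IR syntax change: an alias whose type differs from its aliasee is no longer written with a separate type operand but as an explicit constant-expression cast of the aliasee. A minimal sketch with a hypothetical global:

  @base = global i32 42
  ; old form:  @short_view = alias i16, i32* @base
  @short_view = alias bitcast (i32* @base to i16*)
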
diff --git a/test/Linker/Inputs/comdat.ll b/test/Linker/Inputs/comdat.ll
new file mode 100644
index 0000000..fdcca49
--- /dev/null
+++ b/test/Linker/Inputs/comdat.ll
@@ -0,0 +1,20 @@
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat largest
+@foo = global i64 43, comdat $foo
+
+define i32 @bar() comdat $foo {
+ ret i32 43
+}
+
+$qux = comdat largest
+@qux = global i32 13, comdat $qux
+@in_unselected_group = global i32 13, comdat $qux
+
+define i32 @baz() comdat $qux {
+ ret i32 13
+}
+
+$any = comdat any
+@any = global i64 7, comdat $any
diff --git a/test/Linker/Inputs/comdat2.ll b/test/Linker/Inputs/comdat2.ll
new file mode 100644
index 0000000..9e18304
--- /dev/null
+++ b/test/Linker/Inputs/comdat2.ll
@@ -0,0 +1,2 @@
+$foo = comdat largest
+@foo = global i64 43, comdat $foo
diff --git a/test/Linker/Inputs/comdat3.ll b/test/Linker/Inputs/comdat3.ll
new file mode 100644
index 0000000..06f08b9
--- /dev/null
+++ b/test/Linker/Inputs/comdat3.ll
@@ -0,0 +1,2 @@
+$foo = comdat noduplicates
+@foo = global i64 43, comdat $foo
diff --git a/test/Linker/Inputs/comdat4.ll b/test/Linker/Inputs/comdat4.ll
new file mode 100644
index 0000000..bbfe3f7
--- /dev/null
+++ b/test/Linker/Inputs/comdat4.ll
@@ -0,0 +1,5 @@
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat samesize
+@foo = global i64 42, comdat $foo
diff --git a/test/Linker/Inputs/comdat5.ll b/test/Linker/Inputs/comdat5.ll
new file mode 100644
index 0000000..800af18
--- /dev/null
+++ b/test/Linker/Inputs/comdat5.ll
@@ -0,0 +1,15 @@
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+%MSRTTICompleteObjectLocator = type { i32, i32, i32, i8*, %MSRTTIClassHierarchyDescriptor* }
+%MSRTTIClassHierarchyDescriptor = type { i32, i32, i32, %MSRTTIBaseClassDescriptor** }
+%MSRTTIBaseClassDescriptor = type { i8*, i32, i32, i32, i32, i32, %MSRTTIClassHierarchyDescriptor* }
+%struct.S = type { i32 (...)** }
+
+$"\01??_7S@@6B@" = comdat largest
+
+@"\01??_R4S@@6B@" = external constant %MSRTTICompleteObjectLocator
+@some_name = private unnamed_addr constant [2 x i8*] [i8* bitcast (%MSRTTICompleteObjectLocator* @"\01??_R4S@@6B@" to i8*), i8* bitcast (void (%struct.S*, i32)* @"\01??_GS@@UAEPAXI@Z" to i8*)], comdat $"\01??_7S@@6B@"
+@"\01??_7S@@6B@" = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1)
+
+declare x86_thiscallcc void @"\01??_GS@@UAEPAXI@Z"(%struct.S*, i32) unnamed_addr
diff --git a/test/Linker/Inputs/cycle.ll b/test/Linker/Inputs/cycle.ll
deleted file mode 100644
index d0eddb6..0000000
--- a/test/Linker/Inputs/cycle.ll
+++ /dev/null
@@ -1,2 +0,0 @@
-@foo = alias i32* @bar
-@bar = weak global i32 0
diff --git a/test/Linker/alias.ll b/test/Linker/alias.ll
index 5809a15..bce51ad 100644
--- a/test/Linker/alias.ll
+++ b/test/Linker/alias.ll
@@ -5,12 +5,12 @@
; CHECK-DAG: @foo = alias i32* @zed
@bar = alias i32* @foo
-; CHECK-DAG: @bar = alias i32* @zed
+; CHECK-DAG: @bar = alias i32* @foo
@foo2 = weak global i32 0
-; CHECK-DAG: @foo2 = alias i16, i32* @zed
+; CHECK-DAG: @foo2 = alias bitcast (i32* @zed to i16*)
@bar2 = alias i32* @foo2
-; CHECK-DAG: @bar2 = alias i32* @zed
+; CHECK-DAG: @bar2 = alias bitcast (i16* @foo2 to i32*)
; CHECK-DAG: @zed = global i32 42
diff --git a/test/Linker/comdat.ll b/test/Linker/comdat.ll
new file mode 100644
index 0000000..4d2aef7
--- /dev/null
+++ b/test/Linker/comdat.ll
@@ -0,0 +1,32 @@
+; RUN: llvm-link %s %p/Inputs/comdat.ll -S -o - | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat largest
+@foo = global i32 42, comdat $foo
+
+define i32 @bar() comdat $foo {
+ ret i32 42
+}
+
+$qux = comdat largest
+@qux = global i64 12, comdat $qux
+
+define i32 @baz() comdat $qux {
+ ret i32 12
+}
+
+$any = comdat any
+@any = global i64 6, comdat $any
+
+; CHECK: $qux = comdat largest
+; CHECK: $foo = comdat largest
+; CHECK: $any = comdat any
+
+; CHECK: @qux = global i64 12, comdat $qux
+; CHECK: @any = global i64 6, comdat $any
+; CHECK: @foo = global i64 43, comdat $foo
+; CHECK-NOT: @in_unselected_group = global i32 13, comdat $qux
+
+; CHECK: define i32 @baz() comdat $qux
+; CHECK: define i32 @bar() comdat $foo
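
The point of this test is how 'largest' selection resolves at IR link time: for each COMDAT the linker keeps the module whose key object is biggest (@foo becomes the other module's i64 43, @qux stays this module's i64 12), and every member of the losing group is dropped, which is why @in_unselected_group must not survive. A minimal sketch of the rule, assuming two hypothetical modules that both define @k in a 'largest' group, one as i32 and one as i64; only the wider definition remains after linking:

  $k = comdat largest
  @k = global i64 1, comdat $k
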
diff --git a/test/Linker/comdat2.ll b/test/Linker/comdat2.ll
new file mode 100644
index 0000000..60c3d7c
--- /dev/null
+++ b/test/Linker/comdat2.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-link %s %p/Inputs/comdat.ll -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat samesize
+@foo = global i32 42, comdat $foo
+; CHECK: Linking COMDATs named 'foo': invalid selection kinds!
diff --git a/test/Linker/comdat3.ll b/test/Linker/comdat3.ll
new file mode 100644
index 0000000..f0d9a48
--- /dev/null
+++ b/test/Linker/comdat3.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-link %s %p/Inputs/comdat2.ll -S -o - 2>&1 | FileCheck %s
+
+$foo = comdat largest
+@foo = global i32 43, comdat $foo
+; CHECK: Linking COMDATs named 'foo': can't do size dependent selection without DataLayout!
diff --git a/test/Linker/comdat4.ll b/test/Linker/comdat4.ll
new file mode 100644
index 0000000..50c1778
--- /dev/null
+++ b/test/Linker/comdat4.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-link %s %p/Inputs/comdat3.ll -S -o - 2>&1 | FileCheck %s
+
+$foo = comdat noduplicates
+@foo = global i64 43, comdat $foo
+; CHECK: Linking COMDATs named 'foo': noduplicates has been violated!
diff --git a/test/Linker/comdat5.ll b/test/Linker/comdat5.ll
new file mode 100644
index 0000000..011fb8c
--- /dev/null
+++ b/test/Linker/comdat5.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-link %s %p/Inputs/comdat4.ll -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$foo = comdat samesize
+@foo = global i32 42, comdat $foo
+; CHECK: Linking COMDATs named 'foo': SameSize violated!
diff --git a/test/Linker/comdat6.ll b/test/Linker/comdat6.ll
new file mode 100644
index 0000000..efa5dfb
--- /dev/null
+++ b/test/Linker/comdat6.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
+; RUN: llvm-link %p/Inputs/comdat5.ll %s -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+%struct.S = type { i32 (...)** }
+
+$"\01??_7S@@6B@" = comdat largest
+@"\01??_7S@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.S*, i32)* @"\01??_GS@@UAEPAXI@Z" to i8*)], comdat $"\01??_7S@@6B@"
+
+; CHECK: @"\01??_7S@@6B@" = alias getelementptr inbounds ([2 x i8*]* @some_name, i32 0, i32 1)
+
+declare x86_thiscallcc void @"\01??_GS@@UAEPAXI@Z"(%struct.S*, i32) unnamed_addr
diff --git a/test/Linker/comdat7.ll b/test/Linker/comdat7.ll
new file mode 100644
index 0000000..c3ff3f6
--- /dev/null
+++ b/test/Linker/comdat7.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$"\01??_7S@@6B@" = comdat largest
+define void @"\01??_7S@@6B@"() {
+ ret void
+}
+; CHECK: GlobalVariable required for data dependent selection!
diff --git a/test/Linker/comdat8.ll b/test/Linker/comdat8.ll
new file mode 100644
index 0000000..21669f6
--- /dev/null
+++ b/test/Linker/comdat8.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+$"\01??_7S@@6B@" = comdat largest
+define void @some_name() {
+ ret void
+}
+@"\01??_7S@@6B@" = alias i8* inttoptr (i32 ptrtoint (void ()* @some_name to i32) to i8*)
+; CHECK: COMDAT key involves incomputable alias size.
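
comdat7 and comdat8 cover the failure modes of 'largest': the linker has to compute the size of the COMDAT key, so the key must resolve to a global variable, either directly or through an alias whose aliasee is visible under simple casts. A function (comdat7) or an alias buried under ptrtoint/inttoptr (comdat8) defeats that. A short sketch of the distinction, with hypothetical names:

  @v = global i64 0
  ; sized via @v, so usable as a 'largest' key
  @ok = alias bitcast (i64* @v to i8*)
  ; an aliasee hidden behind ptrtoint/inttoptr, as in comdat8 above,
  ; leaves the alias size incomputable and linking fails
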
diff --git a/test/Linker/cycle.ll b/test/Linker/cycle.ll
deleted file mode 100644
index 7d9ad2d..0000000
--- a/test/Linker/cycle.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: not llvm-link %s %S/Inputs/cycle.ll 2>&1 | FileCheck %s
-; RUN: not llvm-link %S/Inputs/cycle.ll %s 2>&1 | FileCheck %s
-
-; CHECK: Linking these modules creates an alias cycle
-
-@foo = weak global i32 0
-@bar = alias i32* @foo
diff --git a/test/Linker/unnamed-addr1-a.ll b/test/Linker/unnamed-addr1-a.ll
index adaa400..794ae98 100644
--- a/test/Linker/unnamed-addr1-a.ll
+++ b/test/Linker/unnamed-addr1-a.ll
@@ -21,6 +21,11 @@ define weak void @func-b() unnamed_addr { ret void }
@global-f = weak global i32 42
; CHECK-DAG: @global-f = global i32 42
+@alias-a = weak global i32 42
+; CHECK-DAG: @alias-a = alias i32* @global-f
+@alias-b = weak unnamed_addr global i32 42
+; CHECK-DAG: @alias-b = unnamed_addr alias i32* @global-f
+
declare void @func-c()
; CHECK-DAG: define weak void @func-c() {
define weak void @func-d() { ret void }
@@ -38,6 +43,12 @@ define weak void @func-e() unnamed_addr { ret void }
@global-j = weak global i32 42
; CHECK-DAG: @global-j = global i32 42
+@alias-c = weak global i32 42
+; CHECK-DAG: @alias-c = alias i32* @global-f
+@alias-d = weak unnamed_addr global i32 42
+; CHECK-DAG: @alias-d = alias i32* @global-f
+
+
declare void @func-g()
; CHECK-DAG: define weak void @func-g() {
define weak void @func-h() { ret void }
diff --git a/test/Linker/unnamed-addr1-b.ll b/test/Linker/unnamed-addr1-b.ll
index aa1507b..39a0c8b 100644
--- a/test/Linker/unnamed-addr1-b.ll
+++ b/test/Linker/unnamed-addr1-b.ll
@@ -6,6 +6,9 @@
@global-e = unnamed_addr global i32 42
@global-f = unnamed_addr global i32 42
+@alias-a = unnamed_addr alias i32* @global-f
+@alias-b = unnamed_addr alias i32* @global-f
+
define weak void @func-c() unnamed_addr { ret void }
define weak void @func-d() unnamed_addr { ret void }
define weak void @func-e() unnamed_addr { ret void }
@@ -15,6 +18,9 @@ define weak void @func-e() unnamed_addr { ret void }
@global-i = global i32 42
@global-j = global i32 42
+@alias-c = alias i32* @global-f
+@alias-d = alias i32* @global-f
+
define weak void @func-g() { ret void }
define weak void @func-h() { ret void }
define weak void @func-i() { ret void }
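
Taken together, the two unnamed-addr1 files check the merging rule for unnamed_addr when a weak global is resolved to an alias: the linked definition keeps unnamed_addr only when every copy had it (@alias-b), and loses it when any side lacked it (@alias-a, @alias-c, @alias-d). A small sketch of the two resulting shapes, with hypothetical names:

  @g = global i32 42
  @kept = unnamed_addr alias i32* @g    ; unnamed_addr on every linked copy
  @dropped = alias i32* @g              ; unnamed_addr missing on some copy
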
diff --git a/test/MC/AArch64/alias-logicalimm.s b/test/MC/AArch64/alias-logicalimm.s
new file mode 100644
index 0000000..28ec40b
--- /dev/null
+++ b/test/MC/AArch64/alias-logicalimm.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+// CHECK: and x0, x1, #0xfffffffffffffffd
+// CHECK: and x0, x1, #0xfffffffffffffffd
+ and x0, x1, #~2
+ bic x0, x1, #2
+
+// CHECK: and w0, w1, #0xfffffffd
+// CHECK: and w0, w1, #0xfffffffd
+ and w0, w1, #~2
+ bic w0, w1, #2
+
+// CHECK: ands x0, x1, #0xfffffffffffffffd
+// CHECK: ands x0, x1, #0xfffffffffffffffd
+ ands x0, x1, #~2
+ bics x0, x1, #2
+
+// CHECK: ands w0, w1, #0xfffffffd
+// CHECK: ands w0, w1, #0xfffffffd
+ ands w0, w1, #~2
+ bics w0, w1, #2
+
+// CHECK: orr x0, x1, #0xfffffffffffffffd
+// CHECK: orr x0, x1, #0xfffffffffffffffd
+ orr x0, x1, #~2
+ orn x0, x1, #2
+
+// CHECK: orr w2, w1, #0xfffffffc
+// CHECK: orr w2, w1, #0xfffffffc
+ orr w2, w1, #~3
+ orn w2, w1, #3
+
+// CHECK: eor x0, x1, #0xfffffffffffffffd
+// CHECK: eor x0, x1, #0xfffffffffffffffd
+ eor x0, x1, #~2
+ eon x0, x1, #2
+
+// CHECK: eor w2, w1, #0xfffffffc
+// CHECK: eor w2, w1, #0xfffffffc
+ eor w2, w1, #~3
+ eon w2, w1, #3
diff --git a/test/MC/AArch64/arm64-leaf-compact-unwind.s b/test/MC/AArch64/arm64-leaf-compact-unwind.s
index d699813..27d3d51 100644
--- a/test/MC/AArch64/arm64-leaf-compact-unwind.s
+++ b/test/MC/AArch64/arm64-leaf-compact-unwind.s
@@ -23,10 +23,10 @@
// CHECK-NEXT: Reserved1:
// CHECK-NEXT: Reserved2:
// CHECK-NEXT: Relocations [
-// CHECK-NEXT: 0x60 0 3 0 ARM64_RELOC_UNSIGNED 0 -
-// CHECK-NEXT: 0x40 0 3 0 ARM64_RELOC_UNSIGNED 0 -
-// CHECK-NEXT: 0x20 0 3 0 ARM64_RELOC_UNSIGNED 0 -
-// CHECK-NEXT: 0x0 0 3 0 ARM64_RELOC_UNSIGNED 0 -
+// CHECK-NEXT: 0x60 0 3 0 ARM64_RELOC_UNSIGNED 0 0x1
+// CHECK-NEXT: 0x40 0 3 0 ARM64_RELOC_UNSIGNED 0 0x1
+// CHECK-NEXT: 0x20 0 3 0 ARM64_RELOC_UNSIGNED 0 0x1
+// CHECK-NEXT: 0x0 0 3 0 ARM64_RELOC_UNSIGNED 0 0x1
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
// CHECK-NEXT: 0000: 00000000 00000000 08000000 00000002
diff --git a/test/MC/AArch64/arm64-system-encoding.s b/test/MC/AArch64/arm64-system-encoding.s
index 9246608..87f8f8a 100644
--- a/test/MC/AArch64/arm64-system-encoding.s
+++ b/test/MC/AArch64/arm64-system-encoding.s
@@ -4,7 +4,7 @@
foo:
;-----------------------------------------------------------------------------
-; Simple encodings (instuctions w/ no operands)
+; Simple encodings (instructions w/ no operands)
;-----------------------------------------------------------------------------
nop
diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s
index a4a3b13..5293131 100644
--- a/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/test/MC/AArch64/basic-a64-diagnostics.s
@@ -729,6 +729,27 @@
// CHECK-ERROR-NEXT: ^
//------------------------------------------------------------------------------
+// Logical (immediates)
+//------------------------------------------------------------------------------
+
+ and w2, w3, #4294967296
+ eor w2, w3, #4294967296
+ orr w2, w3, #4294967296
+ ands w2, w3, #4294967296
+// CHECK-ERROR: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: and w2, w3, #4294967296
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: eor w2, w3, #4294967296
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: orr w2, w3, #4294967296
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: ands w2, w3, #4294967296
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
// Bitfield
//------------------------------------------------------------------------------
@@ -1345,39 +1366,59 @@
cset wsp, lt
csetm sp, ge
+ cset w1, al
+ csetm x6, nv
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cset wsp, lt
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: invalid operand for instruction
// CHECK-ERROR-NEXT: csetm sp, ge
// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: condition codes AL and NV are invalid for this instruction
+// CHECK-ERROR-NEXT: cset w1, al
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: condition codes AL and NV are invalid for this instruction
+// CHECK-ERROR-NEXT: csetm x6, nv
+// CHECK-ERROR-NEXT: ^
cinc w3, wsp, ne
cinc sp, x9, eq
+ cinc x2, x0, nv
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cinc w3, wsp, ne
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cinc sp, x9, eq
// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: condition codes AL and NV are invalid for this instruction
+// CHECK-ERROR-NEXT: cinc x2, x0, nv
+// CHECK-ERROR-NEXT: ^
cinv w3, wsp, ne
cinv sp, x9, eq
+ cinv w8, x7, nv
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cinv w3, wsp, ne
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cinv sp, x9, eq
// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: condition codes AL and NV are invalid for this instruction
+// CHECK-ERROR-NEXT: cinv w8, x7, nv
+// CHECK-ERROR-NEXT: ^
cneg w3, wsp, ne
cneg sp, x9, eq
+ cneg x4, x5, al
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cneg w3, wsp, ne
// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: invalid operand for instruction
// CHECK-ERROR-NEXT: cneg sp, x9, eq
// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: condition codes AL and NV are invalid for this instruction
+// CHECK-ERROR-NEXT: cneg x4, x5, al
+// CHECK-ERROR-NEXT: ^
//------------------------------------------------------------------------------
// Data Processing (1 source)
@@ -2944,13 +2985,17 @@
orn wsp, w3, w5
bics x20, sp, x9, lsr #0
orn x2, x6, sp, lsl #3
-// CHECK-ERROR: error: invalid operand for instruction
+// FIXME: the diagnostic we get for 'orn wsp, w3, w5' is from the orn alias,
+// which is a better match than the genuine ORNWri, whereas it would be better
+// to get the ORNWri diagnostic when the alias did not match, i.e. the
+// alias' diagnostics should have a lower priority.
+// CHECK-ERROR: error: expected compatible register or logical immediate
// CHECK-ERROR-NEXT: orn wsp, w3, w5
-// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: ^
// CHECK-ERROR-NEXT: error: invalid operand for instruction
// CHECK-ERROR-NEXT: bics x20, sp, x9, lsr #0
// CHECK-ERROR-NEXT: ^
-// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
// CHECK-ERROR-NEXT: orn x2, x6, sp, lsl #3
// CHECK-ERROR-NEXT: ^
diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
index a12968b..140ea33 100644
--- a/test/MC/AArch64/basic-a64-instructions.s
+++ b/test/MC/AArch64/basic-a64-instructions.s
@@ -601,9 +601,11 @@ _func:
cmn w0, w3
cmn wzr, w4
cmn w5, wzr
+ cmn wsp, w6
// CHECK: cmn w0, w3 // encoding: [0x1f,0x00,0x03,0x2b]
// CHECK: cmn wzr, w4 // encoding: [0xff,0x03,0x04,0x2b]
// CHECK: cmn w5, wzr // encoding: [0xbf,0x00,0x1f,0x2b]
+// CHECK: cmn wsp, w6 // encoding: [0xff,0x43,0x26,0x2b]
cmn w6, w7, lsl #0
cmn w8, w9, lsl #15
@@ -629,9 +631,11 @@ _func:
cmn x0, x3
cmn xzr, x4
cmn x5, xzr
+ cmn sp, x6
// CHECK: cmn x0, x3 // encoding: [0x1f,0x00,0x03,0xab]
// CHECK: cmn xzr, x4 // encoding: [0xff,0x03,0x04,0xab]
// CHECK: cmn x5, xzr // encoding: [0xbf,0x00,0x1f,0xab]
+// CHECK: cmn sp, x6 // encoding: [0xff,0x63,0x26,0xab]
cmn x6, x7, lsl #0
cmn x8, x9, lsl #15
@@ -657,9 +661,11 @@ _func:
cmp w0, w3
cmp wzr, w4
cmp w5, wzr
+ cmp wsp, w6
// CHECK: cmp w0, w3 // encoding: [0x1f,0x00,0x03,0x6b]
// CHECK: cmp wzr, w4 // encoding: [0xff,0x03,0x04,0x6b]
// CHECK: cmp w5, wzr // encoding: [0xbf,0x00,0x1f,0x6b]
+// CHECK: cmp wsp, w6 // encoding: [0xff,0x43,0x26,0x6b]
cmp w6, w7, lsl #0
cmp w8, w9, lsl #15
@@ -685,9 +691,11 @@ _func:
cmp x0, x3
cmp xzr, x4
cmp x5, xzr
+ cmp sp, x6
// CHECK: cmp x0, x3 // encoding: [0x1f,0x00,0x03,0xeb]
// CHECK: cmp xzr, x4 // encoding: [0xff,0x03,0x04,0xeb]
// CHECK: cmp x5, xzr // encoding: [0xbf,0x00,0x1f,0xeb]
+// CHECK: cmp sp, x6 // encoding: [0xff,0x63,0x26,0xeb]
cmp x6, x7, lsl #0
cmp x8, x9, lsl #15
@@ -3237,6 +3245,17 @@ _func:
// CHECK: orr w3, wzr, #0xf000f // encoding: [0xe3,0x8f,0x00,0x32]
// CHECK: orr x10, xzr, #0xaaaaaaaaaaaaaaaa // encoding: [0xea,0xf3,0x01,0xb2]
+ // The Imm field of logicalImm operations has to be truncated to the
+ // register width, i.e. 32 bits
+ and w2, w3, #-3
+ orr w0, w1, #~2
+ eor w16, w17, #-7
+ ands w19, w20, #~15
+// CHECK: and w2, w3, #0xfffffffd // encoding: [0x62,0x78,0x1e,0x12]
+// CHECK: orr w0, w1, #0xfffffffd // encoding: [0x20,0x78,0x1e,0x32]
+// CHECK: eor w16, w17, #0xfffffff9 // encoding: [0x30,0x76,0x1d,0x52]
+// CHECK: ands w19, w20, #0xfffffff0 // encoding: [0x93,0x6e,0x1c,0x72]
+
//------------------------------------------------------------------------------
// Logical (shifted register)
//------------------------------------------------------------------------------
diff --git a/test/MC/AArch64/dot-req-case-insensitive.s b/test/MC/AArch64/dot-req-case-insensitive.s
new file mode 100644
index 0000000..e68b101
--- /dev/null
+++ b/test/MC/AArch64/dot-req-case-insensitive.s
@@ -0,0 +1,18 @@
+// RUN: llvm-mc -triple=arm64-eabi < %s | FileCheck %s
+_foo:
+ OBJECT .req x2
+ mov x4, OBJECT
+ mov x4, oBjEcT
+ .unreq oBJECT
+
+_foo2:
+ OBJECT .req w5
+ mov w4, OBJECT
+ .unreq OBJECT
+
+// CHECK-LABEL: _foo:
+// CHECK: mov x4, x2
+// CHECK: mov x4, x2
+
+// CHECK-LABEL: _foo2:
+// CHECK: mov w4, w5
diff --git a/test/MC/AArch64/dot-req-diagnostics.s b/test/MC/AArch64/dot-req-diagnostics.s
new file mode 100644
index 0000000..44065f8
--- /dev/null
+++ b/test/MC/AArch64/dot-req-diagnostics.s
@@ -0,0 +1,37 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ERROR %s
+
+bar:
+ fred .req x5
+ fred .req x6
+// CHECK-ERROR: warning: ignoring redefinition of register alias 'fred'
+// CHECK-ERROR: fred .req x6
+// CHECK-ERROR: ^
+
+ ada .req v2.8b
+// CHECK-ERROR: error: vector register without type specifier expected
+// CHECK-ERROR: ada .req v2.8b
+// CHECK-ERROR: ^
+
+ bob .req lisa
+// CHECK-ERROR: error: register name or alias expected
+// CHECK-ERROR: bob .req lisa
+// CHECK-ERROR: ^
+
+ lisa .req x1, 23
+// CHECK-ERROR: error: unexpected input in .req directive
+// CHECK-ERROR: lisa .req x1, 23
+// CHECK-ERROR: ^
+
+ mov bob, fred
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mov bob, fred
+// CHECK-ERROR: ^
+
+ .unreq 1
+// CHECK-ERROR: error: unexpected input in .unreq directive.
+// CHECK-ERROR: .unreq 1
+// CHECK-ERROR: ^
+
+ mov x1, fred
+// CHECK: mov x1, x5
+// CHECK-NOT: mov x1, x6
diff --git a/test/MC/AArch64/dot-req.s b/test/MC/AArch64/dot-req.s
new file mode 100644
index 0000000..947f945
--- /dev/null
+++ b/test/MC/AArch64/dot-req.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s
+
+bar:
+ fred .req x5
+ mov fred, x11
+ .unreq fred
+ fred .req w6
+ mov w1, fred
+
+ bob .req fred
+ ada .req w1
+ mov ada, bob
+ .unreq bob
+ .unreq fred
+ .unreq ada
+// CHECK: mov x5, x11 // encoding: [0xe5,0x03,0x0b,0xaa]
+// CHECK: mov w1, w6 // encoding: [0xe1,0x03,0x06,0x2a]
+// CHECK: mov w1, w6 // encoding: [0xe1,0x03,0x06,0x2a]
+
+ bob .req b6
+ hanah .req h5
+ sam .req s4
+ dora .req d3
+ quentin .req q2
+ vesna .req v1
+ addv bob, v0.8b
+ mov hanah, v4.h[3]
+ fadd s0, sam, sam
+ fmov d2, dora
+ ldr quentin, [sp]
+ mov v0.8b, vesna.8b
+// CHECK: addv b6, v0.8b // encoding: [0x06,0xb8,0x31,0x0e]
+// CHECK: mov h5, v4.h[3] // encoding: [0x85,0x04,0x0e,0x5e]
+// CHECK: fadd s0, s4, s4 // encoding: [0x80,0x28,0x24,0x1e]
+// CHECK: fmov d2, d3 // encoding: [0x62,0x40,0x60,0x1e]
+// CHECK: ldr q2, [sp] // encoding: [0xe2,0x03,0xc0,0x3d]
+// CHECK: mov v0.8b, v1.8b // encoding: [0x20,0x1c,0xa1,0x0e]
diff --git a/test/MC/AArch64/ldr-pseudo-obj-errors.s b/test/MC/AArch64/ldr-pseudo-obj-errors.s
new file mode 100644
index 0000000..7f1b642
--- /dev/null
+++ b/test/MC/AArch64/ldr-pseudo-obj-errors.s
@@ -0,0 +1,13 @@
+//RUN: not llvm-mc -triple=aarch64-linux -filetype=obj %s -o %t1 2> %t2
+//RUN: cat %t2 | FileCheck %s
+
+//These tests look for errors that should be reported for invalid object layout
+//with the ldr pseudo. They are tested separately from parse errors because they
+//only trigger when the file has successfully parsed and the object file is about
+//to be written out.
+
+.text
+foo:
+ ldr x0, =0x10111
+ .space 0xdeadb0
+// CHECK: LLVM ERROR: fixup value out of range
diff --git a/test/MC/AArch64/ldr-pseudo.s b/test/MC/AArch64/ldr-pseudo.s
new file mode 100644
index 0000000..6c82fb9
--- /dev/null
+++ b/test/MC/AArch64/ldr-pseudo.s
@@ -0,0 +1,231 @@
+//RUN: llvm-mc -triple=aarch64-linux-gnu %s | FileCheck %s
+
+//
+// Check that large constants are converted to ldr from constant pool
+//
+// simple test
+.section a, "ax", @progbits
+// CHECK-LABEL: f1:
+f1:
+ ldr x0, =0x1234
+// CHECK: movz x0, #0x1234
+ ldr w1, =0x4567
+// CHECK: movz w1, #0x4567
+ ldr x0, =0x12340000
+// CHECK: movz x0, #0x1234, lsl #16
+ ldr w1, =0x45670000
+// CHECK: movz w1, #0x4567, lsl #16
+ ldr x0, =0xabc00000000
+// CHECK: movz x0, #0xabc, lsl #32
+ ldr x0, =0xbeef000000000000
+// CHECK: movz x0, #0xbeef, lsl #48
+
+.section b,"ax",@progbits
+// CHECK-LABEL: f3:
+f3:
+ ldr x0, =0x10001
+// CHECK: ldr x0, .Ltmp[[TMP0:[0-9]+]]
+
+// loading multiple constants
+.section c,"ax",@progbits
+// CHECK-LABEL: f4:
+f4:
+ ldr x0, =0x10002
+// CHECK: ldr x0, .Ltmp[[TMP1:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ ldr x0, =0x10003
+// CHECK: ldr x0, .Ltmp[[TMP2:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+// TODO: the same constants should have the same constant pool location
+.section d,"ax",@progbits
+// CHECK-LABEL: f5:
+f5:
+ ldr x0, =0x10004
+// CHECK: ldr x0, .Ltmp[[TMP3:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ ldr x0, =0x10004
+// CHECK: ldr x0, .Ltmp[[TMP4:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+// a section defined in multiple pieces should be merged and use a single constant pool
+.section e,"ax",@progbits
+// CHECK-LABEL: f6:
+f6:
+ ldr x0, =0x10006
+// CHECK: ldr x0, .Ltmp[[TMP5:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+.section f, "ax", @progbits
+// CHECK-LABEL: f7:
+f7:
+ adds x0, x0, #1
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+.section e, "ax", @progbits
+// CHECK-LABEL: f8:
+f8:
+ adds x0, x0, #1
+ ldr x0, =0x10007
+// CHECK: ldr x0, .Ltmp[[TMP6:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+//
+// Check that symbols can be loaded using ldr pseudo
+//
+
+// load an undefined symbol
+.section g,"ax",@progbits
+// CHECK-LABEL: f9:
+f9:
+ ldr x0, =foo
+// CHECK: ldr x0, .Ltmp[[TMP7:[0-9]+]]
+
+// load a symbol from another section
+.section h,"ax",@progbits
+// CHECK-LABEL: f10:
+f10:
+ ldr x0, =f5
+// CHECK: ldr x0, .Ltmp[[TMP8:[0-9]+]]
+
+// load a symbol from the same section
+.section i,"ax",@progbits
+// CHECK-LABEL: f11:
+f11:
+ ldr x0, =f12
+// CHECK: ldr x0, .Ltmp[[TMP9:[0-9]+]]
+ ldr w0,=0x3C000
+// CHECK: ldr w0, .Ltmp[[TMP10:[0-9]+]]
+
+// CHECK-LABEL: f12:
+f12:
+ adds x0, x0, #1
+ adds x0, x0, #1
+
+.section j,"ax",@progbits
+// mix of symbols and constants
+// CHECK-LABEL: f13:
+f13:
+ adds x0, x0, #1
+ adds x0, x0, #1
+ ldr x0, =0x101
+// CHECK: movz x0, #0x101
+ adds x0, x0, #1
+ adds x0, x0, #1
+ ldr x0, =bar
+// CHECK: ldr x0, .Ltmp[[TMP11:[0-9]+]]
+ adds x0, x0, #1
+ adds x0, x0, #1
+//
+// Check for correct usage in other contexts
+//
+
+// usage in macro
+.macro useit_in_a_macro
+ ldr x0, =0x10008
+ ldr x0, =baz
+.endm
+.section k,"ax",@progbits
+// CHECK-LABEL: f14:
+f14:
+ useit_in_a_macro
+// CHECK: ldr x0, .Ltmp[[TMP12:[0-9]+]]
+// CHECK: ldr x0, .Ltmp[[TMP13:[0-9]+]]
+
+// usage with expressions
+.section l, "ax", @progbits
+// CHECK-LABEL: f15:
+f15:
+ ldr x0, =0x10001+8
+// CHECK: ldr x0, .Ltmp[[TMP14:[0-9]+]]
+ adds x0, x0, #1
+ ldr x0, =bar+4
+// CHECK: ldr x0, .Ltmp[[TMP15:[0-9]+]]
+ adds x0, x0, #1
+
+//
+// Constant Pools
+//
+// CHECK: .section b,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP0]]
+// CHECK: .word 65537
+
+// CHECK: .section c,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP1]]
+// CHECK: .word 65538
+// CHECK: .Ltmp[[TMP2]]
+// CHECK: .word 65539
+
+// CHECK: .section d,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP3]]
+// CHECK: .word 65540
+// CHECK: .Ltmp[[TMP4]]
+// CHECK: .word 65540
+
+// CHECK: .section e,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP5]]
+// CHECK: .word 65542
+// CHECK: .Ltmp[[TMP6]]
+// CHECK: .word 65543
+
+// Should not switch to section because it has no constant pool
+// CHECK-NOT: .section f,"ax",@progbits
+
+// CHECK: .section g,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP7]]
+// CHECK: .word foo
+
+// CHECK: .section h,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP8]]
+// CHECK: .word f5
+
+// CHECK: .section i,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP9]]
+// CHECK: .word f12
+// CHECK: .Ltmp[[TMP10]]
+// CHECK: .word 245760
+
+// CHECK: .section j,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP11]]
+// CHECK: .word bar
+
+// CHECK: .section k,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP12]]
+// CHECK: .word 65544
+// CHECK: .Ltmp[[TMP13]]
+// CHECK: .word baz
+
+// CHECK: .section l,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP14]]
+// CHECK: .word 65545
+// CHECK: .Ltmp[[TMP15]]
+// CHECK: .word bar+4
diff --git a/test/MC/AArch64/lit.local.cfg b/test/MC/AArch64/lit.local.cfg
index 1be70c0..5822b72 100644
--- a/test/MC/AArch64/lit.local.cfg
+++ b/test/MC/AArch64/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if 'AArch64' not in targets:
+if 'AArch64' not in config.root.targets:
config.unsupported = True
diff --git a/test/MC/ARM/AlignedBundling/lit.local.cfg b/test/MC/ARM/AlignedBundling/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/ARM/AlignedBundling/lit.local.cfg
+++ b/test/MC/ARM/AlignedBundling/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/ARM/Windows/multiple-text-sections.s b/test/MC/ARM/Windows/multiple-text-sections.s
new file mode 100644
index 0000000..241eee4
--- /dev/null
+++ b/test/MC/ARM/Windows/multiple-text-sections.s
@@ -0,0 +1,58 @@
+@ RUN: llvm-mc -triple thumbv7-windows-itanium -filetype obj -o - %s \
+@ RUN: | llvm-readobj -s - | FileCheck %s
+
+ .syntax unified
+ .text
+ .thumb
+
+ .section .text,"xr",one_only,a
+
+ .def a;
+ .scl 2;
+ .type 32;
+ .endef
+a:
+ movs r0, #65
+ bx lr
+
+ .section .text,"xr",one_only,b
+
+ .def b;
+ .scl 2;
+ .type 32;
+ .endef
+ .thumb_func
+b:
+ movs r0, #66
+ bx lr
+
+@ CHECK: Sections [
+@ CHECK: Section {
+@ CHECK: Name: .text
+@ CHECK: Characteristics [
+@ CHECK: IMAGE_SCN_CNT_CODE
+@ CHECK: IMAGE_SCN_MEM_16BIT
+@ CHECK: IMAGE_SCN_MEM_EXECUTE
+@ CHECK: IMAGE_SCN_MEM_READ
+@ CHECK: ]
+@ CHECK: }
+@ CHECK: Section {
+@ CHECK: Name: .text
+@ CHECK: Characteristics [
+@ CHECK: IMAGE_SCN_CNT_CODE
+@ CHECK: IMAGE_SCN_MEM_16BIT
+@ CHECK: IMAGE_SCN_MEM_EXECUTE
+@ CHECK: IMAGE_SCN_MEM_READ
+@ CHECK: ]
+@ CHECK: }
+@ CHECK: Section {
+@ CHECK: Name: .text
+@ CHECK: Characteristics [
+@ CHECK: IMAGE_SCN_CNT_CODE
+@ CHECK: IMAGE_SCN_MEM_16BIT
+@ CHECK: IMAGE_SCN_MEM_EXECUTE
+@ CHECK: IMAGE_SCN_MEM_READ
+@ CHECK: ]
+@ CHECK: }
+@ CHECK: ]
+
diff --git a/test/MC/ARM/Windows/text-attributes.s b/test/MC/ARM/Windows/text-attributes.s
new file mode 100644
index 0000000..62aa028
--- /dev/null
+++ b/test/MC/ARM/Windows/text-attributes.s
@@ -0,0 +1,30 @@
+@ RUN: llvm-mc -triple thumbv7-windows-itanium -filetype obj -o - %s \
+@ RUN: | llvm-readobj -s - | FileCheck %s
+
+ .syntax unified
+ .thumb
+
+ .text
+
+ .def function
+ .type 32
+ .scl 2
+ .endef
+ .global function
+ .thumb_func
+function:
+ bx lr
+
+@ CHECK: Sections [
+@ CHECK: Section {
+@ CHECK: Name: .text
+@ CHECK: Characteristics [
+@ CHECK: IMAGE_SCN_ALIGN_4BYTES
+@ CHECK: IMAGE_SCN_CNT_CODE
+@ CHECK: IMAGE_SCN_MEM_16BIT
+@ CHECK: IMAGE_SCN_MEM_EXECUTE
+@ CHECK: IMAGE_SCN_MEM_PURGEABLE
+@ CHECK: IMAGE_SCN_MEM_READ
+@ CHECK: ]
+@ CHECK: }
+@ CHECK: ]
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 62d7dae..88c5fb5 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -351,6 +351,24 @@
@ CHECK-ERRORS: ubfxgt r4, r5, #16, #17
@ CHECK-ERRORS: ^
+ @ Using pc for SBFX/UBFX
+ sbfx pc, r2, #1, #3
+ sbfx sp, pc, #4, #5
+ ubfx pc, r0, #0, #31
+ ubfx r14, pc, #1, #2
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: sbfx pc, r2, #1, #3
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: sbfx sp, pc, #4, #5
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: ubfx pc, r0, #0, #31
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: ubfx r14, pc, #1, #2
+@ CHECK-ERRORS: ^
+
@ Out of order Rt/Rt2 operands for ldrd
ldrd r4, r3, [r8]
ldrd r4, r3, [r8, #8]!
diff --git a/test/MC/ARM/dwarf-asm-multiple-sections.s b/test/MC/ARM/dwarf-asm-multiple-sections.s
new file mode 100644
index 0000000..ed1b89e
--- /dev/null
+++ b/test/MC/ARM/dwarf-asm-multiple-sections.s
@@ -0,0 +1,79 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 2 2>&1 | FileCheck -check-prefix VERSION %s
+// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 1 2>&1 | FileCheck -check-prefix DWARF1 %s
+// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 5 2>&1 | FileCheck -check-prefix DWARF5 %s
+ .section .text, "ax"
+a:
+ mov r0, r0
+
+ .section foo, "ax"
+b:
+ mov r1, r1
+
+// DWARF: .debug_abbrev contents:
+// DWARF: Abbrev table for offset: 0x00000000
+// DWARF: [1] DW_TAG_compile_unit DW_CHILDREN_yes
+// DWARF: DW_AT_stmt_list DW_FORM_data4
+// DWARF: DW_AT_ranges DW_FORM_data4
+// DWARF: DW_AT_name DW_FORM_string
+// DWARF: DW_AT_comp_dir DW_FORM_string
+// DWARF: DW_AT_producer DW_FORM_string
+// DWARF: DW_AT_language DW_FORM_data2
+
+// DWARF: .debug_info contents:
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
+// DWARF-NOT: DW_TAG_
+// DWARF: DW_AT_ranges [DW_FORM_data4] (0x00000000)
+
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_label [2] *
+// DWARF-NEXT: DW_AT_name [DW_FORM_string] ("a")
+
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_label [2] *
+// DWARF-NEXT: DW_AT_name [DW_FORM_string] ("b")
+
+
+// DWARF: .debug_aranges contents:
+// DWARF-NEXT: Address Range Header: length = 0x00000024, version = 0x0002, cu_offset = 0x00000000, addr_size = 0x04, seg_size = 0x00
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+
+
+// DWARF: .debug_line contents:
+// DWARF: 0x0000000000000000 9 0 1 0 0 is_stmt
+// DWARF-NEXT: 0x0000000000000004 9 0 1 0 0 is_stmt end_sequence
+// DWARF-NEXT: 0x0000000000000000 13 0 1 0 0 is_stmt
+// DWARF-NEXT: 0x0000000000000004 13 0 1 0 0 is_stmt end_sequence
+
+
+// DWARF: .debug_ranges contents:
+// DWARF: 00000000 ffffffff 00000000
+// DWARF: 00000000 00000000 00000004
+// DWARF: 00000000 ffffffff 00000000
+// DWARF: 00000000 00000000 00000004
+// DWARF: 00000000 <End of list>
+
+
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_info]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
+// RELOC-NEXT: 0000000c R_ARM_ABS32 .debug_line
+// RELOC-NEXT: 00000010 R_ARM_ABS32 .debug_ranges
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 foo
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_ranges]:
+// RELOC-NEXT: 00000004 R_ARM_ABS32 .text
+// RELOC-NEXT: 00000014 R_ARM_ABS32 foo
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_aranges]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_info
+// RELOC-NEXT: 00000010 R_ARM_ABS32 .text
+// RELOC-NEXT: 00000018 R_ARM_ABS32 foo
+
+
+// VERSION: {{.*}} error: DWARF2 only supports one section per compilation unit
+
+// DWARF1: Dwarf version 1 is not supported.
+// DWARF5: Dwarf version 5 is not supported.
diff --git a/test/MC/ARM/dwarf-asm-no-code.s b/test/MC/ARM/dwarf-asm-no-code.s
new file mode 100644
index 0000000..7d06a41
--- /dev/null
+++ b/test/MC/ARM/dwarf-asm-no-code.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+
+// If there is no code in an assembly file, no debug info is produced
+
+.section .data, "aw"
+a:
+.long 42
+
+// DWARF: .debug_abbrev contents:
+// DWARF-NEXT: < EMPTY >
+
+// DWARF: .debug_info contents:
+
+// DWARF: .debug_aranges contents:
+
+// DWARF: .debug_line contents:
+
+// DWARF: .debug_ranges contents:
+
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_info]:
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_ranges]:
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_aranges]:
diff --git a/test/MC/ARM/dwarf-asm-nonstandard-section.s b/test/MC/ARM/dwarf-asm-nonstandard-section.s
new file mode 100644
index 0000000..497a39a
--- /dev/null
+++ b/test/MC/ARM/dwarf-asm-nonstandard-section.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+
+ .section foo, "ax"
+b:
+ mov r1, r1
+
+// DWARF: .debug_abbrev contents:
+// DWARF: Abbrev table for offset: 0x00000000
+// DWARF: [1] DW_TAG_compile_unit DW_CHILDREN_yes
+// DWARF: DW_AT_stmt_list DW_FORM_data4
+// DWARF: DW_AT_low_pc DW_FORM_addr
+// DWARF: DW_AT_high_pc DW_FORM_addr
+// DWARF: DW_AT_name DW_FORM_string
+// DWARF: DW_AT_comp_dir DW_FORM_string
+// DWARF: DW_AT_producer DW_FORM_string
+// DWARF: DW_AT_language DW_FORM_data2
+
+// DWARF: .debug_info contents:
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
+// DWARF-NOT: DW_TAG_
+// DWARF: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
+// DWARF: DW_AT_high_pc [DW_FORM_addr] (0x0000000000000004)
+
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_label [2] *
+// DWARF-NEXT: DW_AT_name [DW_FORM_string] ("b")
+
+
+// DWARF: .debug_aranges contents:
+// DWARF-NEXT: Address Range Header: length = 0x0000001c, version = 0x0002, cu_offset = 0x00000000, addr_size = 0x04, seg_size = 0x00
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+
+
+// DWARF: .debug_line contents:
+// DWARF: 0x0000000000000000 7 0 1 0 0 is_stmt
+// DWARF-NEXT: 0x0000000000000004 7 0 1 0 0 is_stmt end_sequence
+
+
+// DWARF: .debug_ranges contents:
+// DWARF-NOT: {{[0-9a-f]}}
+// DWARF: .debug_pubnames contents:
+
+
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_info]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
+// RELOC-NEXT: 0000000c R_ARM_ABS32 .debug_line
+// RELOC-NEXT: R_ARM_ABS32 foo
+// RELOC-NEXT: R_ARM_ABS32 foo
+// RELOC-NEXT: R_ARM_ABS32 foo
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_ranges]:
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_aranges]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_info
+// RELOC-NEXT: 00000010 R_ARM_ABS32 foo
diff --git a/test/MC/ARM/dwarf-asm-single-section.s b/test/MC/ARM/dwarf-asm-single-section.s
new file mode 100644
index 0000000..c57e649
--- /dev/null
+++ b/test/MC/ARM/dwarf-asm-single-section.s
@@ -0,0 +1,56 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+
+ .section .text, "ax"
+a:
+ mov r0, r0
+
+
+// DWARF: .debug_abbrev contents:
+// DWARF: Abbrev table for offset: 0x00000000
+// DWARF: [1] DW_TAG_compile_unit DW_CHILDREN_yes
+// DWARF: DW_AT_stmt_list DW_FORM_data4
+// DWARF: DW_AT_low_pc DW_FORM_addr
+// DWARF: DW_AT_high_pc DW_FORM_addr
+// DWARF: DW_AT_name DW_FORM_string
+// DWARF: DW_AT_comp_dir DW_FORM_string
+// DWARF: DW_AT_producer DW_FORM_string
+// DWARF: DW_AT_language DW_FORM_data2
+
+// DWARF: .debug_info contents:
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
+// DWARF-NOT: DW_TAG_
+// DWARF: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
+// DWARF: DW_AT_high_pc [DW_FORM_addr] (0x0000000000000004)
+
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_label [2] *
+// DWARF-NEXT: DW_AT_name [DW_FORM_string] ("a")
+
+
+// DWARF: .debug_aranges contents:
+// DWARF-NEXT: Address Range Header: length = 0x0000001c, version = 0x0002, cu_offset = 0x00000000, addr_size = 0x04, seg_size = 0x00
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+
+// DWARF: .debug_line contents:
+// DWARF: 0x0000000000000000 7 0 1 0 0 is_stmt
+// DWARF-NEXT: 0x0000000000000004 7 0 1 0 0 is_stmt end_sequence
+
+
+// DWARF: .debug_ranges contents:
+// DWARF-NOT: {{[0-9a-f]}}
+// DWARF: .debug_pubnames contents:
+
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_info]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
+// RELOC-NEXT: 0000000c R_ARM_ABS32 .debug_line
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 .text
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_ranges]:
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_aranges]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_info
+// RELOC-NEXT: 00000010 R_ARM_ABS32 .text
diff --git a/test/MC/ARM/gas-compl-copr-reg.s b/test/MC/ARM/gas-compl-copr-reg.s
new file mode 100644
index 0000000..ab0b023
--- /dev/null
+++ b/test/MC/ARM/gas-compl-copr-reg.s
@@ -0,0 +1,7 @@
+@ RUN: llvm-mc -triple=armv7-linux-gnueabi -show-encoding < %s | FileCheck %s
+
+@ CHECK: ldc p12, c4, [r0, #4] @ encoding: [0x01,0x4c,0x90,0xed]
+@ CHECK: stc p14, c6, [r2, #-224] @ encoding: [0x38,0x6e,0x02,0xed]
+
+ ldc p12, cr4, [r0, #4]
+ stc p14, cr6, [r2, #-224]
diff --git a/test/MC/ARM/lit.local.cfg b/test/MC/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/MC/ARM/lit.local.cfg
+++ b/test/MC/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/ARM/macho-relocs-with-addend.s b/test/MC/ARM/macho-relocs-with-addend.s
new file mode 100644
index 0000000..fee930e
--- /dev/null
+++ b/test/MC/ARM/macho-relocs-with-addend.s
@@ -0,0 +1,34 @@
+@ RUN: llvm-mc -triple thumbv7-apple-ios7.0 -filetype=obj -o - %s | \
+@ RUN: llvm-readobj -r - | FileCheck %s
+
+ @ MachO relocations that end up expressed as internal
+ @ (scattered) still need to have the type set correctly.
+
+ .text
+ .thumb_func
+ .thumb
+ .globl _with_thumb
+_with_thumb:
+ bl _dest+10
+ blx _dest+20
+
+ .globl _with_arm
+ .arm
+_with_arm:
+ bl _dest+10
+ blx _dest+20
+ bne _dest+30
+ b _dest+40
+
+ .data
+_dest:
+ .word 42
+
+@ CHECK: Relocations [
+@ CHECK-NEXT: Section __text {
+@ CHECK-NEXT: 0x14 1 2 n/a ARM_RELOC_BR24 1 0x18
+@ CHECK-NEXT: 0x10 1 2 n/a ARM_RELOC_BR24 1 0x18
+@ CHECK-NEXT: 0xC 1 2 n/a ARM_RELOC_BR24 1 0x18
+@ CHECK-NEXT: 0x8 1 2 n/a ARM_RELOC_BR24 1 0x18
+@ CHECK-NEXT: 0x4 1 2 n/a ARM_THUMB_RELOC_BR22 1 0x18
+@ CHECK-NEXT: 0x0 1 2 n/a ARM_THUMB_RELOC_BR22 1 0x18
diff --git a/test/MC/ARM/thumb-types.s b/test/MC/ARM/thumb-types.s
index 2fd7152..b3aaf7d 100644
--- a/test/MC/ARM/thumb-types.s
+++ b/test/MC/ARM/thumb-types.s
@@ -29,6 +29,12 @@ untyped_text_label:
explicit_function:
nop
+ .long tls(TPOFF)
+
+ .type indirect_function,%gnu_indirect_function
+indirect_function:
+ nop
+
.data
untyped_data_label:
@@ -38,6 +44,14 @@ untyped_data_label:
explicit_data:
.long 0
+ .section .tdata,"awT",%progbits
+ .type tls,%object
+ .align 2
+tls:
+ .long 42
+ .size tls, 4
+
+
@ CHECK: Symbol {
@ CHECK: Name: arm_function
@ CHECK: Value: 0x6
@@ -69,6 +83,18 @@ explicit_data:
@ CHECK: }
@ CHECK: Symbol {
+@ CHECK: Name: indirect_function
+@ CHECK: Value: 0x13
+@ CHECK: Type: GNU_IFunc
+@ CHECK: }
+
+@ CHECK: Symbol {
+@ CHECK: Name: tls
+@ CHECK: Value: 0x0
+@ CHECK: Type: TLS
+@ CHECK: }
+
+@ CHECK: Symbol {
@ CHECK: Name: untyped_data_label
@ CHECK: Value: 0x0
@ CHECK: Type: None
diff --git a/test/MC/AsmParser/cfi-invalid-startproc.s b/test/MC/AsmParser/cfi-invalid-startproc.s
deleted file mode 100644
index 57ded13..0000000
--- a/test/MC/AsmParser/cfi-invalid-startproc.s
+++ /dev/null
@@ -1,16 +0,0 @@
-# RUN: not llvm-mc -triple=x86_64-apple-macosx10.8 -filetype=obj -o %t %s 2>&1 | FileCheck %s
-# Check that the cfi_startproc is declared after the beginning of
-# a procedure, otherwise it will reference an invalid symbol for
-# emitting the relocation.
-# <rdar://problem/15939159>
-
-# CHECK: No symbol to start a frame
-.text
-.cfi_startproc
-.globl _someFunction
-_someFunction:
-.cfi_def_cfa_offset 16
-.cfi_offset %rbp, -16
-.cfi_def_cfa_register rbp
- ret
-.cfi_endproc
diff --git a/test/MC/AsmParser/conditional_asm.s b/test/MC/AsmParser/conditional_asm.s
index b9bee33..ecbceb1 100644
--- a/test/MC/AsmParser/conditional_asm.s
+++ b/test/MC/AsmParser/conditional_asm.s
@@ -11,6 +11,66 @@
.endif
.endif
+# CHECK: .byte 0
+# CHECK-NOT: .byte 1
+.ifeq 32 - 32
+ .byte 0
+.else
+ .byte 1
+.endif
+
+# CHECK: .byte 0
+# CHECK: .byte 1
+# CHECK-NOT: .byte 2
+.ifge 32 - 31
+ .byte 0
+.endif
+.ifge 32 - 32
+ .byte 1
+.endif
+.ifge 32 - 33
+ .byte 2
+.endif
+
+# CHECK: .byte 0
+# CHECK-NOT: .byte 1
+# CHECK-NOT: .byte 2
+.ifgt 32 - 31
+ .byte 0
+.endif
+.ifgt 32 - 32
+ .byte 1
+.endif
+.ifgt 32 - 33
+ .byte 2
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK: .byte 1
+# CHECK: .byte 2
+.ifle 32 - 31
+ .byte 0
+.endif
+.ifle 32 - 32
+ .byte 1
+.endif
+.ifle 32 - 33
+ .byte 2
+.endif
+
+# CHECK-NOT: .byte 0
+# CHECK-NOT: .byte 1
+# CHECK: .byte 2
+.iflt 32 - 31
+ .byte 0
+.endif
+.iflt 32 - 32
+ .byte 1
+.endif
+.iflt 32 - 33
+ .byte 2
+.endif
+
# CHECK: .byte 1
# CHECK-NOT: .byte 0
.ifne 32 - 32
diff --git a/test/MC/AsmParser/directive_file.s b/test/MC/AsmParser/directive_file.s
index 9b99e0f..d7290eb 100644
--- a/test/MC/AsmParser/directive_file.s
+++ b/test/MC/AsmParser/directive_file.s
@@ -1,4 +1,5 @@
# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: llvm-mc -triple i386-unknown-unknown %s -filetype=null
.file "hello"
.file 1 "worl\144" # "\144" is "d"
diff --git a/test/MC/AsmParser/directive_line.s b/test/MC/AsmParser/directive_line.s
index 94ce446..110b68a 100644
--- a/test/MC/AsmParser/directive_line.s
+++ b/test/MC/AsmParser/directive_line.s
@@ -1,4 +1,5 @@
# RUN: llvm-mc -triple i386-unknown-unknown %s
+# RUN: llvm-mc -triple i386-unknown-unknown %s -filetype=null
# FIXME: Actually test the output.
.line
diff --git a/test/MC/AsmParser/directive_loc.s b/test/MC/AsmParser/directive_loc.s
index cda9579..404ebce 100644
--- a/test/MC/AsmParser/directive_loc.s
+++ b/test/MC/AsmParser/directive_loc.s
@@ -1,4 +1,5 @@
# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: llvm-mc -triple i386-unknown-unknown %s -filetype=null
.file 1 "hello"
# CHECK: .file 1 "hello"
diff --git a/test/MC/AsmParser/directive_seh.s b/test/MC/AsmParser/directive_seh.s
index f6eb970..1821747 100644
--- a/test/MC/AsmParser/directive_seh.s
+++ b/test/MC/AsmParser/directive_seh.s
@@ -1,36 +1,25 @@
# RUN: llvm-mc -triple x86_64-pc-win32 %s | FileCheck %s
-# CHECK: .seh_proc func
-# CHECK: .seh_pushframe @code
-# CHECK: .seh_stackalloc 24
-# CHECK: .seh_savereg %rbp, 16
-# CHECK: .seh_savexmm %r8, 0
-# CHECK: .seh_pushreg %rbx
-# CHECK: .seh_setframe %rbx, 0
-# CHECK: .seh_endprologue
-# CHECK: .seh_handler __C_specific_handler, @except
-# CHECK-NOT: .section{{.*}}.xdata
-# CHECK: .seh_handlerdata
-# CHECK: .text
-# CHECK: .seh_startchained
-# CHECK: .seh_endprologue
-# CHECK: .seh_endchained
-# CHECK: .seh_endproc
-
.text
.globl func
.def func; .scl 2; .type 32; .endef
.seh_proc func
+# CHECK: .seh_proc func
func:
.seh_pushframe @code
+# CHECK: .seh_pushframe @code
subq $24, %rsp
.seh_stackalloc 24
+# CHECK: .seh_stackalloc 24
movq %rsi, 16(%rsp)
.seh_savereg %rsi, 16
+# CHECK: .seh_savereg 6, 16
movups %xmm8, (%rsp)
.seh_savexmm %xmm8, 0
+# CHECK: .seh_savexmm 8, 0
pushq %rbx
.seh_pushreg 3
+# CHECK: .seh_pushreg 3
mov %rsp, %rbx
.seh_setframe 3, 0
.seh_endprologue
@@ -41,8 +30,18 @@ func:
.seh_startchained
.seh_endprologue
.seh_endchained
+# CHECK: .seh_setframe 3, 0
+# CHECK: .seh_endprologue
+# CHECK: .seh_handler __C_specific_handler, @except
+# CHECK-NOT: .section{{.*}}.xdata
+# CHECK: .seh_handlerdata
+# CHECK: .text
+# CHECK: .seh_startchained
+# CHECK: .seh_endprologue
+# CHECK: .seh_endchained
lea (%rbx), %rsp
pop %rbx
addq $24, %rsp
ret
.seh_endproc
+# CHECK: .seh_endproc
diff --git a/test/MC/AsmParser/if-diagnostics.s b/test/MC/AsmParser/if-diagnostics.s
new file mode 100644
index 0000000..d102a56
--- /dev/null
+++ b/test/MC/AsmParser/if-diagnostics.s
@@ -0,0 +1,29 @@
+// RUN: not llvm-mc -triple i386 %s -o /dev/null 2>&1 | FileCheck %s
+
+.if
+.endif
+
+// CHECK: error: unknown token in expression
+// CHECK: .if
+// CHECK: ^
+
+.ifeq 0, 3
+.endif
+
+// CHECK: error: unexpected token in '.if' directive
+// CHECK: .ifeq 0, 3
+// CHECK: ^
+
+.iflt "string1"
+.endif
+
+// CHECK: error: expected absolute expression
+// CHECK: .iflt "string1"
+// CHECK: ^
+
+.ifge test
+.endif
+
+// CHECK: error: expected absolute expression
+// CHECK: .ifge test
+// CHECK: ^
diff --git a/test/MC/AsmParser/lit.local.cfg b/test/MC/AsmParser/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/AsmParser/lit.local.cfg
+++ b/test/MC/AsmParser/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/AsmParser/vararg.s b/test/MC/AsmParser/vararg.s
index b27668e..e3236b0 100644
--- a/test/MC/AsmParser/vararg.s
+++ b/test/MC/AsmParser/vararg.s
@@ -17,6 +17,12 @@
.endif
.endm
+.macro ifcc4 arg0, arg1:vararg
+.if cc
+ movl \arg1, \arg0
+.endif
+.endm
+
.text
// CHECK: movl %esp, %ebp
@@ -25,6 +31,8 @@
// CHECK: movl %ecx, %ebx
// CHECK: movl %ecx, %eax
// CHECK: movl %eax, %ecx
+// CHECK: movl %ecx, %eax
+// CHECK: movl %eax, %ecx
.set cc,1
ifcc movl %esp, %ebp
subl $0, %esp
@@ -33,6 +41,8 @@
ifcc2 %ecx, %ebx
ifcc3 %ecx %eax
ifcc3 %eax, %ecx
+ ifcc4 %eax %ecx ## test
+ ifcc4 %ecx, %eax ## test
// CHECK-NOT movl
// CHECK: subl $1, %esp
diff --git a/test/MC/COFF/alias.s b/test/MC/COFF/alias.s
index dc4f65a..eb5398a 100644
--- a/test/MC/COFF/alias.s
+++ b/test/MC/COFF/alias.s
@@ -1,4 +1,5 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-readobj -t -r | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - \
+// RUN: | llvm-readobj -t -r | FileCheck %s
local1:
external_aliased_to_local = local1
@@ -36,7 +37,7 @@ weak_aliased_to_external = external2
// CHECK-NEXT: AuxSymbolCount: 1
// CHECK: }
// CHECK: Symbol {
-// CHECK-NEXT: Name: local1
+// CHECK: Name: local1
// CHECK-NEXT: Value: 0
// CHECK-NEXT: Section: .text (1)
// CHECK-NEXT: BaseType: Null (0x0)
@@ -89,7 +90,7 @@ weak_aliased_to_external = external2
// CHECK-NEXT: StorageClass: WeakExternal (0x69)
// CHECK-NEXT: AuxSymbolCount: 1
// CHECK-NEXT: AuxWeakExternal {
-// CHECK-NEXT: Linked: external2 (9)
+// CHECK-NEXT: Linked: external2 (13)
// CHECK-NEXT: Search: Library (0x2)
// CHECK-NEXT: Unused: (00 00 00 00 00 00 00 00 00 00)
// CHECK-NEXT: }
diff --git a/test/MC/COFF/basic-coff-64.s b/test/MC/COFF/basic-coff-64.s
index 89d1745..38a9e57 100644
--- a/test/MC/COFF/basic-coff-64.s
+++ b/test/MC/COFF/basic-coff-64.s
@@ -25,10 +25,10 @@ _main: # @main
// CHECK: ImageFileHeader {
// CHECK: Machine: IMAGE_FILE_MACHINE_AMD64
-// CHECK: SectionCount: 2
+// CHECK: SectionCount: 3
// CHECK: TimeDateStamp: {{[0-9]+}}
// CHECK: PointerToSymbolTable: 0x{{[0-9A-F]+}}
-// CHECK: SymbolCount: 6
+// CHECK: SymbolCount: 8
// CHECK: OptionalHeaderSize: 0
// CHECK: Characteristics [ (0x0)
// CHECK: ]
diff --git a/test/MC/COFF/basic-coff.s b/test/MC/COFF/basic-coff.s
index 9b29970..38bfa6d 100644
--- a/test/MC/COFF/basic-coff.s
+++ b/test/MC/COFF/basic-coff.s
@@ -25,10 +25,10 @@ L_.str: # @.str
// CHECK: ImageFileHeader {
// CHECK: Machine: IMAGE_FILE_MACHINE_I386
-// CHECK: SectionCount: 2
+// CHECK: SectionCount: 3
// CHECK: TimeDateStamp: {{[0-9]+}}
// CHECK: PointerToSymbolTable: 0x{{[0-9A-F]+}}
-// CHECK: SymbolCount: 6
+// CHECK: SymbolCount: 8
// CHECK: OptionalHeaderSize: 0
// CHECK: Characteristics [ (0x0)
// CHECK: ]
diff --git a/test/MC/COFF/early-dce.s b/test/MC/COFF/early-dce.s
new file mode 100644
index 0000000..ec1a9bd
--- /dev/null
+++ b/test/MC/COFF/early-dce.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -triple i686-windows -g -filetype obj -o - %s \
+# RUN: | llvm-readobj -s -t | FileCheck %s
+
+ .section .rdata
+
+ .align 8
+ .global data
+data:
+ .quad 0
+
+# CHECK: Sections [
+# CHECK: Section {
+# CHECK: Name: .text
+# CHECK: }
+# CHECK: ]
+
diff --git a/test/MC/COFF/global_ctors_dtors.ll b/test/MC/COFF/global_ctors_dtors.ll
index 046e93a..ca17f24 100644
--- a/test/MC/COFF/global_ctors_dtors.ll
+++ b/test/MC/COFF/global_ctors_dtors.ll
@@ -11,9 +11,10 @@
%ini = type { i32, void()*, i8* }
-@llvm.global_ctors = appending global [2 x %ini ] [
+@llvm.global_ctors = appending global [3 x %ini ] [
%ini { i32 65535, void ()* @a_global_ctor, i8* null },
- %ini { i32 65535, void ()* @b_global_ctor, i8* bitcast (i32* @b to i8*) }
+ %ini { i32 65535, void ()* @b_global_ctor, i8* bitcast (i32* @b to i8*) },
+ %ini { i32 65535, void ()* @c_global_ctor, i8* bitcast (i32* @c to i8*) }
]
@llvm.global_dtors = appending global [1 x %ini ] [%ini { i32 65535, void ()* @a_global_dtor, i8* null }]
@@ -26,11 +27,18 @@ define void @a_global_ctor() nounwind {
@b = global i32 zeroinitializer
+@c = available_externally dllimport global i32 zeroinitializer
+
define void @b_global_ctor() nounwind {
store i32 42, i32* @b
ret void
}
+define void @c_global_ctor() nounwind {
+ store i32 42, i32* @c
+ ret void
+}
+
define void @a_global_dtor() nounwind {
%1 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str2, i32 0, i32 0))
ret void
@@ -43,13 +51,15 @@ define i32 @main() nounwind {
; WIN32: .section .CRT$XCU,"rd"
; WIN32: a_global_ctor
-; WIN32: .section .CRT$XCU,"rd",associative .bss,{{_?}}b
+; WIN32: .section .CRT$XCU,"rd",associative,{{_?}}b
; WIN32: b_global_ctor
+; WIN32-NOT: c_global_ctor
; WIN32: .section .CRT$XTX,"rd"
; WIN32: a_global_dtor
; MINGW32: .section .ctors,"wd"
; MINGW32: a_global_ctor
-; MINGW32: .section .ctors,"wd",associative .bss,{{_?}}b
+; MINGW32: .section .ctors,"wd",associative,{{_?}}b
; MINGW32: b_global_ctor
+; MINGW32-NOT: c_global_ctor
; MINGW32: .section .dtors,"wd"
; MINGW32: a_global_dtor
diff --git a/test/MC/COFF/linker-options.ll b/test/MC/COFF/linker-options.ll
index de11941..0be74e5 100755
--- a/test/MC/COFF/linker-options.ll
+++ b/test/MC/COFF/linker-options.ll
@@ -5,6 +5,7 @@
metadata !{ metadata !"/DEFAULTLIB:msvcrt.lib" },
metadata !{ metadata !"/DEFAULTLIB:msvcrt.lib",
metadata !"/DEFAULTLIB:secur32.lib" },
+ metadata !{ metadata !"/DEFAULTLIB:C:\5Cpath to\5Casan_rt.lib" },
metadata !{ metadata !"/with spaces" } } }
!llvm.module.flags = !{ !0 }
@@ -17,5 +18,6 @@ define dllexport void @foo() {
; CHECK: .ascii " /DEFAULTLIB:msvcrt.lib"
; CHECK: .ascii " /DEFAULTLIB:msvcrt.lib"
; CHECK: .ascii " /DEFAULTLIB:secur32.lib"
+; CHECK: .ascii " \"/DEFAULTLIB:C:\\path to\\asan_rt.lib\""
; CHECK: .ascii " \"/with spaces\""
; CHECK: .ascii " /EXPORT:_foo"
diff --git a/test/MC/COFF/linkonce-invalid.s b/test/MC/COFF/linkonce-invalid.s
index 90ce4a7..cc3a27c 100644
--- a/test/MC/COFF/linkonce-invalid.s
+++ b/test/MC/COFF/linkonce-invalid.s
@@ -19,21 +19,9 @@
// CHECK: error: unexpected token in directive
.linkonce discard foo
-// CHECK: error: expected associated section name
+// CHECK: error: cannot make section associative with .linkonce
.linkonce associative
-// CHECK: error: cannot associate unknown section 'unknown'
-.linkonce associative unknown
-
-// CHECK: error: cannot associate a section with itself
-.linkonce associative invalid
-
-// CHECK: error: associated section must be a COMDAT section
-.linkonce associative non_comdat
-
-// CHECK: error: associated section cannot be itself associative
-.linkonce associative assoc
-
// CHECK: error: section 'multi' is already linkonce
.section multi
.linkonce discard
diff --git a/test/MC/COFF/linkonce.s b/test/MC/COFF/linkonce.s
index e7b7f47..f2e3506 100644
--- a/test/MC/COFF/linkonce.s
+++ b/test/MC/COFF/linkonce.s
@@ -24,7 +24,6 @@
.long 1
.section s6
-.linkonce associative s1
.long 1
.section s7
@@ -39,11 +38,6 @@
.linkonce discard
.long 1
-// Check that valid '.section' names can be associated.
-.section multi
-.linkonce associative .foo$bar
-.long 1
-
// CHECK: Sections [
// CHECK: Section {
@@ -79,7 +73,6 @@
// CHECK: Section {
// CHECK: Name: s6
// CHECK: Characteristics [
-// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
@@ -94,86 +87,64 @@
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
-// CHECK: Section {
-// CHECK: Name: multi
-// CHECK: Characteristics [
-// CHECK: IMAGE_SCN_LNK_COMDAT
-// CHECK: ]
-// CHECK: }
// CHECK: ]
// CHECK: Symbols [
// CHECK: Symbol {
// CHECK: Name: s1
-// CHECK: Section: s1 (1)
+// CHECK: Section: s1 (4)
// CHECK: AuxSectionDef {
-// CHECK: Number: 1
+// CHECK: Number: 4
// CHECK: Selection: Any (0x2)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s2
-// CHECK: Section: s2 (2)
+// CHECK: Section: s2 (5)
// CHECK: AuxSectionDef {
-// CHECK: Number: 2
+// CHECK: Number: 5
// CHECK: Selection: NoDuplicates (0x1)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s3
-// CHECK: Section: s3 (3)
+// CHECK: Section: s3 (6)
// CHECK: AuxSectionDef {
-// CHECK: Number: 3
+// CHECK: Number: 6
// CHECK: Selection: Any (0x2)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s4
-// CHECK: Section: s4 (4)
+// CHECK: Section: s4 (7)
// CHECK: AuxSectionDef {
-// CHECK: Number: 4
+// CHECK: Number: 7
// CHECK: Selection: SameSize (0x3)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s5
-// CHECK: Section: s5 (5)
+// CHECK: Section: s5 (8)
// CHECK: AuxSectionDef {
-// CHECK: Number: 5
+// CHECK: Number: 8
// CHECK: Selection: ExactMatch (0x4)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s6
-// CHECK: Section: s6 (6)
-// CHECK: AuxSectionDef {
-// CHECK: Number: 1
-// CHECK: Selection: Associative (0x5)
-// CHECK: AssocSection: s1
-// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s7
-// CHECK: Section: s7 (7)
+// CHECK: Section: s7 (10)
// CHECK: AuxSectionDef {
-// CHECK: Number: 7
+// CHECK: Number: 10
// CHECK: Selection: Largest (0x6)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: s8
-// CHECK: Section: s8 (8)
+// CHECK: Section: s8 (11)
// CHECK: AuxSectionDef {
-// CHECK: Number: 8
+// CHECK: Number: 11
// CHECK: Selection: Newest (0x7)
// CHECK: }
// CHECK: }
-// CHECK: Symbol {
-// CHECK: Name: multi
-// CHECK: Value: 0
-// CHECK: Section: multi (10)
-// CHECK: AuxSectionDef {
-// CHECK: Number: 9
-// CHECK: Selection: Associative (0x5)
-// CHECK: AssocSection: .foo$bar
-// CHECK: }
-// CHECK: }
diff --git a/test/MC/COFF/lit.local.cfg b/test/MC/COFF/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/COFF/lit.local.cfg
+++ b/test/MC/COFF/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/COFF/lset0.s b/test/MC/COFF/lset0.s
index f5020c8..7321b01 100755
--- a/test/MC/COFF/lset0.s
+++ b/test/MC/COFF/lset0.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm - | FileCheck %s
not_global = 123
global = 456
diff --git a/test/MC/COFF/section-comdat-conflict.s b/test/MC/COFF/section-comdat-conflict.s
new file mode 100644
index 0000000..7ed452a
--- /dev/null
+++ b/test/MC/COFF/section-comdat-conflict.s
@@ -0,0 +1,13 @@
+// RUN: not llvm-mc -triple i386-pc-win32 -filetype=obj < %s 2>&1 | FileCheck %s
+
+// CHECK: conflicting sections for symbol
+
+ .section .xyz
+ .global bar
+bar:
+ .long 42
+
+ .section .abcd,"xr",discard,bar
+ .global foo
+foo:
+ .long 42
diff --git a/test/MC/COFF/section-comdat-conflict2.s b/test/MC/COFF/section-comdat-conflict2.s
new file mode 100644
index 0000000..e2dfc2d
--- /dev/null
+++ b/test/MC/COFF/section-comdat-conflict2.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple i386-pc-win32 -filetype=obj < %s 2>&1 | FileCheck %s
+
+// CHECK: two sections have the same comdat
+
+ .section .xyz,"xr",discard,bar
+ .section .abcd,"xr",discard,bar
diff --git a/test/MC/COFF/section-comdat.s b/test/MC/COFF/section-comdat.s
index dd5be87..e7052d8 100644
--- a/test/MC/COFF/section-comdat.s
+++ b/test/MC/COFF/section-comdat.s
@@ -1,8 +1,7 @@
// RUN: llvm-mc -triple i386-pc-win32 -filetype=obj %s | llvm-readobj -s -t | FileCheck %s
// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -t | FileCheck %s
-.section assocSec
-.linkonce
+.section assocSec, "dr", discard, "assocSym"
.long 1
.section secName, "dr", discard, "Symbol1"
@@ -25,7 +24,7 @@ Symbol3:
Symbol4:
.long 1
-.section SecName, "dr", associative assocSec, "Symbol5"
+.section SecName, "dr", associative, "assocSym"
.globl Symbol5
Symbol5:
.long 1
@@ -40,58 +39,63 @@ Symbol6:
Symbol7:
.long 1
+.section assocSec, "dr", associative, "assocSym"
+.globl Symbol8
+Symbol8:
+.long 1
+
// CHECK: Sections [
// CHECK: Section {
-// CHECK: Number: 1
+// CHECK: Number: 4
// CHECK: Name: assocSec
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 2
+// CHECK: Number: 5
// CHECK: Name: secName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 3
+// CHECK: Number: 6
// CHECK: Name: secName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 4
+// CHECK: Number: 7
// CHECK: Name: SecName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 5
+// CHECK: Number: 8
// CHECK: Name: SecName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 6
+// CHECK: Number: 9
// CHECK: Name: SecName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 7
+// CHECK: Number: 10
// CHECK: Name: SecName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
// CHECK: ]
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 8
+// CHECK: Number: 11
// CHECK: Name: SecName
// CHECK: Characteristics [
// CHECK: IMAGE_SCN_LNK_COMDAT
@@ -101,88 +105,104 @@ Symbol7:
// CHECK: Symbols [
// CHECK: Symbol {
// CHECK: Name: assocSec
-// CHECK: Section: assocSec (1)
+// CHECK: Section: assocSec (4)
// CHECK: AuxSectionDef {
// CHECK: Selection: Any
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: assocSym
+// CHECK: Section: assocSec
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: secName
-// CHECK: Section: secName (2)
+// CHECK: Section: secName (5)
// CHECK: AuxSectionDef {
// CHECK: Selection: Any
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: Symbol1
+// CHECK: Section: secName (5)
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: secName
-// CHECK: Section: secName (3)
+// CHECK: Section: secName (6)
// CHECK: AuxSectionDef {
// CHECK: Selection: NoDuplicates
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: Symbol2
+// CHECK: Section: secName (6)
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: SecName
-// CHECK: Section: SecName (4)
+// CHECK: Section: SecName (7)
// CHECK: AuxSectionDef {
// CHECK: Selection: SameSize
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: Symbol3
+// CHECK: Section: SecName (7)
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: SecName
-// CHECK: Section: SecName (5)
+// CHECK: Section: SecName (8)
// CHECK: AuxSymbolCount: 1
// CHECK: AuxSectionDef {
// CHECK: Selection: ExactMatch
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: Symbol4
+// CHECK: Section: SecName (8)
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: SecName
-// CHECK: Section: SecName (6)
+// CHECK: Section: SecName (9)
// CHECK: AuxSectionDef {
// CHECK: Selection: Associative
-// CHECK: AssocSection: assocSec (1)
+// CHECK: AssocSection: assocSec (4)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: SecName
-// CHECK: Section: SecName (7)
+// CHECK: Section: SecName (10)
// CHECK: AuxSectionDef {
// CHECK: Selection: Largest
// CHECK: }
// CHECK: }
// CHECK: Symbol {
+// CHECK: Name: Symbol6
+// CHECK: Section: SecName (10)
+// CHECK: }
+// CHECK: Symbol {
// CHECK: Name: SecName
-// CHECK: Section: SecName (8)
+// CHECK: Section: SecName (11)
// CHECK: AuxSectionDef {
// CHECK: Selection: Newest (0x7)
// CHECK: }
// CHECK: }
// CHECK: Symbol {
-// CHECK: Name: Symbol1
-// CHECK: Section: secName (2)
-// CHECK: }
-// CHECK: Symbol {
-// CHECK: Name: Symbol2
-// CHECK: Section: secName (3)
-// CHECK: }
-// CHECK: Symbol {
-// CHECK: Name: Symbol3
-// CHECK: Section: SecName (4)
+// CHECK: Name: Symbol7
+// CHECK: Section: SecName (11)
// CHECK: }
// CHECK: Symbol {
-// CHECK: Name: Symbol4
-// CHECK: Section: SecName (5)
+// CHECK: Name: assocSec
+// CHECK: Section: assocSec (12)
+// CHECK: AuxSectionDef {
+// CHECK: Selection: Associative (0x5)
+// CHECK: AssocSection: assocSec (4)
+// CHECK: }
// CHECK: }
// CHECK: Symbol {
// CHECK: Name: Symbol5
-// CHECK: Section: SecName (6)
+// CHECK: Section: SecName (9)
// CHECK: }
// CHECK: Symbol {
-// CHECK: Name: Symbol6
-// CHECK: Section: SecName (7)
-// CHECK: }
-// CHECK: Symbol {
-// CHECK: Name: Symbol7
-// CHECK: Section: SecName (8)
+// CHECK: Name: Symbol8
+// CHECK: Section: assocSec (12)
// CHECK: }
// CHECK: ]
diff --git a/test/MC/COFF/section-name-encoding.s b/test/MC/COFF/section-name-encoding.s
index 74cd490..7edd6d7 100644
--- a/test/MC/COFF/section-name-encoding.s
+++ b/test/MC/COFF/section-name-encoding.s
@@ -10,11 +10,11 @@
// Raw encoding
// CHECK: Section {
-// CHECK: Number: 1
+// CHECK: Number: 4
// CHECK: Name: s (73 00 00 00 00 00 00 00)
// CHECK: }
// CHECK: Section {
-// CHECK: Number: 2
+// CHECK: Number: 5
// CHECK: Name: s1234567 (73 31 32 33 34 35 36 37)
// CHECK: }
.section s; .long 1
@@ -25,7 +25,7 @@
// /4
// CHECK: Section {
-// CHECK: Number: 3
+// CHECK: Number: 6
// CHECK: Name: s12345678 (2F 34 00 00 00 00 00 00)
// CHECK: }
.section s12345678; .long 1
@@ -57,7 +57,7 @@ pad_sections aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// "s12345678\0" # of pad sections
//
// CHECK: Section {
-// CHECK: Number: 9
+// CHECK: Number: 12
// CHECK: Name: seven_digit (2F 31 30 30 30 30 32 39)
// CHECK: }
.section seven_digit; .long 1
@@ -82,7 +82,7 @@ pad_sections_ex aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// "2F 2F 41 41 6D 4A 61 34" is "//AAmJa4", which decodes to "0 0 38 9 26 56".
//
// CHECK: Section {
-// CHECK: Number: 15
+// CHECK: Number: 18
// CHECK: Name: double_slash (2F 2F 41 41 6D 4A 61 34)
// CHECK: }
.section double_slash; .long 1
diff --git a/test/MC/COFF/seh-stackalloc-zero.s b/test/MC/COFF/seh-stackalloc-zero.s
new file mode 100644
index 0000000..898ac84
--- /dev/null
+++ b/test/MC/COFF/seh-stackalloc-zero.s
@@ -0,0 +1,11 @@
+// RUN: not llvm-mc -triple x86_64-pc-win32 -filetype=obj %s -o %t.o 2>&1 | FileCheck %s
+
+// CHECK: Allocation size must be non-zero!
+
+ .globl smallFunc
+ .def smallFunc; .scl 2; .type 32; .endef
+ .seh_proc smallFunc
+ .seh_stackalloc 0
+smallFunc:
+ ret
+ .seh_endproc
diff --git a/test/MC/COFF/seh.s b/test/MC/COFF/seh.s
index 72d42f4..cd884b4 100644
--- a/test/MC/COFF/seh.s
+++ b/test/MC/COFF/seh.s
@@ -35,13 +35,13 @@
// CHECK-NEXT: ]
// CHECK-NEXT: Relocations [
-// CHECK-NEXT: Section (2) .xdata {
+// CHECK-NEXT: Section (4) .xdata {
// CHECK-NEXT: 0x14 IMAGE_REL_AMD64_ADDR32NB __C_specific_handler
// CHECK-NEXT: 0x20 IMAGE_REL_AMD64_ADDR32NB func
// CHECK-NEXT: 0x24 IMAGE_REL_AMD64_ADDR32NB func
// CHECK-NEXT: 0x28 IMAGE_REL_AMD64_ADDR32NB .xdata
// CHECK-NEXT: }
-// CHECK-NEXT: Section (3) .pdata {
+// CHECK-NEXT: Section (5) .pdata {
// CHECK-NEXT: 0x0 IMAGE_REL_AMD64_ADDR32NB func
// CHECK-NEXT: 0x4 IMAGE_REL_AMD64_ADDR32NB func
// CHECK-NEXT: 0x8 IMAGE_REL_AMD64_ADDR32NB .xdata
diff --git a/test/MC/COFF/symbol-fragment-offset-64.s b/test/MC/COFF/symbol-fragment-offset-64.s
index b824470..deac888 100644
--- a/test/MC/COFF/symbol-fragment-offset-64.s
+++ b/test/MC/COFF/symbol-fragment-offset-64.s
@@ -36,10 +36,10 @@ _main: # @main
// CHECK: {
// CHECK: Machine: IMAGE_FILE_MACHINE_AMD64
-// CHECK: SectionCount: 2
+// CHECK: SectionCount: 3
// CHECK: TimeDateStamp: {{[0-9]+}}
// CHECK: PointerToSymbolTable: 0x{{[0-9A-F]+}}
-// CHECK: SymbolCount: 7
+// CHECK: SymbolCount: 9
// CHECK: OptionalHeaderSize: 0
// CHECK: Characteristics [ (0x0)
// CHECK: ]
diff --git a/test/MC/COFF/symbol-fragment-offset.s b/test/MC/COFF/symbol-fragment-offset.s
index 71b1703..b09c5af 100644
--- a/test/MC/COFF/symbol-fragment-offset.s
+++ b/test/MC/COFF/symbol-fragment-offset.s
@@ -36,10 +36,10 @@ L_.str2:
// CHECK: {
// CHECK: Machine: IMAGE_FILE_MACHINE_I386 (0x14C)
-// CHECK: SectionCount: 2
+// CHECK: SectionCount: 3
// CHECK: TimeDateStamp: {{[0-9]+}}
// CHECK: PointerToSymbolTable: 0x{{[0-9A-F]+}}
-// CHECK: SymbolCount: 7
+// CHECK: SymbolCount: 9
// CHECK: OptionalHeaderSize: 0
// CHECK: Characteristics [ (0x0)
// CHECK: ]
diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
index 397a39e..23da001 100644
--- a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
+++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
@@ -945,10 +945,15 @@
# CHECK: cset x9, pl
# CHECK: csetm w20, ne
# CHECK: csetm x30, ge
+# "cset w2, nv" and "csetm x3, al" are invalid aliases for these two
+# CHECK: csinc w2, wzr, wzr, al
+# CHECK: csinv x3, xzr, xzr, nv
0xe3 0x17 0x9f 0x1a
0xe9 0x47 0x9f 0x9a
0xf4 0x3 0x9f 0x5a
0xfe 0xb3 0x9f 0xda
+0xe2,0xe7,0x9f,0x1a
+0xe3,0xf3,0x9f,0xda
# CHECK: cinc w3, w5, gt
# CHECK: cinc wzr, w4, le
@@ -956,25 +961,35 @@
# CHECK: cinc x3, x5, gt
# CHECK: cinc xzr, x4, le
# CHECK: cset x9, lt
+# "cinc w5, w6, al" and "cinc x1, x2, nv" are invalid aliases for these two
+# CHECK: csinc w5, w6, w6, nv
+# CHECK: csinc x1, x2, x2, al
0xa3 0xd4 0x85 0x1a
0x9f 0xc4 0x84 0x1a
0xe9 0xa7 0x9f 0x1a
0xa3 0xd4 0x85 0x9a
0x9f 0xc4 0x84 0x9a
0xe9 0xa7 0x9f 0x9a
+0xc5,0xf4,0x86,0x1a
+0x41,0xe4,0x82,0x9a
# CHECK: cinv w3, w5, gt
# CHECK: cinv wzr, w4, le
-# CHECK: csetm w9, lt
+# CHECK: csetm w9, lt
# CHECK: cinv x3, x5, gt
# CHECK: cinv xzr, x4, le
-# CHECK: csetm x9, lt
+# CHECK: csetm x9, lt
+# "cinv x1, x0, nv" and "cinv w9, w8, al" are invalid aliases for these two
+# CHECK: csinv x1, x0, x0, al
+# CHECK: csinv w9, w8, w8, nv
0xa3 0xd0 0x85 0x5a
0x9f 0xc0 0x84 0x5a
0xe9 0xa3 0x9f 0x5a
0xa3 0xd0 0x85 0xda
0x9f 0xc0 0x84 0xda
0xe9 0xa3 0x9f 0xda
+0x01 0xe0 0x80 0xda
+0x09,0xf1,0x88,0x5a
# CHECK: cneg w3, w5, gt
# CHECK: cneg wzr, w4, le
@@ -982,12 +997,17 @@
# CHECK: cneg x3, x5, gt
# CHECK: cneg xzr, x4, le
# CHECK: cneg x9, xzr, lt
+# "cneg x4, x8, nv" and "cneg w5, w6, al" are invalid aliases for these two
+# CHECK: csneg x4, x8, x8, al
+# CHECK: csinv w9, w8, w8, nv
0xa3 0xd4 0x85 0x5a
0x9f 0xc4 0x84 0x5a
0xe9 0xa7 0x9f 0x5a
0xa3 0xd4 0x85 0xda
0x9f 0xc4 0x84 0xda
0xe9 0xa7 0x9f 0xda
+0x04,0xe5,0x88,0xda
+0x09,0xf1,0x88,0x5a
#------------------------------------------------------------------------------
# Data-processing (1 source)
diff --git a/test/MC/Disassembler/AArch64/lit.local.cfg b/test/MC/Disassembler/AArch64/lit.local.cfg
index 2c423d1..180bb8a 100644
--- a/test/MC/Disassembler/AArch64/lit.local.cfg
+++ b/test/MC/Disassembler/AArch64/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if 'AArch64' not in targets:
+if 'AArch64' not in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/ARM/hex-immediates.txt b/test/MC/Disassembler/ARM/hex-immediates.txt
index 2634d7e..875d667 100644
--- a/test/MC/Disassembler/ARM/hex-immediates.txt
+++ b/test/MC/Disassembler/ARM/hex-immediates.txt
@@ -1,5 +1,11 @@
-# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -hdis < %s | FileCheck %s
+# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 --disassemble --print-imm-hex < %s | FileCheck %s
# CHECK: ldr r4, [pc, #0x20]
0x08 0x4c
# CHECK: sub sp, #0x84
0xa1 0xb0
+# CHECK: ldr r0, [sp, #0xb4]
+0x2d 0x98
+# CHECK: str.w r8, [sp, #0xb4]
+0xcd 0xf8 0xb4 0x80
+# CHECK: ldr.w r8, [sp, #0xb4]
+0xdd 0xf8 0xb4 0x80
diff --git a/test/MC/Disassembler/ARM/lit.local.cfg b/test/MC/Disassembler/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/MC/Disassembler/ARM/lit.local.cfg
+++ b/test/MC/Disassembler/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/Mips/lit.local.cfg b/test/MC/Disassembler/Mips/lit.local.cfg
index 1fa54b4..a3183a2 100644
--- a/test/MC/Disassembler/Mips/lit.local.cfg
+++ b/test/MC/Disassembler/Mips/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Mips' in targets:
+if not 'Mips' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/Mips/mips32r6.txt b/test/MC/Disassembler/Mips/mips32r6.txt
index adbcd99..afef8ad 100644
--- a/test/MC/Disassembler/Mips/mips32r6.txt
+++ b/test/MC/Disassembler/Mips/mips32r6.txt
@@ -30,6 +30,8 @@
0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
0x60 0x02 0x01 0x4d # CHECK: bnezalc $2,
0xd8 0xa0 0x46 0x90 # CHECK: beqzc $5, 72256
+0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 256
+0x18 0x43 0x00 0x40 # CHECK: bgeuc $2, $3, 256
0x18 0x42 0x01 0x4d # CHECK: bgezalc $2,
0xf8 0xa0 0x46 0x90 # CHECK: bnezc $5, 72256
0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 256
@@ -40,44 +42,46 @@
0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 256
0x7c 0x02 0x20 0x20 # CHECK: bitswap $4, $2
0x18 0x02 0x01 0x4d # CHECK: blezalc $2,
+0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 256
+0x1c 0xa6 0x00 0x40 # CHECK: bltuc $5, $6, 256
0x60 0x00 0x00 0x01 # CHECK: bnvc $zero, $zero, 4
0x60 0x40 0x00 0x01 # CHECK: bnvc $2, $zero, 4
0x60 0x82 0x00 0x01 # CHECK: bnvc $4, $2, 4
0x20 0x00 0x00 0x01 # CHECK: bovc $zero, $zero, 4
0x20 0x40 0x00 0x01 # CHECK: bovc $2, $zero, 4
0x20 0x82 0x00 0x01 # CHECK: bovc $4, $2, 4
-0x46 0x84 0x18 0x80 # CHECK: cmp.f.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x80 # CHECK: cmp.f.d $f2, $f3, $f4
+0x46 0x84 0x18 0x80 # CHECK: cmp.af.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x80 # CHECK: cmp.af.d $f2, $f3, $f4
0x46 0x84 0x18 0x81 # CHECK: cmp.un.s $f2, $f3, $f4
0x46 0xa4 0x18 0x81 # CHECK: cmp.un.d $f2, $f3, $f4
0x46 0x84 0x18 0x82 # CHECK: cmp.eq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x82 # CHECK: cmp.eq.d $f2, $f3, $f4
0x46 0x84 0x18 0x83 # CHECK: cmp.ueq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x83 # CHECK: cmp.ueq.d $f2, $f3, $f4
-0x46 0x84 0x18 0x84 # CHECK: cmp.olt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x84 # CHECK: cmp.olt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x84 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x84 # CHECK: cmp.lt.d $f2, $f3, $f4
0x46 0x84 0x18 0x85 # CHECK: cmp.ult.s $f2, $f3, $f4
0x46 0xa4 0x18 0x85 # CHECK: cmp.ult.d $f2, $f3, $f4
-0x46 0x84 0x18 0x86 # CHECK: cmp.ole.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x86 # CHECK: cmp.ole.d $f2, $f3, $f4
+0x46 0x84 0x18 0x86 # CHECK: cmp.le.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x86 # CHECK: cmp.le.d $f2, $f3, $f4
0x46 0x84 0x18 0x87 # CHECK: cmp.ule.s $f2, $f3, $f4
0x46 0xa4 0x18 0x87 # CHECK: cmp.ule.d $f2, $f3, $f4
-0x46 0x84 0x18 0x88 # CHECK: cmp.sf.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x88 # CHECK: cmp.sf.d $f2, $f3, $f4
-0x46 0x84 0x18 0x89 # CHECK: cmp.ngle.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x89 # CHECK: cmp.ngle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x88 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x88 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x46 0x84 0x18 0x89 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x89 # CHECK: cmp.sun.d $f2, $f3, $f4
0x46 0x84 0x18 0x8a # CHECK: cmp.seq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x8a # CHECK: cmp.seq.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8b # CHECK: cmp.ngl.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8b # CHECK: cmp.ngl.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8c # CHECK: cmp.lt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8c # CHECK: cmp.lt.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8d # CHECK: cmp.nge.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8d # CHECK: cmp.nge.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8e # CHECK: cmp.le.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8e # CHECK: cmp.le.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8f # CHECK: cmp.ngt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8f # CHECK: cmp.ngt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8b # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8b # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8c # CHECK: cmp.slt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8c # CHECK: cmp.slt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8d # CHECK: cmp.sult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8d # CHECK: cmp.sult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8e # CHECK: cmp.sle.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8e # CHECK: cmp.sle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8f # CHECK: cmp.sule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8f # CHECK: cmp.sule.d $f2, $f3, $f4
0x00 0x64 0x10 0x9a # CHECK: div $2, $3, $4
0x00 0x64 0x10 0x9b # CHECK: divu $2, $3, $4
# 0xf8 0x05 0x01 0x00 # CHECK-TODO: jialc $5, 256
@@ -114,3 +118,10 @@
0x46 0x20 0x20 0x9a # CHECK: rint.d $f2, $f4
0x46 0x00 0x20 0x9b # CHECK: class.s $f2, $f4
0x46 0x20 0x20 0x9b # CHECK: class.d $f2, $f4
+0x00 0x80 0x04 0x09 # CHECK: jr.hb $4
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x7e 0x42 0xb3 0xb6 # CHECK: ll $2, -153($18)
+0x7e 0x6f 0xec 0x26 # CHECK: sc $15, -40($19)
+0x00 0xa0 0x58 0x51 # CHECK: clo $11, $5
+0x03 0x80 0xe8 0x50 # CHECK: clz $sp, $gp
diff --git a/test/MC/Disassembler/Mips/mips64r6.txt b/test/MC/Disassembler/Mips/mips64r6.txt
index f5bb14e..3ddef9ab 100644
--- a/test/MC/Disassembler/Mips/mips64r6.txt
+++ b/test/MC/Disassembler/Mips/mips64r6.txt
@@ -30,6 +30,8 @@
0x60 0xa6 0x00 0x40 # CHECK: bnec $5, $6, 256
0x60 0x02 0x01 0x4d # CHECK: bnezalc $2,
0xd8 0xa0 0x46 0x90 # CHECK: beqzc $5, 72256
+0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 256
+0x18 0x43 0x00 0x40 # CHECK: bgeuc $2, $3, 256
0x18 0x42 0x01 0x4d # CHECK: bgezalc $2,
0xf8 0xa0 0x46 0x90 # CHECK: bnezc $5, 72256
0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 256
@@ -40,44 +42,46 @@
0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 256
0x7c 0x02 0x20 0x20 # CHECK: bitswap $4, $2
0x18 0x02 0x01 0x4d # CHECK: blezalc $2,
+0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 256
+0x1c 0xa6 0x00 0x40 # CHECK: bltuc $5, $6, 256
0x60 0x00 0x00 0x01 # CHECK: bnvc $zero, $zero, 4
0x60 0x40 0x00 0x01 # CHECK: bnvc $2, $zero, 4
0x60 0x82 0x00 0x01 # CHECK: bnvc $4, $2, 4
0x20 0x00 0x00 0x01 # CHECK: bovc $zero, $zero, 4
0x20 0x40 0x00 0x01 # CHECK: bovc $2, $zero, 4
0x20 0x82 0x00 0x01 # CHECK: bovc $4, $2, 4
-0x46 0x84 0x18 0x80 # CHECK: cmp.f.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x80 # CHECK: cmp.f.d $f2, $f3, $f4
+0x46 0x84 0x18 0x80 # CHECK: cmp.af.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x80 # CHECK: cmp.af.d $f2, $f3, $f4
0x46 0x84 0x18 0x81 # CHECK: cmp.un.s $f2, $f3, $f4
0x46 0xa4 0x18 0x81 # CHECK: cmp.un.d $f2, $f3, $f4
0x46 0x84 0x18 0x82 # CHECK: cmp.eq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x82 # CHECK: cmp.eq.d $f2, $f3, $f4
0x46 0x84 0x18 0x83 # CHECK: cmp.ueq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x83 # CHECK: cmp.ueq.d $f2, $f3, $f4
-0x46 0x84 0x18 0x84 # CHECK: cmp.olt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x84 # CHECK: cmp.olt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x84 # CHECK: cmp.lt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x84 # CHECK: cmp.lt.d $f2, $f3, $f4
0x46 0x84 0x18 0x85 # CHECK: cmp.ult.s $f2, $f3, $f4
0x46 0xa4 0x18 0x85 # CHECK: cmp.ult.d $f2, $f3, $f4
-0x46 0x84 0x18 0x86 # CHECK: cmp.ole.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x86 # CHECK: cmp.ole.d $f2, $f3, $f4
+0x46 0x84 0x18 0x86 # CHECK: cmp.le.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x86 # CHECK: cmp.le.d $f2, $f3, $f4
0x46 0x84 0x18 0x87 # CHECK: cmp.ule.s $f2, $f3, $f4
0x46 0xa4 0x18 0x87 # CHECK: cmp.ule.d $f2, $f3, $f4
-0x46 0x84 0x18 0x88 # CHECK: cmp.sf.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x88 # CHECK: cmp.sf.d $f2, $f3, $f4
-0x46 0x84 0x18 0x89 # CHECK: cmp.ngle.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x89 # CHECK: cmp.ngle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x88 # CHECK: cmp.saf.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x88 # CHECK: cmp.saf.d $f2, $f3, $f4
+0x46 0x84 0x18 0x89 # CHECK: cmp.sun.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x89 # CHECK: cmp.sun.d $f2, $f3, $f4
0x46 0x84 0x18 0x8a # CHECK: cmp.seq.s $f2, $f3, $f4
0x46 0xa4 0x18 0x8a # CHECK: cmp.seq.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8b # CHECK: cmp.ngl.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8b # CHECK: cmp.ngl.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8c # CHECK: cmp.lt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8c # CHECK: cmp.lt.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8d # CHECK: cmp.nge.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8d # CHECK: cmp.nge.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8e # CHECK: cmp.le.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8e # CHECK: cmp.le.d $f2, $f3, $f4
-0x46 0x84 0x18 0x8f # CHECK: cmp.ngt.s $f2, $f3, $f4
-0x46 0xa4 0x18 0x8f # CHECK: cmp.ngt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8b # CHECK: cmp.sueq.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8b # CHECK: cmp.sueq.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8c # CHECK: cmp.slt.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8c # CHECK: cmp.slt.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8d # CHECK: cmp.sult.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8d # CHECK: cmp.sult.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8e # CHECK: cmp.sle.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8e # CHECK: cmp.sle.d $f2, $f3, $f4
+0x46 0x84 0x18 0x8f # CHECK: cmp.sule.s $f2, $f3, $f4
+0x46 0xa4 0x18 0x8f # CHECK: cmp.sule.d $f2, $f3, $f4
0x7c 0x43 0x23 0x64 # CHECK: dalign $4, $2, $3, 5
0x74 0x62 0x12 0x34 # CHECK: daui $3, $2, 4660
0x04 0x66 0x56 0x78 # CHECK: dahi $3, 22136
@@ -99,10 +103,10 @@
0x00 0x64 0x10 0xd8 # CHECK: muh $2, $3, $4
0x00 0x64 0x10 0x99 # CHECK: mulu $2, $3, $4
0x00 0x64 0x10 0xd9 # CHECK: muhu $2, $3, $4
-0x00 0x64 0x10 0xb8 # CHECK: dmul $2, $3, $4
-0x00 0x64 0x10 0xf8 # CHECK: dmuh $2, $3, $4
-0x00 0x64 0x10 0xb9 # CHECK: dmulu $2, $3, $4
-0x00 0x64 0x10 0xf9 # CHECK: dmuhu $2, $3, $4
+0x00 0x64 0x10 0x9c # CHECK: dmul $2, $3, $4
+0x00 0x64 0x10 0xdc # CHECK: dmuh $2, $3, $4
+0x00 0x64 0x10 0x9d # CHECK: dmulu $2, $3, $4
+0x00 0x64 0x10 0xdd # CHECK: dmuhu $2, $3, $4
0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
@@ -127,3 +131,15 @@
0x46 0x20 0x20 0x9a # CHECK: rint.d $f2, $f4
0x46 0x00 0x20 0x9b # CHECK: class.s $f2, $f4
0x46 0x20 0x20 0x9b # CHECK: class.d $f2, $f4
+0xec 0x58 0x3c 0x48 # CHECK: ldpc $2, 123456
+0x00 0x80 0x04 0x09 # CHECK: jr.hb $4
+0x00 0x80 0xfc 0x09 # CHECK: jalr.hb $4
+0x00 0xa0 0x24 0x09 # CHECK: jalr.hb $4, $5
+0x7e 0x42 0xb3 0xb6 # CHECK: ll $2, -153($18)
+0x7f 0xe0 0x38 0x37 # CHECK: lld $zero, 112($ra)
+0x7e 0x6f 0xec 0x26 # CHECK: sc $15, -40($19)
+0x7f 0xaf 0xe6 0xa7 # CHECK: scd $15, -51($sp)
+0x00 0xa0 0x58 0x51 # CHECK: clo $11, $5
+0x03 0x80 0xe8 0x50 # CHECK: clz $sp, $gp
+0x00 0xc0 0x90 0x53 # CHECK: dclo $18, $6
+0x03 0x20 0x80 0x52 # CHECK: dclz $16, $25
diff --git a/test/MC/Disassembler/PowerPC/lit.local.cfg b/test/MC/Disassembler/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/MC/Disassembler/PowerPC/lit.local.cfg
+++ b/test/MC/Disassembler/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/Sparc/lit.local.cfg b/test/MC/Disassembler/Sparc/lit.local.cfg
index 4d344fa..fa6a54e 100644
--- a/test/MC/Disassembler/Sparc/lit.local.cfg
+++ b/test/MC/Disassembler/Sparc/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Sparc' in targets:
+if not 'Sparc' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt
index 1a5634d..54a3c5b 100644
--- a/test/MC/Disassembler/SystemZ/insns.txt
+++ b/test/MC/Disassembler/SystemZ/insns.txt
@@ -3355,6 +3355,24 @@
# CHECK: ldxbr %f13, %f13
0xb3 0x45 0x00 0xdd
+# CHECK: ldxbra %f0, 0, %f0, 1
+0xb3 0x45 0x01 0x00
+
+# CHECK: ldxbra %f0, 0, %f0, 15
+0xb3 0x45 0x0f 0x00
+
+# CHECK: ldxbra %f0, 0, %f13, 1
+0xb3 0x45 0x01 0x0d
+
+# CHECK: ldxbra %f0, 15, %f0, 1
+0xb3 0x45 0xf1 0x00
+
+# CHECK: ldxbra %f4, 5, %f8, 9
+0xb3 0x45 0x59 0x48
+
+# CHECK: ldxbra %f13, 0, %f0, 1
+0xb3 0x45 0x01 0xd0
+
# CHECK: ldy %f0, -524288
0xed 0x00 0x00 0x00 0x80 0x65
@@ -3400,6 +3418,24 @@
# CHECK: ledbr %f15, %f15
0xb3 0x44 0x00 0xff
+# CHECK: ledbra %f0, 0, %f0, 1
+0xb3 0x44 0x01 0x00
+
+# CHECK: ledbra %f0, 0, %f0, 15
+0xb3 0x44 0x0f 0x00
+
+# CHECK: ledbra %f0, 0, %f15, 1
+0xb3 0x44 0x01 0x0f
+
+# CHECK: ledbra %f0, 15, %f0, 1
+0xb3 0x44 0xf1 0x00
+
+# CHECK: ledbra %f4, 5, %f6, 7
+0xb3 0x44 0x57 0x46
+
+# CHECK: ledbra %f15, 0, %f0, 1
+0xb3 0x44 0x01 0xf0
+
# CHECK: ler %f0, %f9
0x38 0x09
@@ -3448,6 +3484,24 @@
# CHECK: lexbr %f13, %f13
0xb3 0x46 0x00 0xdd
+# CHECK: lexbra %f0, 0, %f0, 1
+0xb3 0x46 0x01 0x00
+
+# CHECK: lexbra %f0, 0, %f0, 15
+0xb3 0x46 0x0f 0x00
+
+# CHECK: lexbra %f0, 0, %f13, 1
+0xb3 0x46 0x01 0x0d
+
+# CHECK: lexbra %f0, 15, %f0, 1
+0xb3 0x46 0xf1 0x00
+
+# CHECK: lexbra %f4, 5, %f8, 9
+0xb3 0x46 0x59 0x48
+
+# CHECK: lexbra %f13, 0, %f0, 1
+0xb3 0x46 0x01 0xd0
+
# CHECK: ley %f0, -524288
0xed 0x00 0x00 0x00 0x80 0x64
diff --git a/test/MC/Disassembler/SystemZ/lit.local.cfg b/test/MC/Disassembler/SystemZ/lit.local.cfg
index b12af09..5c02dd3 100644
--- a/test/MC/Disassembler/SystemZ/lit.local.cfg
+++ b/test/MC/Disassembler/SystemZ/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'SystemZ' in targets:
+if not 'SystemZ' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/X86/avx-512.txt b/test/MC/Disassembler/X86/avx-512.txt
index e5ad2a9..b1a8aaf 100644
--- a/test/MC/Disassembler/X86/avx-512.txt
+++ b/test/MC/Disassembler/X86/avx-512.txt
@@ -39,6 +39,12 @@
# CHECK: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
0x62 0xf2 0xfd 0x4a 0x92 0x0c 0x86
+# CHECK: vpslld $16, %zmm21, %zmm22
+0x62 0xb1 0x4d 0x40 0x72 0xf5 0x10
+
+# CHECK: vpord %zmm22, %zmm21, %zmm23
+0x62 0xa1 0x55 0x40 0xeb 0xfe
+
#####################################################
# MASK INSTRUCTIONS #
#####################################################
diff --git a/test/MC/Disassembler/X86/hex-immediates.txt b/test/MC/Disassembler/X86/hex-immediates.txt
index 80d2448..fb76c26 100644
--- a/test/MC/Disassembler/X86/hex-immediates.txt
+++ b/test/MC/Disassembler/X86/hex-immediates.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc --hdis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
+# RUN: llvm-mc --print-imm-hex --disassemble %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
# CHECK: movabsq $0x7fffffffffffffff, %rcx
0x48 0xb9 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0x7f
diff --git a/test/MC/Disassembler/X86/lit.local.cfg b/test/MC/Disassembler/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/Disassembler/X86/lit.local.cfg
+++ b/test/MC/Disassembler/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Disassembler/X86/moffs.txt b/test/MC/Disassembler/X86/moffs.txt
index 67d64e8..dd2664c 100644
--- a/test/MC/Disassembler/X86/moffs.txt
+++ b/test/MC/Disassembler/X86/moffs.txt
@@ -1,6 +1,6 @@
-# RUN: llvm-mc --hdis %s -triple=i686-linux-gnu-code16 | FileCheck --check-prefix=16 %s
-# RUN: llvm-mc --hdis %s -triple=i686-linux-gnu | FileCheck --check-prefix=32 %s
-# RUN: llvm-mc --hdis %s -triple=x86_64-linux-gnu | FileCheck --check-prefix=64 %s
+# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=i686-linux-gnu-code16 | FileCheck --check-prefix=16 %s
+# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=i686-linux-gnu | FileCheck --check-prefix=32 %s
+# RUN: llvm-mc --disassemble --print-imm-hex %s -triple=x86_64-linux-gnu | FileCheck --check-prefix=64 %s
# 16: movb 0x5a5a, %al
# 32: movb 0x5a5a5a5a, %al
diff --git a/test/MC/Disassembler/XCore/lit.local.cfg b/test/MC/Disassembler/XCore/lit.local.cfg
index 4d17d46..bb48713 100644
--- a/test/MC/Disassembler/XCore/lit.local.cfg
+++ b/test/MC/Disassembler/XCore/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'XCore' in targets:
+if not 'XCore' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/ELF/ARM/bss-non-zero-value.s b/test/MC/ELF/ARM/bss-non-zero-value.s
new file mode 100644
index 0000000..999b8b0
--- /dev/null
+++ b/test/MC/ELF/ARM/bss-non-zero-value.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -filetype=obj -triple arm-linux-gnu %s -o %t 2>%t.out
+// RUN: FileCheck --input-file=%t.out %s
+// CHECK: non-zero initializer found in section '.bss'
+ .bss
+ .globl a
+ .align 2
+a:
+ .long 1
+ .size a, 4
diff --git a/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s b/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s
new file mode 100644
index 0000000..eb36475
--- /dev/null
+++ b/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s
@@ -0,0 +1,9 @@
+@ RUN: not llvm-mc -triple arm-elf -filetype asm -o /dev/null %s 2>&1 | FileCheck %s
+
+ .syntax unified
+
+ .type TYPE #32
+// CHECK: error: expected symbol type in directive
+// CHECK: .type TYPE #32
+// CHECK: ^
+
diff --git a/test/MC/ELF/ARM/gnu-type-hash.s b/test/MC/ELF/ARM/gnu-type-hash.s
new file mode 100644
index 0000000..ae5c47c
--- /dev/null
+++ b/test/MC/ELF/ARM/gnu-type-hash.s
@@ -0,0 +1,16 @@
+@ RUN: llvm-mc -triple arm-elf -filetype asm -o - %s | FileCheck %s
+
+ .syntax unified
+
+ .type TYPE #STT_FUNC
+// CHECK: .type TYPE,%function
+
+ .type type #function
+// CHECK: .type type,%function
+
+ .type comma_TYPE, #STT_FUNC
+// CHECK: .type comma_TYPE,%function
+
+ .type comma_type, #function
+// CHECK: .type comma_type,%function
+
diff --git a/test/MC/ELF/ARM/lit.local.cfg b/test/MC/ELF/ARM/lit.local.cfg
new file mode 100644
index 0000000..d825cc0
--- /dev/null
+++ b/test/MC/ELF/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+# We have to reset config.unsupported here because the parent directory is
+# predicated on 'X86'.
+config.unsupported = not 'ARM' in config.root.targets
diff --git a/test/MC/ELF/cfi-adjust-cfa-offset.s b/test/MC/ELF/cfi-adjust-cfa-offset.s
index b3768cb..9d639f7 100644
--- a/test/MC/ELF/cfi-adjust-cfa-offset.s
+++ b/test/MC/ELF/cfi-adjust-cfa-offset.s
@@ -28,7 +28,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 1C000000 1C000000
// CHECK-NEXT: 0020: 00000000 0A000000 00440E10 410E1444
// CHECK-NEXT: 0030: 0E080000 00000000
diff --git a/test/MC/ELF/cfi-advance-loc2.s b/test/MC/ELF/cfi-advance-loc2.s
index d7a53c4..98caa01 100644
--- a/test/MC/ELF/cfi-advance-loc2.s
+++ b/test/MC/ELF/cfi-advance-loc2.s
@@ -26,7 +26,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 01010000 00030001 0E080000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-def-cfa-offset.s b/test/MC/ELF/cfi-def-cfa-offset.s
index eac2c73..59f7400 100644
--- a/test/MC/ELF/cfi-def-cfa-offset.s
+++ b/test/MC/ELF/cfi-def-cfa-offset.s
@@ -27,7 +27,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 0A000000 00440E10 450E0800
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-def-cfa-register.s b/test/MC/ELF/cfi-def-cfa-register.s
index 00d8b99..178ba32 100644
--- a/test/MC/ELF/cfi-def-cfa-register.s
+++ b/test/MC/ELF/cfi-def-cfa-register.s
@@ -23,7 +23,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00410D06 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-def-cfa.s b/test/MC/ELF/cfi-def-cfa.s
index 36e147f..dfb0d4b 100644
--- a/test/MC/ELF/cfi-def-cfa.s
+++ b/test/MC/ELF/cfi-def-cfa.s
@@ -23,7 +23,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00410C07 08000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-escape.s b/test/MC/ELF/cfi-escape.s
index 839d671..5394ee4 100644
--- a/test/MC/ELF/cfi-escape.s
+++ b/test/MC/ELF/cfi-escape.s
@@ -24,7 +24,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00411507 7F000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-offset.s b/test/MC/ELF/cfi-offset.s
index 951a600..a65b4fc 100644
--- a/test/MC/ELF/cfi-offset.s
+++ b/test/MC/ELF/cfi-offset.s
@@ -23,7 +23,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00418602 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-register.s b/test/MC/ELF/cfi-register.s
index 4abbb53..9441770 100644
--- a/test/MC/ELF/cfi-register.s
+++ b/test/MC/ELF/cfi-register.s
@@ -24,7 +24,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00410906 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-rel-offset.s b/test/MC/ELF/cfi-rel-offset.s
index 34254c8..0dc69c8 100644
--- a/test/MC/ELF/cfi-rel-offset.s
+++ b/test/MC/ELF/cfi-rel-offset.s
@@ -31,7 +31,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 24000000 1C000000
// CHECK-NEXT: 0020: 00000000 05000000 00410E08 410D0641
// CHECK-NEXT: 0030: 11067F41 0E104186 02000000 00000000
diff --git a/test/MC/ELF/cfi-rel-offset2.s b/test/MC/ELF/cfi-rel-offset2.s
index 3de769f..360e7b0 100644
--- a/test/MC/ELF/cfi-rel-offset2.s
+++ b/test/MC/ELF/cfi-rel-offset2.s
@@ -23,7 +23,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 01000000 00411106 7F000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-remember.s b/test/MC/ELF/cfi-remember.s
index 98c759d..3a38948 100644
--- a/test/MC/ELF/cfi-remember.s
+++ b/test/MC/ELF/cfi-remember.s
@@ -26,7 +26,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 03000000 00410A41 0B000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-restore.s b/test/MC/ELF/cfi-restore.s
index d25b5ff..e225797 100644
--- a/test/MC/ELF/cfi-restore.s
+++ b/test/MC/ELF/cfi-restore.s
@@ -24,7 +24,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 0041C600 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-same-value.s b/test/MC/ELF/cfi-same-value.s
index 9f5ae4b..2d37f4d 100644
--- a/test/MC/ELF/cfi-same-value.s
+++ b/test/MC/ELF/cfi-same-value.s
@@ -24,7 +24,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00410806 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-sections.s b/test/MC/ELF/cfi-sections.s
index 15a79e5..b0ba543 100644
--- a/test/MC/ELF/cfi-sections.s
+++ b/test/MC/ELF/cfi-sections.s
@@ -26,7 +26,7 @@ f2:
// ELF_64-NEXT: AddressAlignment: 8
// ELF_64-NEXT: EntrySize: 0
// ELF_64-NEXT: SectionData (
-// ELF_64-NEXT: 0000: 14000000 FFFFFFFF 01000178 100C0708
+// ELF_64-NEXT: 0000: 14000000 FFFFFFFF 03000178 100C0708
// ELF_64-NEXT: 0010: 90010000 00000000 14000000 00000000
// ELF_64-NEXT: 0020: 00000000 00000000 01000000 00000000
// ELF_64-NEXT: 0030: 14000000 00000000 00000000 00000000
@@ -47,7 +47,7 @@ f2:
// ELF_32-NEXT: AddressAlignment: 4
// ELF_32-NEXT: EntrySize: 0
// ELF_32-NEXT: SectionData (
-// ELF_32-NEXT: 0000: 10000000 FFFFFFFF 0100017C 080C0404
+// ELF_32-NEXT: 0000: 10000000 FFFFFFFF 0300017C 080C0404
// ELF_32-NEXT: 0010: 88010000 0C000000 00000000 00000000
// ELF_32-NEXT: 0020: 01000000 0C000000 00000000 01000000
// ELF_32-NEXT: 0030: 01000000
diff --git a/test/MC/ELF/cfi-signal-frame.s b/test/MC/ELF/cfi-signal-frame.s
index 0233119..98deb0a 100644
--- a/test/MC/ELF/cfi-signal-frame.s
+++ b/test/MC/ELF/cfi-signal-frame.s
@@ -23,10 +23,10 @@ g:
// CHECK-NEXT: AddressAlignment: 8
// CHECK-NEXT: EntrySize: 0
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5253 00017810
+// CHECK-NEXT: 0000: 14000000 00000000 037A5253 00017810
// CHECK-NEXT: 0010: 011B0C07 08900100 10000000 1C000000
// CHECK-NEXT: 0020: 00000000 00000000 00000000 14000000
-// CHECK-NEXT: 0030: 00000000 017A5200 01781001 1B0C0708
+// CHECK-NEXT: 0030: 00000000 037A5200 01781001 1B0C0708
// CHECK-NEXT: 0040: 90010000 10000000 1C000000 00000000
// CHECK-NEXT: 0050: 00000000 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-undefined.s b/test/MC/ELF/cfi-undefined.s
index 9773a36..568b315 100644
--- a/test/MC/ELF/cfi-undefined.s
+++ b/test/MC/ELF/cfi-undefined.s
@@ -24,7 +24,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00410706 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-version.ll b/test/MC/ELF/cfi-version.ll
new file mode 100644
index 0000000..10daa1d
--- /dev/null
+++ b/test/MC/ELF/cfi-version.ll
@@ -0,0 +1,45 @@
+; RUN: %llc_dwarf %s -o - -dwarf-version 2 -filetype=obj | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF2
+; RUN: %llc_dwarf %s -o - -dwarf-version 3 -filetype=obj | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF34
+; RUN: %llc_dwarf %s -o - -dwarf-version 4 -filetype=obj | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF34
+
+; .debug_frame is not emitted for targeting Windows x64.
+; REQUIRES: debug_frame
+
+; Function Attrs: nounwind
+define i32 @foo() #0 {
+entry:
+ %call = call i32 bitcast (i32 (...)* @bar to i32 ()*)(), !dbg !12
+ %add = add nsw i32 %call, 1, !dbg !12
+ ret i32 %add, !dbg !12
+}
+
+declare i32 @bar(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @foo, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5.0 "}
+!12 = metadata !{i32 2, i32 0, metadata !4, null}
+
+; DWARF2: .debug_frame contents:
+; DWARF2: Version: 1
+; DWARF2-NEXT: Augmentation:
+
+; DWARF34: .debug_frame contents:
+; DWARF34: Version: 3
+; DWARF34-NEXT: Augmentation:
diff --git a/test/MC/ELF/cfi-window-save.s b/test/MC/ELF/cfi-window-save.s
index c7d438a..b083901 100644
--- a/test/MC/ELF/cfi-window-save.s
+++ b/test/MC/ELF/cfi-window-save.s
@@ -26,7 +26,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 02000000 00412D00 00000000
// CHECK-NEXT: )
diff --git a/test/MC/ELF/cfi-zero-addr-delta.s b/test/MC/ELF/cfi-zero-addr-delta.s
index 05cb0ae..8662839 100644
--- a/test/MC/ELF/cfi-zero-addr-delta.s
+++ b/test/MC/ELF/cfi-zero-addr-delta.s
@@ -30,7 +30,7 @@ f:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT: 0000: 14000000 00000000 037A5200 01781001
// CHECK-NEXT: 0010: 1B0C0708 90010000 1C000000 1C000000
// CHECK-NEXT: 0020: 00000000 04000000 00410E10 410A0E08
// CHECK-NEXT: 0030: 410B0000 00000000
diff --git a/test/MC/ELF/cfi.s b/test/MC/ELF/cfi.s
index fd229b6..21be615 100644
--- a/test/MC/ELF/cfi.s
+++ b/test/MC/ELF/cfi.s
@@ -234,116 +234,116 @@ f37:
// CHECK-NEXT: Relocations [
// CHECK-NEXT: ]
// CHECK-NEXT: SectionData (
-// CHECK-NEXT: 0000: 14000000 00000000 017A4C52 00017810
+// CHECK-NEXT: 0000: 14000000 00000000 037A4C52 00017810
// CHECK-NEXT: 0010: 02031B0C 07089001 14000000 1C000000
// CHECK-NEXT: 0020: 00000000 01000000 04000000 00000000
-// CHECK-NEXT: 0030: 20000000 00000000 017A504C 52000178
+// CHECK-NEXT: 0030: 20000000 00000000 037A504C 52000178
// CHECK-NEXT: 0040: 100B0000 00000000 00000003 1B0C0708
// CHECK-NEXT: 0050: 90010000 14000000 28000000 00000000
// CHECK-NEXT: 0060: 01000000 04000000 00000000 14000000
// CHECK-NEXT: 0070: 70000000 00000000 01000000 04000000
-// CHECK-NEXT: 0080: 00000000 20000000 00000000 017A504C
+// CHECK-NEXT: 0080: 00000000 20000000 00000000 037A504C
// CHECK-NEXT: 0090: 52000178 100B0000 00000000 00000002
// CHECK-NEXT: 00A0: 1B0C0708 90010000 10000000 28000000
// CHECK-NEXT: 00B0: 00000000 01000000 02000000 18000000
-// CHECK-NEXT: 00C0: 00000000 017A5052 00017810 04020000
+// CHECK-NEXT: 00C0: 00000000 037A5052 00017810 04020000
// CHECK-NEXT: 00D0: 1B0C0708 90010000 10000000 20000000
// CHECK-NEXT: 00E0: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 00F0: 00000000 017A5052 00017810 06030000
+// CHECK-NEXT: 00F0: 00000000 037A5052 00017810 06030000
// CHECK-NEXT: 0100: 00001B0C 07089001 10000000 20000000
// CHECK-NEXT: 0110: 00000000 01000000 00000000 1C000000
-// CHECK-NEXT: 0120: 00000000 017A5052 00017810 0A040000
+// CHECK-NEXT: 0120: 00000000 037A5052 00017810 0A040000
// CHECK-NEXT: 0130: 00000000 00001B0C 07089001 10000000
// CHECK-NEXT: 0140: 24000000 00000000 01000000 00000000
-// CHECK-NEXT: 0150: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0150: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0160: 040A0000 1B0C0708 90010000 10000000
// CHECK-NEXT: 0170: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 0180: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0180: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0190: 060B0000 00001B0C 07089001 10000000
// CHECK-NEXT: 01A0: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 01B0: 1C000000 00000000 017A5052 00017810
+// CHECK-NEXT: 01B0: 1C000000 00000000 037A5052 00017810
// CHECK-NEXT: 01C0: 0A0C0000 00000000 00001B0C 07089001
// CHECK-NEXT: 01D0: 10000000 24000000 00000000 01000000
-// CHECK-NEXT: 01E0: 00000000 1C000000 00000000 017A5052
+// CHECK-NEXT: 01E0: 00000000 1C000000 00000000 037A5052
// CHECK-NEXT: 01F0: 00017810 0A080000 00000000 00001B0C
// CHECK-NEXT: 0200: 07089001 10000000 24000000 00000000
// CHECK-NEXT: 0210: 01000000 00000000 1C000000 00000000
-// CHECK-NEXT: 0220: 017A5052 00017810 0A100000 00000000
+// CHECK-NEXT: 0220: 037A5052 00017810 0A100000 00000000
// CHECK-NEXT: 0230: 00001B0C 07089001 10000000 24000000
// CHECK-NEXT: 0240: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 0250: 00000000 017A5052 00017810 04120000
+// CHECK-NEXT: 0250: 00000000 037A5052 00017810 04120000
// CHECK-NEXT: 0260: 1B0C0708 90010000 10000000 20000000
// CHECK-NEXT: 0270: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 0280: 00000000 017A5052 00017810 06130000
+// CHECK-NEXT: 0280: 00000000 037A5052 00017810 06130000
// CHECK-NEXT: 0290: 00001B0C 07089001 10000000 20000000
// CHECK-NEXT: 02A0: 00000000 01000000 00000000 1C000000
-// CHECK-NEXT: 02B0: 00000000 017A5052 00017810 0A140000
+// CHECK-NEXT: 02B0: 00000000 037A5052 00017810 0A140000
// CHECK-NEXT: 02C0: 00000000 00001B0C 07089001 10000000
// CHECK-NEXT: 02D0: 24000000 00000000 01000000 00000000
-// CHECK-NEXT: 02E0: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 02E0: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 02F0: 041A0000 1B0C0708 90010000 10000000
// CHECK-NEXT: 0300: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 0310: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0310: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0320: 061B0000 00001B0C 07089001 10000000
// CHECK-NEXT: 0330: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 0340: 1C000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0340: 1C000000 00000000 037A5052 00017810
// CHECK-NEXT: 0350: 0A1C0000 00000000 00001B0C 07089001
// CHECK-NEXT: 0360: 10000000 24000000 00000000 01000000
-// CHECK-NEXT: 0370: 00000000 1C000000 00000000 017A5052
+// CHECK-NEXT: 0370: 00000000 1C000000 00000000 037A5052
// CHECK-NEXT: 0380: 00017810 0A180000 00000000 00001B0C
// CHECK-NEXT: 0390: 07089001 10000000 24000000 00000000
// CHECK-NEXT: 03A0: 01000000 00000000 1C000000 00000000
-// CHECK-NEXT: 03B0: 017A5052 00017810 0A800000 00000000
+// CHECK-NEXT: 03B0: 037A5052 00017810 0A800000 00000000
// CHECK-NEXT: 03C0: 00001B0C 07089001 10000000 24000000
// CHECK-NEXT: 03D0: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 03E0: 00000000 017A5052 00017810 04820000
+// CHECK-NEXT: 03E0: 00000000 037A5052 00017810 04820000
// CHECK-NEXT: 03F0: 1B0C0708 90010000 10000000 20000000
// CHECK-NEXT: 0400: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 0410: 00000000 017A5052 00017810 06830000
+// CHECK-NEXT: 0410: 00000000 037A5052 00017810 06830000
// CHECK-NEXT: 0420: 00001B0C 07089001 10000000 20000000
// CHECK-NEXT: 0430: 00000000 01000000 00000000 1C000000
-// CHECK-NEXT: 0440: 00000000 017A5052 00017810 0A840000
+// CHECK-NEXT: 0440: 00000000 037A5052 00017810 0A840000
// CHECK-NEXT: 0450: 00000000 00001B0C 07089001 10000000
// CHECK-NEXT: 0460: 24000000 00000000 01000000 00000000
-// CHECK-NEXT: 0470: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0470: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0480: 048A0000 1B0C0708 90010000 10000000
// CHECK-NEXT: 0490: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 04A0: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 04A0: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 04B0: 068B0000 00001B0C 07089001 10000000
// CHECK-NEXT: 04C0: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 04D0: 1C000000 00000000 017A5052 00017810
+// CHECK-NEXT: 04D0: 1C000000 00000000 037A5052 00017810
// CHECK-NEXT: 04E0: 0A8C0000 00000000 00001B0C 07089001
// CHECK-NEXT: 04F0: 10000000 24000000 00000000 01000000
-// CHECK-NEXT: 0500: 00000000 1C000000 00000000 017A5052
+// CHECK-NEXT: 0500: 00000000 1C000000 00000000 037A5052
// CHECK-NEXT: 0510: 00017810 0A880000 00000000 00001B0C
// CHECK-NEXT: 0520: 07089001 10000000 24000000 00000000
// CHECK-NEXT: 0530: 01000000 00000000 1C000000 00000000
-// CHECK-NEXT: 0540: 017A5052 00017810 0A900000 00000000
+// CHECK-NEXT: 0540: 037A5052 00017810 0A900000 00000000
// CHECK-NEXT: 0550: 00001B0C 07089001 10000000 24000000
// CHECK-NEXT: 0560: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 0570: 00000000 017A5052 00017810 04920000
+// CHECK-NEXT: 0570: 00000000 037A5052 00017810 04920000
// CHECK-NEXT: 0580: 1B0C0708 90010000 10000000 20000000
// CHECK-NEXT: 0590: 00000000 01000000 00000000 18000000
-// CHECK-NEXT: 05A0: 00000000 017A5052 00017810 06930000
+// CHECK-NEXT: 05A0: 00000000 037A5052 00017810 06930000
// CHECK-NEXT: 05B0: 00001B0C 07089001 10000000 20000000
// CHECK-NEXT: 05C0: 00000000 01000000 00000000 1C000000
-// CHECK-NEXT: 05D0: 00000000 017A5052 00017810 0A940000
+// CHECK-NEXT: 05D0: 00000000 037A5052 00017810 0A940000
// CHECK-NEXT: 05E0: 00000000 00001B0C 07089001 10000000
// CHECK-NEXT: 05F0: 24000000 00000000 01000000 00000000
-// CHECK-NEXT: 0600: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0600: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0610: 049A0000 1B0C0708 90010000 10000000
// CHECK-NEXT: 0620: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 0630: 18000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0630: 18000000 00000000 037A5052 00017810
// CHECK-NEXT: 0640: 069B0000 00001B0C 07089001 10000000
// CHECK-NEXT: 0650: 20000000 00000000 01000000 00000000
-// CHECK-NEXT: 0660: 1C000000 00000000 017A5052 00017810
+// CHECK-NEXT: 0660: 1C000000 00000000 037A5052 00017810
// CHECK-NEXT: 0670: 0A9C0000 00000000 00001B0C 07089001
// CHECK-NEXT: 0680: 10000000 24000000 00000000 01000000
-// CHECK-NEXT: 0690: 00000000 1C000000 00000000 017A5052
+// CHECK-NEXT: 0690: 00000000 1C000000 00000000 037A5052
// CHECK-NEXT: 06A0: 00017810 0A980000 00000000 00001B0C
// CHECK-NEXT: 06B0: 07089001 10000000 24000000 00000000
// CHECK-NEXT: 06C0: 01000000 00000000 10000000 00000000
-// CHECK-NEXT: 06D0: 017A5200 01781001 1B000000 10000000
+// CHECK-NEXT: 06D0: 037A5200 01781001 1B000000 10000000
// CHECK-NEXT: 06E0: 18000000 00000000 01000000 00000000
// CHECK-NEXT: )
// CHECK-NEXT: }
diff --git a/test/MC/ELF/gnu-type-diagnostics.s b/test/MC/ELF/gnu-type-diagnostics.s
new file mode 100644
index 0000000..df87d6d
--- /dev/null
+++ b/test/MC/ELF/gnu-type-diagnostics.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc -triple i686-elf -filetype asm -o /dev/null %s 2>&1 | FileCheck %s
+
+ .type TYPE FUNC
+// CHECK: error: unsupported attribute in '.type' directive
+// CHECK: .type TYPE FUNC
+// CHECK: ^
+
+ .type type stt_func
+// CHECK: error: unsupported attribute in '.type' directive
+// CHECK: .type type stt_func
+// CHECK: ^
+
+ .type symbol 32
+// CHECK: error: expected STT_<TYPE_IN_UPPER_CASE>, '#<type>', '@<type>', '%<type>' or "<type>"
+// CHECK: .type symbol 32
+// CHECK: ^
+
+
diff --git a/test/MC/ELF/gnu-type.s b/test/MC/ELF/gnu-type.s
new file mode 100644
index 0000000..19029e4
--- /dev/null
+++ b/test/MC/ELF/gnu-type.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -triple i686-elf -filetype asm -o - %s | FileCheck %s
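+// Accept the GNU as spellings of .type: bare or comma-separated, with the @<type>, %<type> and "<type>" forms.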
+
+ .type TYPE STT_FUNC
+// CHECK: .type TYPE,@function
+
+ .type comma_TYPE, STT_FUNC
+// CHECK: .type comma_TYPE,@function
+
+ .type at_TYPE, @STT_FUNC
+// CHECK: .type at_TYPE,@function
+
+ .type percent_TYPE, %STT_FUNC
+// CHECK: .type percent_TYPE,@function
+
+ .type string_TYPE, "STT_FUNC"
+// CHECK: .type string_TYPE,@function
+
+ .type type function
+// CHECK: .type type,@function
+
+ .type comma_type, function
+// CHECK: .type comma_type,@function
+
+ .type at_type, @function
+// CHECK: .type at_type,@function
+
+ .type percent_type, %function
+// CHECK: .type percent_type,@function
+
+ .type string_type, "function"
+// CHECK: .type string_type,@function
+
+ .type special gnu_unique_object
+// CHECK: .type special,@gnu_unique_object
+
+ .type comma_special, gnu_unique_object
+// CHECK: .type comma_special,@gnu_unique_object
+
diff --git a/test/MC/ELF/lit.local.cfg b/test/MC/ELF/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/ELF/lit.local.cfg
+++ b/test/MC/ELF/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/ELF/no-reloc.s b/test/MC/ELF/no-reloc.s
new file mode 100644
index 0000000..78f1b88
--- /dev/null
+++ b/test/MC/ELF/no-reloc.s
@@ -0,0 +1,19 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -r | FileCheck %s
+
+// CHECK: Relocations [
+// CHECK-NEXT: ]
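+// Differences between symbols defined in the same section fold away at assembly time, so no relocations are expected.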
+
+ .section .test1_foo
+.Ltest1_1:
+.Ltest1_2 = .Ltest1_1
+ .section .test1_bar
+ .long .Ltest1_1-.Ltest1_2
+
+
+ .section test2
+
+.Ltest2_a:
+.Ltest2_b = .Ltest2_a
+.Ltest2_c:
+.Ltest2_d = .Ltest2_c-.Ltest2_b
+ .long .Ltest2_d
diff --git a/test/MC/ELF/pr19430.s b/test/MC/ELF/pr19430.s
new file mode 100644
index 0000000..a1e5246
--- /dev/null
+++ b/test/MC/ELF/pr19430.s
@@ -0,0 +1,14 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -filetype=obj -o - | llvm-readobj -r | FileCheck %s
+
+// Test that we can use .cfi_startproc without a global symbol.
+
+.text
+.space 1000
+.cfi_startproc
+ .cfi_endproc
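+// The addend 0x3E8 below is 1000 (the .space above): the FDE refers to the CFI region through the .text section symbol.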
+
+// CHECK: Relocations [
+// CHECK-NEXT: Section (5) .rela.eh_frame {
+// CHECK-NEXT: 0x20 R_X86_64_PC32 .text 0x3E8
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
diff --git a/test/MC/MachO/AArch64/lit.local.cfg b/test/MC/MachO/AArch64/lit.local.cfg
index 9a66a00..cec29af 100644
--- a/test/MC/MachO/AArch64/lit.local.cfg
+++ b/test/MC/MachO/AArch64/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/MachO/ARM/aliased-symbols.s b/test/MC/MachO/ARM/aliased-symbols.s
new file mode 100644
index 0000000..0b4463d
--- /dev/null
+++ b/test/MC/MachO/ARM/aliased-symbols.s
@@ -0,0 +1,115 @@
+// RUN: llvm-mc -triple thumbv7m-apple-darwin-eabi %s -filetype=obj -o %t
+// RUN: llvm-readobj -symbols %t | FileCheck %s
+
+ .data
+ var1 = var2
+ .long var1
+ .long var2
+ .long var2 + 4
+defined_early:
+ .long 0
+
+ alias_to_early = defined_early
+ alias_to_late = defined_late
+
+defined_late:
+ .long 0
+
+ .global extern_test
+ extern_test = var2
+
+ alias_to_local = Ltmp0
+Ltmp0:
+
+// CHECK: Symbols [
+
+ // defined_early was defined. Actually has value 0xc.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: defined_early
+// CHECK-NEXT: Type: Section (0xE)
+// CHECK-NEXT: Section: __data (0x2)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[DEFINED_EARLY:[0-9A-F]+]]
+// CHECK-NEXT: }
+
+ // alias_to_early was an alias to defined_early. But we can resolve it.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: alias_to_early
+// CHECK-NEXT: Type: Section (0xE)
+// CHECK-NEXT: Section: __data (0x2)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[DEFINED_EARLY]]
+// CHECK-NEXT: }
+
+ // defined_late was defined. Just after defined_early.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: defined_late
+// CHECK-NEXT: Type: Section (0xE)
+// CHECK-NEXT: Section: __data (0x2)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[DEFINED_LATE:[0-9A-F]+]]
+// CHECK-NEXT: }
+
+ // alias_to_late was an alias to defined_late. But we can resolve it.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: alias_to_late
+// CHECK-NEXT: Type: Section (0xE)
+// CHECK-NEXT: Section: __data (0x2)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[DEFINED_LATE]]
+// CHECK-NEXT: }
+
+ // alias_to_local is an alias, but what it points to has no
+ // MachO representation. We must resolve it.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: alias_to_local (37)
+// CHECK-NEXT: Type: Section (0xE)
+// CHECK-NEXT: Section: (0x0)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x14
+// CHECK-NEXT: }
+
+ // extern_test was a pure alias to the unknown "var2".
+ // N_INDR and Extern.
+// CHECK: Name: extern_test
+// CHECK-NEXT: Extern
+// CHECK-NEXT: Type: Indirect (0xA)
+// CHECK-NEXT: Section: (0x0)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[VAR2_STRINGINDEX:[0-9a-f]+]]
+// CHECK-NEXT: }
+
+ // var1 was another alias to an unknown variable. Not extern this time.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: var1 (1)
+// CHECK-NEXT: Type: Indirect (0xA)
+// CHECK-NEXT: Section: (0x0)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x[[VAR2_STRINGINDEX]]
+// CHECK-NEXT: }
+
+ // var2 was a normal undefined (extern) symbol.
+// CHECK: Symbol {
+// CHECK-NEXT: Name: var2
+// CHECK-NEXT: Extern
+// CHECK-NEXT: Type: Undef (0x0)
+// CHECK-NEXT: Section: (0x0)
+// CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Value: 0x0
+// CHECK-NEXT: }
diff --git a/test/MC/MachO/ARM/lit.local.cfg b/test/MC/MachO/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/MC/MachO/ARM/lit.local.cfg
+++ b/test/MC/MachO/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/MachO/eh-frame-reloc.s b/test/MC/MachO/eh-frame-reloc.s
index 1b1c674..e14825b 100644
--- a/test/MC/MachO/eh-frame-reloc.s
+++ b/test/MC/MachO/eh-frame-reloc.s
@@ -1,5 +1,6 @@
// RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.7 -filetype=obj | llvm-readobj -r | FileCheck %s
// RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.6 -filetype=obj | llvm-readobj -r | FileCheck %s
+// RUN: llvm-mc < %s -triple=x86_64-apple-ios7.0.0 -filetype=obj | llvm-readobj -r | FileCheck %s
// RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck --check-prefix=OLD64 %s
// RUN: llvm-mc < %s -triple=i686-apple-macosx10.6 -filetype=obj | llvm-readobj -r | FileCheck %s
// RUN: llvm-mc < %s -triple=i686-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck --check-prefix=OLD32 %s
@@ -16,14 +17,14 @@ _bar:
// OLD32: Relocations [
// OLD32-NEXT: Section __eh_frame {
-// OLD32-NEXT: 0x20 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 -
-// OLD32-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 -
+// OLD32-NEXT: 0x20 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x0
+// OLD32-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x20
// OLD32-NEXT: }
// OLD32-NEXT: ]
// OLD64: Relocations [
// OLD64-NEXT: Section __eh_frame {
-// OLD64-NEXT: 0x20 0 3 1 X86_64_RELOC_SUBTRACTOR 0 _bar.eh
+// OLD64-NEXT: 0x20 0 3 0 X86_64_RELOC_SUBTRACTOR 0
// OLD64-NEXT: 0x20 0 3 1 X86_64_RELOC_UNSIGNED 0 _bar
// OLD64-NEXT: }
// OLD64-NEXT: ]
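+// The subtractor entry no longer references a _bar.eh symbol (extern bit now 0): .eh symbols are no longer emitted, which is also why eh-symbols.s is deleted below.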
diff --git a/test/MC/MachO/eh-symbols.s b/test/MC/MachO/eh-symbols.s
deleted file mode 100644
index 6adca56..0000000
--- a/test/MC/MachO/eh-symbols.s
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: llvm-mc -filetype=obj -triple i686-apple-darwin %s -o - | llvm-readobj -t | FileCheck %s
-
-// Make sure that the exception handling data has the same visibility as the
-// function it's generated for.
-
- .private_extern _main
- .globl _main
-_main:
- .cfi_startproc
- retl
- .cfi_endproc
-
-"_-[NSString(local) isNullOrNil]":
- .cfi_startproc
- retl
- .cfi_endproc
-
-// CHECK: Name: _-[NSString(local) isNullOrNil].eh
-
-// CHECK: Name: _main
-// CHECK-NEXT: PrivateExtern
-
-// CHECK: Name: _main.eh
-// CHECK-NEXT: PrivateExtern
-
diff --git a/test/MC/MachO/eh_symbol.s b/test/MC/MachO/eh_symbol.s
index 1135196..738e2b6 100644
--- a/test/MC/MachO/eh_symbol.s
+++ b/test/MC/MachO/eh_symbol.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-nm | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-nm - | FileCheck %s
// test that we don't produce foo.eh symbols in a debug_frame section.
// CHECK-NOT: _f.eh
diff --git a/test/MC/MachO/lit.local.cfg b/test/MC/MachO/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/MachO/lit.local.cfg
+++ b/test/MC/MachO/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/MachO/pr19185.s b/test/MC/MachO/pr19185.s
new file mode 100644
index 0000000..fb21e51
--- /dev/null
+++ b/test/MC/MachO/pr19185.s
@@ -0,0 +1,6 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin %s -filetype=obj -o %t.o
+f:
+ .cfi_startproc
+ .cfi_endproc
+
+EH_frame0:
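+// There are no CHECK lines: the test only requires that this assembles without error.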
diff --git a/test/MC/MachO/variable-exprs.s b/test/MC/MachO/variable-exprs.s
index 8eeb82f..a7fa45d 100644
--- a/test/MC/MachO/variable-exprs.s
+++ b/test/MC/MachO/variable-exprs.s
@@ -202,10 +202,10 @@ Lt0_x = Lt0_a - Lt0_b
// CHECK-I386: ),
// CHECK-I386: # Symbol 8
// CHECK-I386: (('n_strx', 1)
-// CHECK-I386: ('n_type', 0x1)
+// CHECK-I386: ('n_type', 0xb)
// CHECK-I386: ('n_sect', 0)
// CHECK-I386: ('n_desc', 0)
-// CHECK-I386: ('n_value', 0)
+// CHECK-I386: ('n_value', 4)
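+// n_type 0xb is N_INDR | N_EXT: 'd2' is now an indirect symbol whose n_value is the string-table offset of its target.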
// CHECK-I386: ('_string', 'd2')
// CHECK-I386: ),
// CHECK-I386: # Symbol 9
@@ -403,10 +403,10 @@ Lt0_x = Lt0_a - Lt0_b
// CHECK-X86_64: ),
// CHECK-X86_64: # Symbol 8
// CHECK-X86_64: (('n_strx', 1)
-// CHECK-X86_64: ('n_type', 0x1)
+// CHECK-X86_64: ('n_type', 0xb)
// CHECK-X86_64: ('n_sect', 0)
// CHECK-X86_64: ('n_desc', 0)
-// CHECK-X86_64: ('n_value', 0)
+// CHECK-X86_64: ('n_value', 4)
// CHECK-X86_64: ('_string', 'd2')
// CHECK-X86_64: ),
// CHECK-X86_64: # Symbol 9
diff --git a/test/MC/Mips/cpsetup-bad.s b/test/MC/Mips/cpsetup-bad.s
new file mode 100644
index 0000000..09252a1
--- /dev/null
+++ b/test/MC/Mips/cpsetup-bad.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mc %s -triple mips64-unknown-unknown 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+ .text
+ .option pic2
+t1:
+ .cpsetup $bar, 8, __cerror
+# ASM: :[[@LINE-1]]:18: error: expected register containing function address
+ .cpsetup $33, 8, __cerror
+# ASM: :[[@LINE-1]]:18: error: invalid register
+ .cpsetup $31, foo, __cerror
+# ASM: :[[@LINE-1]]:23: error: expected save register or stack offset
+ .cpsetup $31, $32, __cerror
+# ASM: :[[@LINE-1]]:23: error: invalid register
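+# (.cpsetup expects a register holding the function address, a save register or stack offset, and finally a symbol.)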
diff --git a/test/MC/Mips/eh-frame.s b/test/MC/Mips/eh-frame.s
index 1671598..d6b9cf0 100644
--- a/test/MC/Mips/eh-frame.s
+++ b/test/MC/Mips/eh-frame.s
@@ -31,7 +31,7 @@ func:
// MIPS32: 00000000
// Version
-// MIPS32: 01
+// MIPS32: 03
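+// (CIE version 3 is the DWARF 3 form of the call frame information.)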
// Augmentation String
// MIPS32: 7a5200
@@ -67,7 +67,7 @@ func:
// MIPS32EL: 00000000
// Version
-// MIPS32EL: 01
+// MIPS32EL: 03
// Augmentation String
// MIPS32EL: 7a5200
@@ -103,7 +103,7 @@ func:
// MIPS64: 00000000
// Version
-// MIPS64: 01
+// MIPS64: 03
// Augmentation String
// MIPS64: 7a5200
@@ -141,7 +141,7 @@ func:
// MIPS64EL: 00000000
// Version
-// MIPS64EL: 01
+// MIPS64EL: 03
// Augmentation String
// MIPS64EL: 7a5200
diff --git a/test/MC/Mips/elf_eflags.s b/test/MC/Mips/elf_eflags.s
index 8cf4960..36f4f9e 100644
--- a/test/MC/Mips/elf_eflags.s
+++ b/test/MC/Mips/elf_eflags.s
@@ -1,6 +1,12 @@
# These *MUST* match the output of gas compiled with the same triple and
# corresponding options (-mcpu=mips32 -> -mips32 for example).
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r6 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R6 %s
+# MIPSEL-MIPS64R6: Flags [ (0xA0001500)
+
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r6 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R6-NAN2008 %s
+# MIPSEL-MIPS64R6-NAN2008: Flags [ (0xA0001500)
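+# (R6 mandates the 2008 NaN encoding, so the flags here match the run without +nan2008 above; 0x400 is EF_MIPS_NAN2008.)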
+
# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2 %s
# MIPSEL-MIPS64R2: Flags [ (0x80001100)
@@ -13,6 +19,12 @@
# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64-NAN2008 %s
# MIPSEL-MIPS64-NAN2008: Flags [ (0x60001500)
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r6 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R6 %s
+# MIPSEL-MIPS32R6: Flags [ (0x90001400)
+
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r6 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R6-NAN2008 %s
+# MIPSEL-MIPS32R6-NAN2008: Flags [ (0x90001400)
+
# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2 %s
# MIPSEL-MIPS32R2: Flags [ (0x70001000)
@@ -55,12 +67,36 @@
# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,o32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32-NAN2008 %s
# MIPS64EL-MIPS64R2-O32-NAN2008: Flags [ (0x80001500)
+# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips5 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS5 %s
+# MIPS5: Flags [ (0x40000000)
+
+# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips5 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS5-NAN2008 %s
+# MIPS5-NAN2008: Flags [ (0x40000400)
+
# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips4 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS4 %s
# MIPS4: Flags [ (0x30000000)
# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips4 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS4-NAN2008 %s
# MIPS4-NAN2008: Flags [ (0x30000400)
+# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips3 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS3 %s
+# MIPS3: Flags [ (0x20000000)
+
+# RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips3 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS3-NAN2008 %s
+# MIPS3-NAN2008: Flags [ (0x20000400)
+
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS2 %s
+# MIPSEL-MIPS2: Flags [ (0x10001000)
+
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS2-NAN2008 %s
+# MIPSEL-MIPS2-NAN2008: Flags [ (0x10001400)
+
+# RUN: llvm-mc -filetype=obj -triple mips-unknown-linux -mcpu=mips1 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS1 %s
+# MIPS1: Flags [ (0x1000)
+
+# RUN: llvm-mc -filetype=obj -triple mips-unknown-linux -mcpu=mips1 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS1-NAN2008 %s
+# MIPS1-NAN2008: Flags [ (0x1400)
+
# RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=-n64,o32 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32 %s
# MIPS64EL-MIPS64-O32: Flags [ (0x60001100)
diff --git a/test/MC/Mips/lit.local.cfg b/test/MC/Mips/lit.local.cfg
index 1fa54b4..a3183a2 100644
--- a/test/MC/Mips/lit.local.cfg
+++ b/test/MC/Mips/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Mips' in targets:
+if not 'Mips' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/Mips/mips-abi-bad.s b/test/MC/Mips/mips-abi-bad.s
new file mode 100644
index 0000000..c4653cf
--- /dev/null
+++ b/test/MC/Mips/mips-abi-bad.s
@@ -0,0 +1,20 @@
+# Error checking for malformed ABI-related directives.
+# RUN: not llvm-mc -triple mips-unknown-unknown %s 2>&1 | FileCheck %s
+# CHECK: .text
+	.module fp=3
+# CHECK: mips-abi-bad.s:4:16: error: unsupported option
+# CHECK-NEXT: .module fp=3
+# CHECK-NEXT: ^
+
+	.set fp=xx,6
+# CHECK: mips-abi-bad.s:5:15: error: unexpected token in statement
+# CHECK-NEXT: .set fp=xx,6
+# CHECK-NEXT: ^
+
+# CHECK: .set mips16
+	.set mips16
+	.module fp=32
+
+# CHECK: mips-abi-bad.s:14:13: error: .module directive must come before any code
+# CHECK-NEXT: .module fp=32
+# CHECK-NEXT: ^
diff --git a/test/MC/Mips/mips-data-directives.s b/test/MC/Mips/mips-data-directives.s
index 630a807..8b3e0b3 100644
--- a/test/MC/Mips/mips-data-directives.s
+++ b/test/MC/Mips/mips-data-directives.s
@@ -12,7 +12,7 @@
# Checking if the data and relocations were correctly emitted
# CHECK-OBJ: Section {
-# CHECK-OBJ: Name: .data (51)
+# CHECK-OBJ: Name: .data (66)
# CHECK-OBJ: SectionData (
# CHECK-OBJ: 0000: DEADC0DE DEADC0DE DEADBEEF 00000000
# CHECK-OBJ: 0010: 00000000 00000000
@@ -20,7 +20,7 @@
# CHECK-OBJ: }
# CHECK-OBJ: Section {
-# CHECK-OBJ: Name: .rel.data (47)
+# CHECK-OBJ: Name: .rel.data (62)
# CHECK-OBJ: Relocations [
# CHECK-OBJ: 0xC R_MIPS_32 .data 0x0
# CHECK-OBJ: 0x10 R_MIPS_64 .data 0x0
diff --git a/test/MC/Mips/mips-expansions-bad.s b/test/MC/Mips/mips-expansions-bad.s
new file mode 100644
index 0000000..a137deb
--- /dev/null
+++ b/test/MC/Mips/mips-expansions-bad.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .text
+ li $5, 0x100000000 # CHECK: :[[@LINE]]:9: error: instruction requires a CPU feature not currently enabled
+ dli $5, 1 # CHECK: :[[@LINE]]:9: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s
index 1622965..f0a04a5 100644
--- a/test/MC/Mips/mips-expansions.s
+++ b/test/MC/Mips/mips-expansions.s
@@ -8,6 +8,8 @@
# CHECK: addiu $6, $zero, -2345 # encoding: [0xd7,0xf6,0x06,0x24]
# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addiu $8, $zero, -8 # encoding: [0xf8,0xff,0x08,0x24]
+
# CHECK: addiu $4, $zero, 20 # encoding: [0x14,0x00,0x04,0x24]
# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
@@ -32,17 +34,28 @@
# CHECK: addu $1, $1, $9 # encoding: [0x21,0x08,0x29,0x00]
# CHECK: sw $10, 57920($1) # encoding: [0x40,0xe2,0x2a,0xac]
+# CHECK: lui $1, %hi(symbol)
+# CHECK: ldc1 $f0, %lo(symbol)($1)
+# CHECK: lui $1, %hi(symbol)
+# CHECK: sdc1 $f0, %lo(symbol)($1)
+
li $5,123
li $6,-2345
li $7,65538
+ li $8, ~7
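+# ~7 evaluates to -8, which fits in 16 bits, so this li expands to a single addiu.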
la $a0, 20
la $7,65538
la $a0, 20($a1)
la $7,65538($8)
+ .set noat
lw $t2, symbol($a0)
+ .set at
sw $t2, symbol($t1)
lw $t2, 655483($a0)
sw $t2, 123456($t1)
+
+ ldc1 $f0, symbol
+ sdc1 $f0, symbol
diff --git a/test/MC/Mips/mips-noat.s b/test/MC/Mips/mips-noat.s
index b83c517..07db251 100644
--- a/test/MC/Mips/mips-noat.s
+++ b/test/MC/Mips/mips-noat.s
@@ -10,11 +10,10 @@
test1:
lw $2, 65536($2)
-# FIXME: It would be better if the error pointed at the mnemonic instead of the newline
-# ERROR: mips-noat.s:[[@LINE+4]]:1: error: Pseudo instruction requires $at, which is not available
test2:
.set noat
- lw $2, 65536($2)
+ lw $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: Pseudo instruction requires $at, which is not available
+
# Can we switch it back on successfully?
# CHECK-LABEL: test3:
@@ -25,10 +24,6 @@ test3:
.set at
lw $2, 65536($2)
-# FIXME: It would be better if the error pointed at the mnemonic instead of the newline
-# ERROR: mips-noat.s:[[@LINE+4]]:1: error: Pseudo instruction requires $at, which is not available
test4:
.set at=$0
- lw $2, 65536($2)
-
-# ERROR-NOT: error
+ lw $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: Pseudo instruction requires $at, which is not available
diff --git a/test/MC/Mips/mips1/invalid-mips2.s b/test/MC/Mips/mips1/invalid-mips2.s
index 6c3e80a..7db261d 100644
--- a/test/MC/Mips/mips1/invalid-mips2.s
+++ b/test/MC/Mips/mips1/invalid-mips2.s
@@ -21,3 +21,4 @@
tnei $t4,-29647 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
trunc.w.d $f22,$f15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
trunc.w.s $f28,$f30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
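+# (sync is a MIPS II addition, hence rejected under -mcpu=mips1.)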
diff --git a/test/MC/Mips/mips1/invalid-mips32.s b/test/MC/Mips/mips1/invalid-mips32.s
new file mode 100644
index 0000000..4ad8d63
--- /dev/null
+++ b/test/MC/Mips/mips1/invalid-mips32.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips1 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+
+ sync 0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
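+# (Only bare sync predates MIPS32; the form with an stype operand needs mips32 or later.)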
diff --git a/test/MC/Mips/mips1/invalid-mips4.s b/test/MC/Mips/mips1/invalid-mips4.s
index 61aaf58..9f246bc 100644
--- a/test/MC/Mips/mips1/invalid-mips4.s
+++ b/test/MC/Mips/mips1/invalid-mips4.s
@@ -4,7 +4,9 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ceil.l.d $f1,$f3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.l.s $f18,$f13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.w.d $f11,$f25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -50,15 +52,20 @@
floor.w.s $f8,$f9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ldxc1 $f8,$s7($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f10,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f10,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f10,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f26,$f20,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips1/invalid-mips5.s b/test/MC/Mips/mips1/invalid-mips5.s
index 1eddf02..af5b278 100644
--- a/test/MC/Mips/mips1/invalid-mips5.s
+++ b/test/MC/Mips/mips1/invalid-mips5.s
@@ -4,7 +4,9 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ceil.l.d $f1,$f3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.l.s $f18,$f13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.w.d $f11,$f25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -49,15 +51,20 @@
ldxc1 $f8,$s7($t3) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
luxc1 $f19,$s6($s5) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$a0,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f10,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f10,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips1/valid.s b/test/MC/Mips/mips1/valid.s
index 473e6b9..66e11ba 100644
--- a/test/MC/Mips/mips1/valid.s
+++ b/test/MC/Mips/mips1/valid.s
@@ -9,8 +9,18 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
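+# (Two-operand forms reuse the destination as the first source, hence andi $2, $2, 4.)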
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
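+# (bgezal with $zero is the canonical form of bal; the two assemble identically.)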
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -36,7 +46,7 @@
li $zero,-29889
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwc3 $10,-32265($k0)
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
@@ -65,6 +75,7 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
sb $s6,-19857($14)
sh $14,-6704($15)
sll $a3,18 # CHECK: sll $7, $7, 18 # encoding: [0x00,0x07,0x3c,0x80]
@@ -91,7 +102,7 @@
subu $sp,$s6,$s6
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swc3 $10,-32265($k0)
swl $15,13694($s3)
swr $s1,-26590($14)
diff --git a/test/MC/Mips/mips2/invalid-mips3-wrong-error.s b/test/MC/Mips/mips2/invalid-mips3-wrong-error.s
index a3f829b..3eb4ef3 100644
--- a/test/MC/Mips/mips2/invalid-mips3-wrong-error.s
+++ b/test/MC/Mips/mips2/invalid-mips3-wrong-error.s
@@ -7,7 +7,6 @@
.set noat
dmult $s7,$a5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
- dsub $a3,$s6,$a4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ld $sp,-28645($s1) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ldl $t8,-4167($t8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ldr $t2,-30358($s4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/test/MC/Mips/mips2/invalid-mips3.s b/test/MC/Mips/mips2/invalid-mips3.s
index ef498d7..458c416 100644
--- a/test/MC/Mips/mips2/invalid-mips3.s
+++ b/test/MC/Mips/mips2/invalid-mips3.s
@@ -38,6 +38,7 @@
dsrl32 $s3,23 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
dsrl32 $s3,$6,23 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
dsrlv $s3,$t2,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dsub $a3,$s6,$a4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
dsubu $a1,$a1,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
eret # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
floor.l.d $f26,$f7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips2/invalid-mips32.s b/test/MC/Mips/mips2/invalid-mips32.s
index 2975c68..43ea345 100644
--- a/test/MC/Mips/mips2/invalid-mips32.s
+++ b/test/MC/Mips/mips2/invalid-mips32.s
@@ -1,28 +1,38 @@
# Instructions that are invalid
#
-# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips2 \
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips2 \
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
clo $11,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
clz $sp,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
deret # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
eret # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jr.hb $4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jalr.hb $4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jalr.hb $4, $5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
madd $s6,$13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
madd $zero,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
maddu $s3,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
maddu $24,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
mfc0 $a2,$14,1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -30,3 +40,5 @@
msubu $15,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
mtc0 $9,$29,3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
mul $s0,$s4,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips2/invalid-mips32r2.s b/test/MC/Mips/mips2/invalid-mips32r2.s
index 37f2eed..72a570a 100644
--- a/test/MC/Mips/mips2/invalid-mips32r2.s
+++ b/test/MC/Mips/mips2/invalid-mips32r2.s
@@ -1,10 +1,12 @@
# Instructions that are invalid
#
-# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips2 \
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips2 \
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
clo $t3,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
clz $sp,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
cvt.l.d $f24,$f15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -24,15 +26,20 @@
maddu $t8,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
mfc0 $a2,$14,1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
mfhc1 $s8,$f24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$t0,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$t1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$t1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips2/invalid-mips4.s b/test/MC/Mips/mips2/invalid-mips4.s
index e2eb672..13923f0 100644
--- a/test/MC/Mips/mips2/invalid-mips4.s
+++ b/test/MC/Mips/mips2/invalid-mips4.s
@@ -4,7 +4,9 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ceil.l.d $f1,$f3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.l.s $f18,$f13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
cvt.d.l $f4,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -46,15 +48,20 @@
floor.l.s $f12,$f5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ldxc1 $f8,$s7($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips2/invalid-mips5.s b/test/MC/Mips/mips2/invalid-mips5.s
index f777ffe..8f460c7 100644
--- a/test/MC/Mips/mips2/invalid-mips5.s
+++ b/test/MC/Mips/mips2/invalid-mips5.s
@@ -4,7 +4,9 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ceil.l.d $f1,$f3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
ceil.l.s $f18,$f13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
cvt.d.l $f4,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -45,15 +47,20 @@
ldxc1 $f8,$s7($t3) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
luxc1 $f19,$s6($s5) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$a0,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$a0,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$a0,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips2/valid.s b/test/MC/Mips/mips2/valid.s
index e3effde..9c3706e 100644
--- a/test/MC/Mips/mips2/valid.s
+++ b/test/MC/Mips/mips2/valid.s
@@ -9,8 +9,18 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -35,16 +45,16 @@
lb $24,-14515($10)
lbu $8,30195($v1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldc3 $29,-28645($s1)
lh $11,-8556($s5)
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwc3 $10,-32265($k0)
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
@@ -73,12 +83,13 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdc3 $12,5835($10)
sh $14,-6704($15)
sll $a3,18 # CHECK: sll $7, $7, 18 # encoding: [0x00,0x07,0x3c,0x80]
@@ -107,10 +118,11 @@
subu $sp,$s6,$s6
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swc3 $10,-32265($k0)
swl $15,13694($s3)
swr $s1,-26590($14)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
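[Note on the additions to this file: they mostly exercise assembler conveniences rather than new hardware. The expression ~1 is folded to -2 before encoding, the two-operand forms "and $2,4" and "or $2,4" expand to andi/ori with the destination repeated as the first source, and bgezal with $zero canonicalizes to its bal alias. A sketch of the REGIMM decoding behind the branch-and-link encodings above; the field layout is the standard MIPS opcode/rs/rt/offset split, inferred here rather than stated by the patch:

    # REGIMM: opcode=0b000001 | rs(5) | rt=link-op(5) | (offset>>2)(16)
    bal    21100        # == bgezal $zero, 21100 -> 0x04,0x11,0x14,0x9b (rs=0, rt=0b10001)
    bgezal $6, 21100    # -> 0x04,0xd1,0x14,0x9b (rs=6, rt=0b10001)
    bltzal $6, 21100    # -> 0x04,0xd0,0x14,0x9b (rs=6, rt=0b10000)
    # 21100 >> 2 = 5275 = 0x149b fills the 16-bit offset field.
]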
diff --git a/test/MC/Mips/mips3/invalid-mips32.s b/test/MC/Mips/mips3/invalid-mips32.s
new file mode 100644
index 0000000..3acd765
--- /dev/null
+++ b/test/MC/Mips/mips3/invalid-mips32.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips3 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+
+ sync 0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
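[Note: this new test pins down that only the zero-operand sync exists before MIPS32; the optional stype operand is a MIPS32 addition. That reading follows from this patch itself: mips2/valid.s above gains plain sync with encoding 0x0000000f, and mips32/valid.s below accepts sync 1 as 0x0000004f. A minimal sketch of the split, assuming -mcpu=mips3:

    sync        # accepted: plain SYNC, encoding 0x00,0x00,0x00,0x0f
    sync 1      # rejected: the 5-bit stype field (bits 10:6) requires MIPS32 or later
]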
diff --git a/test/MC/Mips/mips3/invalid-mips4.s b/test/MC/Mips/mips3/invalid-mips4.s
index 6e15d79..9cd92d3 100644
--- a/test/MC/Mips/mips3/invalid-mips4.s
+++ b/test/MC/Mips/mips3/invalid-mips4.s
@@ -4,20 +4,27 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ldxc1 $f8,$s7($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
sdxc1 $f11,$10($14) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
swxc1 $f19,$12($k0) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips3/invalid-mips5.s b/test/MC/Mips/mips3/invalid-mips5.s
index d25621b..307eee8 100644
--- a/test/MC/Mips/mips3/invalid-mips5.s
+++ b/test/MC/Mips/mips3/invalid-mips5.s
@@ -4,19 +4,26 @@
# RUN: 2>%t1
# RUN: FileCheck %s < %t1
- .set noat
+ .set noat
+ bc1f $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ bc1t $fcc1, 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
ldxc1 $f8,$s7($t3) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
luxc1 $f19,$s6($s5) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf $gp,$a4,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.d $f6,$f11,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+ movf.s $f23,$f5,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
- movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
movz $a1,$s6,$a5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.d $f12,$f29,$a5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips3/valid.s b/test/MC/Mips/mips3/valid.s
index 2067666..cb209fd 100644
--- a/test/MC/Mips/mips3/valid.s
+++ b/test/MC/Mips/mips3/valid.s
@@ -9,8 +9,19 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -32,7 +43,11 @@
cvt.w.d $f20,$f14
cvt.w.s $f20,$f24
dadd $s3,$at,$ra
+ dadd $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ dadd $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddi $sp,$s4,-27705
+ daddi $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ daddi $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddiu $k0,$s6,-4586
daddu $s3,$at,$ra
ddiv $zero,$k0,$s3
@@ -64,6 +79,10 @@
dsrl32 $s3,$6,23 # CHECK: dsrl32 $19, $6, 23 # encoding: [0x00,0x06,0x9d,0xfe]
dsrlv $s3,$6,$s4 # CHECK: dsrlv $19, $6, $20 # encoding: [0x02,0x86,0x98,0x16]
dsub $a3,$s6,$8
+ dsub $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsub $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubi $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsubi $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
dsubu $a1,$a1,$k0
ehb # CHECK: ehb # encoding: [0x00,0x00,0x00,0xc0]
eret
@@ -75,18 +94,18 @@
lbu $8,30195($v1)
ld $sp,-28645($s1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldl $24,-4167($24)
ldr $14,-30358($s4)
lh $11,-8556($s5)
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
- lld $zero,-14736($ra)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
+ lld $zero,-14736($ra) # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwu $s3,-24086($v1)
@@ -117,16 +136,17 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
round.l.d $f12,$f1
round.l.s $f25,$f5
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
- scd $15,-8243($sp)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ scd $15,-8243($sp) # CHECK: scd $15, -8243($sp) # encoding: [0xf3,0xaf,0xdf,0xcd]
sd $12,5835($10)
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdl $a3,-20961($s8)
sdr $11,-20423($12)
sh $14,-6704($15)
@@ -156,9 +176,10 @@
subu $sp,$s6,$s6
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
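[Note: the dsub/dsubi lines with immediates document a pseudo-instruction rather than a real encoding. There is no immediate form of DSUB, so the assembler negates the constant and emits daddi; the values in the hunk above confirm it (27705 = 0x6c39, while -27705 encodes as 0x93c7 in the daddi lines):

    dsub  $sp, $s4, -27705    # emitted as: daddi $sp, $20, 27705
    dsubi $sp, -27705         # two-operand form: daddi $sp, $sp, 27705
]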
diff --git a/test/MC/Mips/mips32/abiflags.s b/test/MC/Mips/mips32/abiflags.s
new file mode 100644
index 0000000..896dd84
--- /dev/null
+++ b/test/MC/Mips/mips32/abiflags.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=32
+
+# Check that the .MIPS.abiflags section was emitted correctly.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002001 01010001 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+ .module fp=32
+
+# FIXME: This test should include the gnu_attributes directive once it is implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
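[Note: for reference, the 24 bytes checked above decode as follows, assuming the standard ABIFlags_v0 layout; the field breakdown is inferred, since the test itself only compares raw bytes:

    # 00 00         version   = 0
    # 20 01         isa_level = 32 (0x20), isa_rev = 1
    # 01 01 00 01   gpr_size, cpr1_size, cpr2_size, fp_abi (1 = double-float, matching fp=32)
    # 16 x 00       isa_ext, ases, flags1, flags2 all zero

The mips32r2 variant of this test below differs only in isa_rev (00002002).]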
diff --git a/test/MC/Mips/mips32/valid.s b/test/MC/Mips/mips32/valid.s
index bc29bdc..d330905 100644
--- a/test/MC/Mips/mips32/valid.s
+++ b/test/MC/Mips/mips32/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -18,8 +31,8 @@
ceil.w.d $f11,$f25
ceil.w.s $f6,$f20
cfc1 $s1,$21
- clo $11,$a1
- clz $sp,$gp
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x70,0xab,0x58,0x21]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x73,0x9d,0xe8,0x20]
ctc1 $a2,$26
cvt.d.s $f22,$f28
cvt.d.w $f26,$f11
@@ -39,15 +52,15 @@
lb $24,-14515($10)
lbu $8,30195($v1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
lh $11,-8556($s5)
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
madd $s6,$13
@@ -96,12 +109,16 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ sdbbp # CHECK: sdbbp # encoding: [0x70,0x00,0x00,0x3f]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x70,0x00,0x08,0xbf]
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sh $14,-6704($15)
sll $a3,18 # CHECK: sll $7, $7, 18 # encoding: [0x00,0x07,0x3c,0x80]
sll $a3,$zero,18 # CHECK: sll $7, $zero, 18 # encoding: [0x00,0x00,0x3c,0x80]
@@ -129,9 +146,11 @@
subu $sp,$s6,$s6
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips32r2/abiflags.s b/test/MC/Mips/mips32r2/abiflags.s
new file mode 100644
index 0000000..41a809a
--- /dev/null
+++ b/test/MC/Mips/mips32r2/abiflags.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=32
+# CHECK-ASM: .set fp=64
+
+# Check that the .MIPS.abiflags section was emitted correctly.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002002 01010001 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+ .module fp=32
+ .set fp=64
+# FIXME: This test should include the gnu_attributes directive once it is implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips32r2/invalid.s b/test/MC/Mips/mips32r2/invalid.s
new file mode 100644
index 0000000..ebccc43
--- /dev/null
+++ b/test/MC/Mips/mips32r2/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the
+# assembler (e.g. an invalid set of operands or operand restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+ .text
+ .set noreorder
+ jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
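[Note: the restriction tested here follows from jalr.hb writing the return address to its destination register. With the one-operand form the destination is implicitly $ra ($31), so both lines above end up with rs equal to rd, a combination the MIPS manuals describe as unpredictable for the hazard-barrier variant (that claim is from the manuals, not this patch). A sketch of the accepted versus rejected shapes:

    jalr.hb $25         # ok: implicit rd = $ra, rs = $25
    jalr.hb $31         # rejected: implicit rd = $ra = $31 = rs
    jalr.hb $31, $31    # rejected: explicit rd == rs
]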
diff --git a/test/MC/Mips/mips32r2/valid.s b/test/MC/Mips/mips32r2/valid.s
index 26f8b6b..631c691 100644
--- a/test/MC/Mips/mips32r2/valid.s
+++ b/test/MC/Mips/mips32r2/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -18,8 +31,8 @@
ceil.w.d $f11,$f25
ceil.w.s $f6,$f20
cfc1 $s1,$21
- clo $11,$a1
- clz $sp,$gp
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x70,0xab,0x58,0x21]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x73,0x9d,0xe8,0x20]
ctc1 $a2,$26
cvt.d.s $f22,$f28
cvt.d.w $f26,$f11
@@ -40,20 +53,23 @@
eret
floor.w.d $f14,$f11
floor.w.s $f8,$f9
+ jr.hb $4 # CHECK: jr.hb $4 # encoding: [0x00,0x80,0x04,0x08]
+ jalr.hb $4 # CHECK: jalr.hb $4 # encoding: [0x00,0x80,0xfc,0x09]
+ jalr.hb $4, $5 # CHECK: jalr.hb $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
lb $24,-14515($10)
lbu $8,30195($v1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldxc1 $f8,$s7($15)
lh $11,-8556($s5)
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
luxc1 $f19,$s6($s5)
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwxc1 $f12,$s1($s8)
@@ -113,7 +129,9 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
pause # CHECK: pause # encoding: [0x00,0x00,0x01,0x40]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
rdhwr $sp,$11
rotr $1,15 # CHECK: rotr $1, $1, 15 # encoding: [0x00,0x21,0x0b,0xc2]
rotr $1,$14,15 # CHECK: rotr $1, $14, 15 # encoding: [0x00,0x2e,0x0b,0xc2]
@@ -121,9 +139,11 @@
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ sdbbp # CHECK: sdbbp # encoding: [0x70,0x00,0x00,0x3f]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x70,0x00,0x08,0xbf]
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdxc1 $f11,$10($14)
seb $25,$15
seh $v1,$12
@@ -155,10 +175,12 @@
suxc1 $f12,$k1($13)
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
swxc1 $f19,$12($k0)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s
index aee068a..52fa5f5 100644
--- a/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s
@@ -5,6 +5,8 @@
# RUN: FileCheck %s < %t1
.set noat
+ bc2f 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
lwl $s4,-4231($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
lwr $zero,-19147($gp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
swl $15,13694($s3) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/test/MC/Mips/mips32r6/invalid-mips1.s b/test/MC/Mips/mips32r6/invalid-mips1.s
index aa7d407..44d4fbb 100644
--- a/test/MC/Mips/mips32r6/invalid-mips1.s
+++ b/test/MC/Mips/mips32r6/invalid-mips1.s
@@ -6,3 +6,19 @@
.set noat
addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.ngl.d $f29,$f29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.ngle.d $f0,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.sf.d $f30,$f0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.sf.s $f14,$f22 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips32r6/invalid-mips2.s b/test/MC/Mips/mips32r6/invalid-mips2.s
index 0638e78..bfa2c4c 100644
--- a/test/MC/Mips/mips32r6/invalid-mips2.s
+++ b/test/MC/Mips/mips32r6/invalid-mips2.s
@@ -6,9 +6,21 @@
.set noat
addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
teqi $s5,-17504 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgei $s1,5025 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgeiu $sp,-28621 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tlti $14,-21059 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tltiu $ra,-5076 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tnei $12,-29647 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s
index e416a20..e63bdd4 100644
--- a/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s
@@ -10,6 +10,10 @@
bc1tl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
bc1fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
bc1fl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2f 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2f $fcc0,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t $fcc0,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
bc2tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
bc2tl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
bc2fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips32r6/invalid-mips32.s b/test/MC/Mips/mips32r6/invalid-mips32.s
new file mode 100644
index 0000000..e0889ea
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips32.s
@@ -0,0 +1,25 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ madd $s6,$13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ madd $zero,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ maddu $s3,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ maddu $24,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ msub $s7,$k1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ msubu $15,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips32r6/invalid-mips32r2.s b/test/MC/Mips/mips32r6/invalid-mips32r2.s
new file mode 100644
index 0000000..25694e3
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips32r2.s
@@ -0,0 +1,15 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ madd.d $f18,$f19,$f26,$f20 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ madd.s $f1,$f31,$f19,$f25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ msub.d $f10,$f1,$f31,$f18 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ msub.s $f12,$f19,$f10,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ nmadd.d $f18,$f9,$f14,$f19 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ nmadd.s $f0,$f5,$f25,$f12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ nmsub.d $f30,$f8,$f16,$f30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ nmsub.s $f1,$f24,$f19,$f4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s
new file mode 100644
index 0000000..f3131a9
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s
@@ -0,0 +1,21 @@
+# Instructions that are invalid and are correctly rejected but use the wrong
+# error message at the moment.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ beql $1,$2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgezall $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgezl $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgtzl $4,16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ blezl $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bltzall $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bltzl $4,16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bnel $1,$2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ prefx 0,$2($31) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips32r6/invalid-mips4.s b/test/MC/Mips/mips32r6/invalid-mips4.s
new file mode 100644
index 0000000..8ba2ed8
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips4.s
@@ -0,0 +1,11 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ ldxc1 $f8,$s7($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sdxc1 $f11,$10($14) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ swxc1 $f19,$12($k0) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s
new file mode 100644
index 0000000..99d10c3
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s
@@ -0,0 +1,11 @@
+# Instructions that are invalid but currently emit the wrong error message.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ bc1any2f $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any2t $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any4f $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any4t $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips32r6/invalid-mips5.s b/test/MC/Mips/mips32r6/invalid-mips5.s
new file mode 100644
index 0000000..63f1cca
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid-mips5.s
@@ -0,0 +1,9 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ luxc1 $f19,$s6($s5) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ suxc1 $f12,$k1($13) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips32r6/invalid.s b/test/MC/Mips/mips32r6/invalid.s
new file mode 100644
index 0000000..82cb5ab
--- /dev/null
+++ b/test/MC/Mips/mips32r6/invalid.s
@@ -0,0 +1,14 @@
+# Instructions that are available for the current ISA but should be rejected by
+# the assembler (e.g. an invalid set of operands or operand restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -mcpu=mips32r6 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+ .text
+ .set noreorder
+ .set noat
+ jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ ldc2 $8,-21181($at) # ASM: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sdc2 $20,23157($s2) # ASM: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ swc2 $25,24880($s0) # ASM: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
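[Note: the ldc2/sdc2/swc2 rejections here are a side effect of the R6 re-encoding of the COP2 loads and stores. The mips32r6 valid.s below accepts the same mnemonics with small offsets under a COP2-major encoding (first byte 0x49) whose offset field appears to be only 11 bits wide, so the old 16-bit offsets such as -21181 no longer fit and currently fall through to the misleading feature error. A sketch, with the 11-bit width inferred from the encodings in valid.s rather than stated by the patch:

    ldc2 $8, -701($at)      # accepted on r6: 0x49,0xc8,0x0d,0x43, offset fits 11 signed bits
    ldc2 $8, -21181($at)    # rejected on r6: -21181 is outside [-1024, 1023]
]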
diff --git a/test/MC/Mips/mips32r6/relocations.s b/test/MC/Mips/mips32r6/relocations.s
index 4532e42..13b3387 100644
--- a/test/MC/Mips/mips32r6/relocations.s
+++ b/test/MC/Mips/mips32r6/relocations.s
@@ -5,6 +5,9 @@
#------------------------------------------------------------------------------
# Check that the assembler can handle the documented syntax for fixups.
#------------------------------------------------------------------------------
+# CHECK-FIXUP: addiupc $2, bar # encoding: [0xec,0b01000AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
# CHECK-FIXUP: beqc $5, $6, bar # encoding: [0x20,0xa6,A,A]
# CHECK-FIXUP: # fixup A - offset: 0,
# CHECK-FIXUP: value: bar, kind: fixup_Mips_PC16
@@ -31,20 +34,30 @@
# CHECK-FIXUP: # fixup A - offset: 0,
# CHECK-FIXUP: value: bar@PCREL_LO16,
# CHECK-FIXUP: kind: fixup_MIPS_PCLO16
+# CHECK-FIXUP: lwpc $2, bar # encoding: [0xec,0b01001AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
+# CHECK-FIXUP: lwupc $2, bar # encoding: [0xec,0b01010AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
#------------------------------------------------------------------------------
# Check that the appropriate relocations were created.
#------------------------------------------------------------------------------
# CHECK-ELF: Relocations [
-# CHECK-ELF: 0x0 R_MIPS_PC16 bar 0x0
+# CHECK-ELF: 0x0 R_MIPS_PC19_S2 bar 0x0
# CHECK-ELF: 0x4 R_MIPS_PC16 bar 0x0
-# CHECK-ELF: 0x8 R_MIPS_PC21_S2 bar 0x0
+# CHECK-ELF: 0x8 R_MIPS_PC16 bar 0x0
# CHECK-ELF: 0xC R_MIPS_PC21_S2 bar 0x0
-# CHECK-ELF: 0x10 R_MIPS_PC26_S2 bar 0x0
+# CHECK-ELF: 0x10 R_MIPS_PC21_S2 bar 0x0
# CHECK-ELF: 0x14 R_MIPS_PC26_S2 bar 0x0
-# CHECK-ELF: 0x18 R_MIPS_PCHI16 bar 0x0
-# CHECK-ELF: 0x1C R_MIPS_PCLO16 bar 0x0
+# CHECK-ELF: 0x18 R_MIPS_PC26_S2 bar 0x0
+# CHECK-ELF: 0x1C R_MIPS_PCHI16 bar 0x0
+# CHECK-ELF: 0x20 R_MIPS_PCLO16 bar 0x0
+# CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar 0x0
+# CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar 0x0
# CHECK-ELF: ]
+ addiupc $2,bar
beqc $5, $6, bar
bnec $5, $6, bar
beqzc $9, bar
@@ -53,3 +66,5 @@
bc bar
aluipc $2, %pcrel_hi(bar)
addiu $2, $2, %pcrel_lo(bar)
+ lwpc $2,bar
+ lwupc $2,bar
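[Note: the three new fixups all map to R_MIPS_PC19_S2, which matches the encodings above: addiupc, lwpc, and lwupc each leave 19 A-bits for the immediate (0xec,0b01000AAA,A,A), and the _S2 suffix means the offset is scaled by 4, giving roughly a +/-1 MiB PC-relative reach. The reach figure is derived from the field width, not stated in the patch:

    addiupc $2, bar    # 19-bit immediate, word-scaled -> R_MIPS_PC19_S2
    lwpc    $2, bar    # same fixup and relocation
    lwupc   $2, bar    # same fixup and relocation
]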
diff --git a/test/MC/Mips/mips32r6/valid.s b/test/MC/Mips/mips32r6/valid.s
index 5b4b928..f23dbd7 100644
--- a/test/MC/Mips/mips32r6/valid.s
+++ b/test/MC/Mips/mips32r6/valid.s
@@ -10,15 +10,18 @@
# rs > rt
# appropriately for each branch instruction
#
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 | FileCheck %s
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 2> %t0 | FileCheck %s
+# RUN: FileCheck %s -check-prefix=WARNING < %t0
.set noat
# FIXME: Add the instructions carried forward from older ISAs
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
addiupc $4, 100 # CHECK: addiupc $4, 100 # encoding: [0xec,0x80,0x00,0x19]
align $4, $2, $3, 2 # CHECK: align $4, $2, $3, 2 # encoding: [0x7c,0x43,0x22,0xa0]
aluipc $3, 56 # CHECK: aluipc $3, 56 # encoding: [0xec,0x7f,0x00,0x38]
aui $3,$2,-23 # CHECK: aui $3, $2, -23 # encoding: [0x3c,0x62,0xff,0xe9]
auipc $3, -1 # CHECK: auipc $3, -1 # encoding: [0xec,0x7e,0xff,0xff]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
balc 14572256 # CHECK: balc 14572256 # encoding: [0xe8,0x37,0x96,0xb8]
bc 14572256 # CHECK: bc 14572256 # encoding: [0xc8,0x37,0x96,0xb8]
bc1eqz $f0,4 # CHECK: bc1eqz $f0, 4 # encoding: [0x45,0x20,0x00,0x01]
@@ -38,6 +41,8 @@
bnec $5, $6, 256 # CHECK: bnec $5, $6, 256 # encoding: [0x60,0xa6,0x00,0x40]
bnezalc $2, 1332 # CHECK: bnezalc $2, 1332 # encoding: [0x60,0x02,0x01,0x4d]
beqzc $5, 72256 # CHECK: beqzc $5, 72256 # encoding: [0xd8,0xa0,0x46,0x90]
+ bgec $2, $3, 256 # CHECK: bgec $2, $3, 256 # encoding: [0x58,0x43,0x00,0x40]
+ bgeuc $2, $3, 256 # CHECK: bgeuc $2, $3, 256 # encoding: [0x18,0x43,0x00,0x40]
bgezalc $2, 1332 # CHECK: bgezalc $2, 1332 # encoding: [0x18,0x42,0x01,0x4d]
bnezc $5, 72256 # CHECK: bnezc $5, 72256 # encoding: [0xf8,0xa0,0x46,0x90]
bltzc $5, 256 # CHECK: bltzc $5, 256 # encoding: [0x5c,0xa5,0x00,0x40]
@@ -48,6 +53,8 @@
bgtzc $5, 256 # CHECK: bgtzc $5, 256 # encoding: [0x5c,0x05,0x00,0x40]
bitswap $4, $2 # CHECK: bitswap $4, $2 # encoding: [0x7c,0x02,0x20,0x20]
blezalc $2, 1332 # CHECK: blezalc $2, 1332 # encoding: [0x18,0x02,0x01,0x4d]
+ bltc $5, $6, 256 # CHECK: bltc $5, $6, 256 # encoding: [0x5c,0xa6,0x00,0x40]
+ bltuc $5, $6, 256 # CHECK: bltuc $5, $6, 256 # encoding: [0x1c,0xa6,0x00,0x40]
# bnvc requires that rs >= rt but we accept both. See also bnec
bnvc $0, $0, 4 # CHECK: bnvc $zero, $zero, 4 # encoding: [0x60,0x00,0x00,0x01]
bnvc $2, $0, 4 # CHECK: bnvc $2, $zero, 4 # encoding: [0x60,0x40,0x00,0x01]
@@ -56,47 +63,49 @@
bovc $0, $0, 4 # CHECK: bovc $zero, $zero, 4 # encoding: [0x20,0x00,0x00,0x01]
bovc $2, $0, 4 # CHECK: bovc $2, $zero, 4 # encoding: [0x20,0x40,0x00,0x01]
bovc $4, $2, 4 # CHECK: bovc $4, $2, 4 # encoding: [0x20,0x82,0x00,0x01]
- cmp.f.s $f2,$f3,$f4 # CHECK: cmp.f.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x80]
- cmp.f.d $f2,$f3,$f4 # CHECK: cmp.f.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x80]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0x7c,0xa1,0x04,0x25]
+ cmp.af.s $f2,$f3,$f4 # CHECK: cmp.af.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x80]
+ cmp.af.d $f2,$f3,$f4 # CHECK: cmp.af.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x80]
cmp.un.s $f2,$f3,$f4 # CHECK: cmp.un.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x81]
cmp.un.d $f2,$f3,$f4 # CHECK: cmp.un.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x81]
cmp.eq.s $f2,$f3,$f4 # CHECK: cmp.eq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x82]
cmp.eq.d $f2,$f3,$f4 # CHECK: cmp.eq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x82]
cmp.ueq.s $f2,$f3,$f4 # CHECK: cmp.ueq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x83]
cmp.ueq.d $f2,$f3,$f4 # CHECK: cmp.ueq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x83]
- cmp.olt.s $f2,$f3,$f4 # CHECK: cmp.olt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x84]
- cmp.olt.d $f2,$f3,$f4 # CHECK: cmp.olt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x84]
+ cmp.lt.s $f2,$f3,$f4 # CHECK: cmp.lt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x84]
+ cmp.lt.d $f2,$f3,$f4 # CHECK: cmp.lt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x84]
cmp.ult.s $f2,$f3,$f4 # CHECK: cmp.ult.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x85]
cmp.ult.d $f2,$f3,$f4 # CHECK: cmp.ult.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x85]
- cmp.ole.s $f2,$f3,$f4 # CHECK: cmp.ole.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x86]
- cmp.ole.d $f2,$f3,$f4 # CHECK: cmp.ole.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x86]
+ cmp.le.s $f2,$f3,$f4 # CHECK: cmp.le.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x86]
+ cmp.le.d $f2,$f3,$f4 # CHECK: cmp.le.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x86]
cmp.ule.s $f2,$f3,$f4 # CHECK: cmp.ule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x87]
cmp.ule.d $f2,$f3,$f4 # CHECK: cmp.ule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x87]
- cmp.sf.s $f2,$f3,$f4 # CHECK: cmp.sf.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x88]
- cmp.sf.d $f2,$f3,$f4 # CHECK: cmp.sf.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x88]
- cmp.ngle.s $f2,$f3,$f4 # CHECK: cmp.ngle.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x89]
- cmp.ngle.d $f2,$f3,$f4 # CHECK: cmp.ngle.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x89]
+ cmp.saf.s $f2,$f3,$f4 # CHECK: cmp.saf.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x88]
+ cmp.saf.d $f2,$f3,$f4 # CHECK: cmp.saf.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x88]
+ cmp.sun.s $f2,$f3,$f4 # CHECK: cmp.sun.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x89]
+ cmp.sun.d $f2,$f3,$f4 # CHECK: cmp.sun.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x89]
cmp.seq.s $f2,$f3,$f4 # CHECK: cmp.seq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8a]
cmp.seq.d $f2,$f3,$f4 # CHECK: cmp.seq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8a]
- cmp.ngl.s $f2,$f3,$f4 # CHECK: cmp.ngl.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8b]
- cmp.ngl.d $f2,$f3,$f4 # CHECK: cmp.ngl.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8b]
- cmp.lt.s $f2,$f3,$f4 # CHECK: cmp.lt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8c]
- cmp.lt.d $f2,$f3,$f4 # CHECK: cmp.lt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8c]
- cmp.nge.s $f2,$f3,$f4 # CHECK: cmp.nge.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8d]
- cmp.nge.d $f2,$f3,$f4 # CHECK: cmp.nge.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8d]
- cmp.le.s $f2,$f3,$f4 # CHECK: cmp.le.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8e]
- cmp.le.d $f2,$f3,$f4 # CHECK: cmp.le.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8e]
- cmp.ngt.s $f2,$f3,$f4 # CHECK: cmp.ngt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
- cmp.ngt.d $f2,$f3,$f4 # CHECK: cmp.ngt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
+ cmp.sueq.s $f2,$f3,$f4 # CHECK: cmp.sueq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8b]
+ cmp.sueq.d $f2,$f3,$f4 # CHECK: cmp.sueq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8b]
+ cmp.slt.s $f2,$f3,$f4 # CHECK: cmp.slt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8c]
+ cmp.slt.d $f2,$f3,$f4 # CHECK: cmp.slt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8c]
+ cmp.sult.s $f2,$f3,$f4 # CHECK: cmp.sult.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8d]
+ cmp.sult.d $f2,$f3,$f4 # CHECK: cmp.sult.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8d]
+ cmp.sle.s $f2,$f3,$f4 # CHECK: cmp.sle.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8e]
+ cmp.sle.d $f2,$f3,$f4 # CHECK: cmp.sle.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8e]
+ cmp.sule.s $f2,$f3,$f4 # CHECK: cmp.sule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
+ cmp.sule.d $f2,$f3,$f4 # CHECK: cmp.sule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
div $2,$3,$4 # CHECK: div $2, $3, $4 # encoding: [0x00,0x64,0x10,0x9a]
divu $2,$3,$4 # CHECK: divu $2, $3, $4 # encoding: [0x00,0x64,0x10,0x9b]
jialc $5, 256 # CHECK: jialc $5, 256 # encoding: [0xf8,0x05,0x01,0x00]
jic $5, 256 # CHECK: jic $5, 256 # encoding: [0xd8,0x05,0x01,0x00]
+ lsa $2, $3, $4, 3 # CHECK: lsa $2, $3, $4, 3 # encoding: [0x00,0x64,0x10,0xc5]
lwpc $2,268 # CHECK: lwpc $2, 268 # encoding: [0xec,0x48,0x00,0x43]
lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43]
mod $2,$3,$4 # CHECK: mod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xda]
modu $2,$3,$4 # CHECK: modu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdb]
-# mul $2,$3,$4 # CHECK-TODO: mul $2, $3, $4 # encoding: [0x00,0x64,0x10,0x98]
+ mul $2,$3,$4 # CHECK: mul $2, $3, $4 # encoding: [0x00,0x64,0x10,0x98]
muh $2,$3,$4 # CHECK: muh $2, $3, $4 # encoding: [0x00,0x64,0x10,0xd8]
mulu $2,$3,$4 # CHECK: mulu $2, $3, $4 # encoding: [0x00,0x64,0x10,0x99]
muhu $2,$3,$4 # CHECK: muhu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xd9]
@@ -104,6 +113,7 @@
maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98]
msubf.s $f2,$f3,$f4 # CHECK: msubf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x99]
msubf.d $f2,$f3,$f4 # CHECK: msubf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x99]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0x7c,0xa1,0x04,0x35]
sel.d $f0,$f1,$f2 # CHECK: sel.d $f0, $f1, $f2 # encoding: [0x46,0x22,0x08,0x10]
sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35]
@@ -116,6 +126,7 @@
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14]
seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14]
selnez.s $f0, $f2, $f4 # CHECK: selnez.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x17]
@@ -124,3 +135,20 @@
rint.d $f2, $f4 # CHECK: rint.d $f2, $f4 # encoding: [0x46,0x20,0x20,0x9a]
class.s $f2, $f4 # CHECK: class.s $f2, $f4 # encoding: [0x46,0x00,0x20,0x9b]
class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x46,0x20,0x20,0x9b]
+ jr.hb $4 # CHECK: jr.hb $4 # encoding: [0x00,0x80,0x04,0x09]
+ jalr.hb $4 # CHECK: jalr.hb $4 # encoding: [0x00,0x80,0xfc,0x09]
+ jalr.hb $4, $5 # CHECK: jalr.hb $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+ ldc2 $8, -701($at) # CHECK: ldc2 $8, -701($1) # encoding: [0x49,0xc8,0x0d,0x43]
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0x49,0x52,0x34,0xb7]
+ sdc2 $20,629($s2) # CHECK: sdc2 $20, 629($18) # encoding: [0x49,0xf4,0x92,0x75]
+ swc2 $25,304($s0) # CHECK: swc2 $25, 304($16) # encoding: [0x49,0x79,0x81,0x30]
+ ll $v0,-153($s2) # CHECK: ll $2, -153($18) # encoding: [0x7e,0x42,0xb3,0xb6]
+ sc $15,-40($s3) # CHECK: sc $15, -40($19) # encoding: [0x7e,0x6f,0xec,0x26]
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x00,0xa0,0x58,0x51]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x03,0x80,0xe8,0x50]
+ ssnop # WARNING: [[@LINE]]:9: warning: ssnop is deprecated for MIPS32r6 and is equivalent to a nop instruction
+ ssnop # CHECK: ssnop # encoding: [0x00,0x00,0x00,0x40]
+ sdbbp # CHECK: sdbbp # encoding: [0x00,0x00,0x00,0x0e]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x00,0x00,0x08,0x8e]
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
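[Note: the cmp.cond block in this file's hunk is a pure mnemonic rename. Every removed/added pair keeps a byte-identical encoding, so only the condition names change: R6 drops the pre-R6 ordered/unordered names in favor of af/lt/le and a signaling s-prefixed set. The mapping, compressed from the pairs above (function byte in parentheses):

    # cmp.f   -> cmp.af  (0x80)    cmp.sf   -> cmp.saf  (0x88)
    # cmp.olt -> cmp.lt  (0x84)    cmp.ngle -> cmp.sun  (0x89)
    # cmp.ole -> cmp.le  (0x86)    cmp.ngl  -> cmp.sueq (0x8b)
    # cmp.lt  -> cmp.slt (0x8c)    cmp.nge  -> cmp.sult (0x8d)
    # cmp.le  -> cmp.sle (0x8e)    cmp.ngt  -> cmp.sule (0x8f)
]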
diff --git a/test/MC/Mips/mips4/invalid-mips32.s b/test/MC/Mips/mips4/invalid-mips32.s
new file mode 100644
index 0000000..52dea02
--- /dev/null
+++ b/test/MC/Mips/mips4/invalid-mips32.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips4 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+
+ sync 0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips4/valid.s b/test/MC/Mips/mips4/valid.s
index 811584e..949b91d 100644
--- a/test/MC/Mips/mips4/valid.s
+++ b/test/MC/Mips/mips4/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -32,7 +45,11 @@
cvt.w.d $f20,$f14
cvt.w.s $f20,$f24
dadd $s3,$at,$ra
+ dadd $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ dadd $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddi $sp,$s4,-27705
+ daddi $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ daddi $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddiu $k0,$s6,-4586
daddu $s3,$at,$ra
ddiv $zero,$k0,$s3
@@ -64,8 +81,10 @@
dsrl32 $s3,$6,23 # CHECK: dsrl32 $19, $6, 23 # encoding: [0x00,0x06,0x9d,0xfe]
dsrlv $s3,$6,$s4 # CHECK: dsrlv $19, $6, $20 # encoding: [0x02,0x86,0x98,0x16]
dsub $a3,$s6,$8
- dsubu $a1,$a1,$k0
- dsub $a3,$s6,$8
+ dsub $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsub $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubi $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsubi $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
dsubu $a1,$a1,$k0
ehb # CHECK: ehb # encoding: [0x00,0x00,0x00,0xc0]
eret
@@ -77,7 +96,7 @@
lbu $8,30195($v1)
ld $sp,-28645($s1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldl $24,-4167($24)
ldr $14,-30358($s4)
ldxc1 $f8,$s7($15)
@@ -85,11 +104,11 @@
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
- lld $zero,-14736($ra)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
+ lld $zero,-14736($ra) # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwu $s3,-24086($v1)
@@ -133,16 +152,18 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
round.l.d $f12,$f1
round.l.s $f25,$f5
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
- scd $15,-8243($sp)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ scd $15,-8243($sp) # CHECK: scd $15, -8243($sp) # encoding: [0xf3,0xaf,0xdf,0xcd]
sd $12,5835($10)
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdl $a3,-20961($s8)
sdr $11,-20423($12)
sdxc1 $f11,$10($14)
@@ -173,10 +194,11 @@
subu $sp,$s6,$s6
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
swxc1 $f19,$12($k0)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips5/invalid-mips32.s b/test/MC/Mips/mips5/invalid-mips32.s
new file mode 100644
index 0000000..2e2c8da
--- /dev/null
+++ b/test/MC/Mips/mips5/invalid-mips32.s
@@ -0,0 +1,10 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips5 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+
+ sync 0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sync 1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips5/invalid-mips64.s b/test/MC/Mips/mips5/invalid-mips64.s
index 19d64dc..0a15da8 100644
--- a/test/MC/Mips/mips5/invalid-mips64.s
+++ b/test/MC/Mips/mips5/invalid-mips64.s
@@ -10,6 +10,9 @@
dclo $s2,$a2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
dclz $s0,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
deret # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jr.hb $4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jalr.hb $4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jalr.hb $4, $5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
madd $s6,$13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
madd $zero,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
maddu $s3,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips5/valid.s b/test/MC/Mips/mips5/valid.s
index 19aad05..3afdee1 100644
--- a/test/MC/Mips/mips5/valid.s
+++ b/test/MC/Mips/mips5/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -32,7 +45,11 @@
cvt.w.d $f20,$f14
cvt.w.s $f20,$f24
dadd $s3,$at,$ra
+ dadd $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ dadd $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddi $sp,$s4,-27705
+ daddi $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ daddi $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddiu $k0,$s6,-4586
daddu $s3,$at,$ra
ddiv $zero,$k0,$s3
@@ -64,8 +81,10 @@
dsrl32 $s3,$6,23 # CHECK: dsrl32 $19, $6, 23 # encoding: [0x00,0x06,0x9d,0xfe]
dsrlv $s3,$6,$s4 # CHECK: dsrlv $19, $6, $20 # encoding: [0x02,0x86,0x98,0x16]
dsub $a3,$s6,$8
- dsubu $a1,$a1,$k0
- dsub $a3,$s6,$8
+ dsub $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsub $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubi $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsubi $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
dsubu $a1,$a1,$k0
ehb # CHECK: ehb # encoding: [0x00,0x00,0x00,0xc0]
eret
@@ -77,7 +96,7 @@
lbu $8,30195($v1)
ld $sp,-28645($s1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldl $24,-4167($24)
ldr $14,-30358($s4)
ldxc1 $f8,$s7($15)
@@ -85,12 +104,12 @@
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
- lld $zero,-14736($ra)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
+ lld $zero,-14736($ra) # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
luxc1 $f19,$s6($s5)
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwu $s3,-24086($v1)
@@ -134,16 +153,18 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
round.l.d $f12,$f1
round.l.s $f25,$f5
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
- scd $15,-8243($sp)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ scd $15,-8243($sp) # CHECK: scd $15, -8243($sp) # encoding: [0xf3,0xaf,0xdf,0xcd]
sd $12,5835($10)
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdl $a3,-20961($s8)
sdr $11,-20423($12)
sdxc1 $f11,$10($14)
@@ -175,10 +196,11 @@
suxc1 $f12,$k1($13)
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
swxc1 $f19,$12($k0)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips64-expansions.s b/test/MC/Mips/mips64-expansions.s
new file mode 100644
index 0000000..0efdd2f
--- /dev/null
+++ b/test/MC/Mips/mips64-expansions.s
@@ -0,0 +1,209 @@
+# RUN: llvm-mc %s -triple=mips64el-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
+#
+# The GNU assembler implements 'dli' and 'dla' variants of 'li' and 'la'
+# supporting double-word lengths. Test that not only are they present, but
+# that they also seem to handle 64-bit values.
+#
+# XXXRW: Does using powers of ten make me a bad person?
+#
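+# As a worked example of the expansion checked below (a reading of the CHECK
+# lines, not a separate algorithm): dli $t0, 10000000000 loads
+# 0x00000002540be400 piecewise into $12 ($t0 in the N64 naming):
+#   lui  $12, 2          # $12 = 0x0000000000020000
+#   ori  $12, $12, 21515 # 21515 = 0x540b -> $12 = 0x000000000002540b
+#   dsll $12, $12, 16    # $12 = 0x00000002540b0000
+#   ori  $12, $12, 58368 # 58368 = 0xe400 -> $12 = 0x00000002540be400
+# Values needing the full 64 bits get a second dsll/ori pair, and small
+# negative values collapse to a single sign-extended addiu.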
+# CHECK: ori $12, $zero, 1 # encoding: [0x01,0x00,0x0c,0x34]
+# CHECK: ori $12, $zero, 10 # encoding: [0x0a,0x00,0x0c,0x34]
+# CHECK: ori $12, $zero, 100 # encoding: [0x64,0x00,0x0c,0x34]
+# CHECK: ori $12, $zero, 1000 # encoding: [0xe8,0x03,0x0c,0x34]
+# CHECK: ori $12, $zero, 10000 # encoding: [0x10,0x27,0x0c,0x34]
+# CHECK: lui $12, 1 # encoding: [0x01,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 34464 # encoding: [0xa0,0x86,0x8c,0x35]
+# CHECK: lui $12, 15 # encoding: [0x0f,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 16960 # encoding: [0x40,0x42,0x8c,0x35]
+# CHECK: lui $12, 152 # encoding: [0x98,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 38528 # encoding: [0x80,0x96,0x8c,0x35]
+# CHECK: lui $12, 1525 # encoding: [0xf5,0x05,0x0c,0x3c]
+# CHECK: ori $12, $12, 57600 # encoding: [0x00,0xe1,0x8c,0x35]
+# CHECK: lui $12, 15258 # encoding: [0x9a,0x3b,0x0c,0x3c]
+# CHECK: ori $12, $12, 51712 # encoding: [0x00,0xca,0x8c,0x35]
+# CHECK: lui $12, 2 # encoding: [0x02,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 21515 # encoding: [0x0b,0x54,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 58368 # encoding: [0x00,0xe4,0x8c,0x35]
+# CHECK: lui $12, 23 # encoding: [0x17,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 18550 # encoding: [0x76,0x48,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 59392 # encoding: [0x00,0xe8,0x8c,0x35]
+# CHECK: lui $12, 232 # encoding: [0xe8,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 54437 # encoding: [0xa5,0xd4,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 4096 # encoding: [0x00,0x10,0x8c,0x35]
+# CHECK: lui $12, 2328 # encoding: [0x18,0x09,0x0c,0x3c]
+# CHECK: ori $12, $12, 20082 # encoding: [0x72,0x4e,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 40960 # encoding: [0x00,0xa0,0x8c,0x35]
+# CHECK: lui $12, 23283 # encoding: [0xf3,0x5a,0x0c,0x3c]
+# CHECK: ori $12, $12, 4218 # encoding: [0x7a,0x10,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 16384 # encoding: [0x00,0x40,0x8c,0x35]
+# CHECK: lui $12, 3 # encoding: [0x03,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 36222 # encoding: [0x7e,0x8d,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 42182 # encoding: [0xc6,0xa4,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 32768 # encoding: [0x00,0x80,0x8c,0x35]
+# CHECK: lui $12, 35 # encoding: [0x23,0x00,0x0c,0x3c]
+# CHECK: ori $12, $12, 34546 # encoding: [0xf2,0x86,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 28609 # encoding: [0xc1,0x6f,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 355 # encoding: [0x63,0x01,0x0c,0x3c]
+# CHECK: ori $12, $12, 17784 # encoding: [0x78,0x45,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 23946 # encoding: [0x8a,0x5d,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 3552 # encoding: [0xe0,0x0d,0x0c,0x3c]
+# CHECK: ori $12, $12, 46771 # encoding: [0xb3,0xb6,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 42852 # encoding: [0x64,0xa7,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 35527 # encoding: [0xc7,0x8a,0x0c,0x3c]
+# CHECK: ori $12, $12, 8964 # encoding: [0x04,0x23,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 35304 # encoding: [0xe8,0x89,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: addiu $12, $zero, -1 # encoding: [0xff,0xff,0x0c,0x24]
+# CHECK: addiu $12, $zero, -10 # encoding: [0xf6,0xff,0x0c,0x24]
+# CHECK: addiu $12, $zero, -100 # encoding: [0x9c,0xff,0x0c,0x24]
+# CHECK: addiu $12, $zero, -1000 # encoding: [0x18,0xfc,0x0c,0x24]
+# CHECK: addiu $12, $zero, -10000 # encoding: [0xf0,0xd8,0x0c,0x24]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65535 # encoding: [0xff,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 65534 # encoding: [0xfe,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 31072 # encoding: [0x60,0x79,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65535 # encoding: [0xff,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 65520 # encoding: [0xf0,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 48576 # encoding: [0xc0,0xbd,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65535 # encoding: [0xff,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 65383 # encoding: [0x67,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 27008 # encoding: [0x80,0x69,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65535 # encoding: [0xff,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 64010 # encoding: [0x0a,0xfa,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 7936 # encoding: [0x00,0x1f,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65535 # encoding: [0xff,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 50277 # encoding: [0x65,0xc4,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 13824 # encoding: [0x00,0x36,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65533 # encoding: [0xfd,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 44020 # encoding: [0xf4,0xab,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 7168 # encoding: [0x00,0x1c,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65512 # encoding: [0xe8,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 46985 # encoding: [0x89,0xb7,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 6144 # encoding: [0x00,0x18,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 65303 # encoding: [0x17,0xff,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 11098 # encoding: [0x5a,0x2b,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 61440 # encoding: [0x00,0xf0,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 63207 # encoding: [0xe7,0xf6,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 45453 # encoding: [0x8d,0xb1,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 24576 # encoding: [0x00,0x60,0x8c,0x35]
+# CHECK: lui $12, 65535 # encoding: [0xff,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 42252 # encoding: [0x0c,0xa5,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 61317 # encoding: [0x85,0xef,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 49152 # encoding: [0x00,0xc0,0x8c,0x35]
+# CHECK: lui $12, 65532 # encoding: [0xfc,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 29313 # encoding: [0x81,0x72,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 23353 # encoding: [0x39,0x5b,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 32768 # encoding: [0x00,0x80,0x8c,0x35]
+# CHECK: lui $12, 65500 # encoding: [0xdc,0xff,0x0c,0x3c]
+# CHECK: ori $12, $12, 30989 # encoding: [0x0d,0x79,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 36927 # encoding: [0x3f,0x90,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 65180 # encoding: [0x9c,0xfe,0x0c,0x3c]
+# CHECK: ori $12, $12, 47751 # encoding: [0x87,0xba,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 41590 # encoding: [0x76,0xa2,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 61983 # encoding: [0x1f,0xf2,0x0c,0x3c]
+# CHECK: ori $12, $12, 18764 # encoding: [0x4c,0x49,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 22684 # encoding: [0x9c,0x58,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+# CHECK: lui $12, 30008 # encoding: [0x38,0x75,0x0c,0x3c]
+# CHECK: ori $12, $12, 56571 # encoding: [0xfb,0xdc,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 30232 # encoding: [0x18,0x76,0x8c,0x35]
+# CHECK: dsll $12, $12, 16 # encoding: [0x38,0x64,0x0c,0x00]
+# CHECK: ori $12, $12, 0 # encoding: [0x00,0x00,0x8c,0x35]
+
+ dli $t0, 1
+ dli $t0, 10
+ dli $t0, 100
+ dli $t0, 1000
+ dli $t0, 10000
+ dli $t0, 100000
+ dli $t0, 1000000
+ dli $t0, 10000000
+ dli $t0, 100000000
+ dli $t0, 1000000000
+ dli $t0, 10000000000
+ dli $t0, 100000000000
+ dli $t0, 1000000000000
+ dli $t0, 10000000000000
+ dli $t0, 100000000000000
+ dli $t0, 1000000000000000
+ dli $t0, 10000000000000000
+ dli $t0, 100000000000000000
+ dli $t0, 1000000000000000000
+ dli $t0, 10000000000000000000
+ dli $t0, -1
+ dli $t0, -10
+ dli $t0, -100
+ dli $t0, -1000
+ dli $t0, -10000
+ dli $t0, -100000
+ dli $t0, -1000000
+ dli $t0, -10000000
+ dli $t0, -100000000
+ dli $t0, -1000000000
+ dli $t0, -10000000000
+ dli $t0, -100000000000
+ dli $t0, -1000000000000
+ dli $t0, -10000000000000
+ dli $t0, -100000000000000
+ dli $t0, -1000000000000000
+ dli $t0, -10000000000000000
+ dli $t0, -100000000000000000
+ dli $t0, -1000000000000000000
+ dli $t0, -10000000000000000000
diff --git a/test/MC/Mips/mips64/abiflags.s b/test/MC/Mips/mips64/abiflags.s
new file mode 100644
index 0000000..557e32a
--- /dev/null
+++ b/test/MC/Mips/mips64/abiflags.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=64
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00004001 02020001 00000000 00000000 |..@.............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
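+# A sketch of how the 24 bytes above decode, assuming the usual ABI flags
+# structure (version, isa_level, isa_rev, gpr_size, cpr1_size, cpr2_size,
+# fp_abi, then isa_ext, ases, flags1, flags2 as 32-bit words):
+#   version=0, isa_level=0x40 (64), isa_rev=1, gpr/cpr1 size=2 (64-bit),
+#   cpr2 size=0, fp_abi=1, and the remaining words all zero.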
+
+ .module fp=64
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips64/valid.s b/test/MC/Mips/mips64/valid.s
index b9e1002..1bd057d 100644
--- a/test/MC/Mips/mips64/valid.s
+++ b/test/MC/Mips/mips64/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -20,8 +33,8 @@
ceil.w.d $f11,$f25
ceil.w.s $f6,$f20
cfc1 $s1,$21
- clo $11,$a1
- clz $sp,$gp
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x70,0xab,0x58,0x21]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x73,0x9d,0xe8,0x20]
ctc1 $a2,$26
cvt.d.l $f4,$f16
cvt.d.s $f22,$f28
@@ -34,11 +47,15 @@
cvt.w.d $f20,$f14
cvt.w.s $f20,$f24
dadd $s3,$at,$ra
+ dadd $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ dadd $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddi $sp,$s4,-27705
+ daddi $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ daddi $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddiu $k0,$s6,-4586
daddu $s3,$at,$ra
- dclo $s2,$a2
- dclz $s0,$25
+ dclo $s2,$a2 # CHECK: dclo $18, $6 # encoding: [0x70,0xd2,0x90,0x25]
+ dclz $s0,$25 # CHECK: dclz $16, $25 # encoding: [0x73,0x30,0x80,0x24]
deret
ddiv $zero,$k0,$s3
ddivu $zero,$s0,$s1
@@ -69,8 +86,10 @@
dsrl32 $s3,$6,23 # CHECK: dsrl32 $19, $6, 23 # encoding: [0x00,0x06,0x9d,0xfe]
dsrlv $s3,$6,$s4 # CHECK: dsrlv $19, $6, $20 # encoding: [0x02,0x86,0x98,0x16]
dsub $a3,$s6,$8
- dsubu $a1,$a1,$k0
- dsub $a3,$s6,$8
+ dsub $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsub $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubi $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsubi $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
dsubu $a1,$a1,$k0
ehb # CHECK: ehb # encoding: [0x00,0x00,0x00,0xc0]
eret
@@ -82,7 +101,7 @@
lbu $8,30195($v1)
ld $sp,-28645($s1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldl $24,-4167($24)
ldr $14,-30358($s4)
ldxc1 $f8,$s7($15)
@@ -90,12 +109,12 @@
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
- lld $zero,-14736($ra)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
+ lld $zero,-14736($ra) # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
luxc1 $f19,$s6($s5)
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwu $s3,-24086($v1)
@@ -148,16 +167,20 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
round.l.d $f12,$f1
round.l.s $f25,$f5
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
- scd $15,-8243($sp)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ scd $15,-8243($sp) # CHECK: scd $15, -8243($sp) # encoding: [0xf3,0xaf,0xdf,0xcd]
+ sdbbp # CHECK: sdbbp # encoding: [0x70,0x00,0x00,0x3f]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x70,0x00,0x08,0xbf]
sd $12,5835($10)
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdl $a3,-20961($s8)
sdr $11,-20423($12)
sdxc1 $f11,$10($14)
@@ -189,10 +212,12 @@
suxc1 $f12,$k1($13)
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
swxc1 $f19,$12($k0)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips64r2/abi-bad.s b/test/MC/Mips/mips64r2/abi-bad.s
new file mode 100644
index 0000000..31d13ab
--- /dev/null
+++ b/test/MC/Mips/mips64r2/abi-bad.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc %s -triple mips-unknown-unknown -mcpu=mips64r2 2>&1 | FileCheck %s
+# CHECK: .text
+
+
+
+ .set fp=xx
+# CHECK: error: 'set fp=xx' option requires O32 ABI
+# CHECK: .set fp=xx
+# CHECK: ^
diff --git a/test/MC/Mips/mips64r2/abiflags.s b/test/MC/Mips/mips64r2/abiflags.s
new file mode 100644
index 0000000..aa76dee
--- /dev/null
+++ b/test/MC/Mips/mips64r2/abiflags.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r2 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips64r2 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=64
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00004002 02020001 00000000 00000000 |..@.............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
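+# The data decodes as in the mips64 abiflags test, except that byte 3
+# (isa_rev) is 2 here: 00004002 rather than 00004001.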
+
+ .module fp=64
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/mips64r2/invalid.s b/test/MC/Mips/mips64r2/invalid.s
new file mode 100644
index 0000000..f53cfff
--- /dev/null
+++ b/test/MC/Mips/mips64r2/invalid.s
@@ -0,0 +1,10 @@
+# Instructions that are valid for the current ISA but should be rejected by the assembler (e.g.
+# invalid set of operands or operand restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -mcpu=mips64r2 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+ .text
+ .set noreorder
+ jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
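+# (The one-register form jalr.hb $31 implicitly writes $ra, i.e. $31, so in
+# both cases rd == rs, which is what the assembler rejects here.)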
diff --git a/test/MC/Mips/mips64r2/valid.s b/test/MC/Mips/mips64r2/valid.s
index 252589d..7a2244a 100644
--- a/test/MC/Mips/mips64r2/valid.s
+++ b/test/MC/Mips/mips64r2/valid.s
@@ -9,8 +9,21 @@
add.d $f1,$f7,$f29
add.s $f8,$f21,$f24
addi $13,$9,26322
+ addi $8,$8,~1 # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
addu $9,$a0,$a2
and $s7,$v0,$12
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
+ bc1f $fcc0, 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1f $fcc1, 4 # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
+ bc1f 4 # CHECK: bc1f 4 # encoding: [0x45,0x00,0x00,0x01]
+ bc1t $fcc0, 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bc1t $fcc1, 4 # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
+ bc1t 4 # CHECK: bc1t 4 # encoding: [0x45,0x01,0x00,0x01]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $0, 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
+ bgezal $6, 21100 # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
+ bltzal $6, 21100 # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0xbc,0xa1,0x00,0x08]
c.ngl.d $f29,$f29
c.ngle.d $f0,$f16
c.sf.d $f30,$f0
@@ -20,8 +33,8 @@
ceil.w.d $f11,$f25
ceil.w.s $f6,$f20
cfc1 $s1,$21
- clo $11,$a1
- clz $sp,$gp
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x70,0xab,0x58,0x21]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x73,0x9d,0xe8,0x20]
ctc1 $a2,$26
cvt.d.l $f4,$f16
cvt.d.s $f22,$f28
@@ -34,11 +47,15 @@
cvt.w.d $f20,$f14
cvt.w.s $f20,$f24
dadd $s3,$at,$ra
+ dadd $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ dadd $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddi $sp,$s4,-27705
+ daddi $sp,$s4,-27705 # CHECK: daddi $sp, $20, -27705 # encoding: [0x62,0x9d,0x93,0xc7]
+ daddi $sp,-27705 # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
daddiu $k0,$s6,-4586
daddu $s3,$at,$ra
- dclo $s2,$a2
- dclz $s0,$25
+ dclo $s2,$a2 # CHECK: dclo $18, $6 # encoding: [0x70,0xd2,0x90,0x25]
+ dclz $s0,$25 # CHECK: dclz $16, $25 # encoding: [0x73,0x30,0x80,0x24]
deret
di $s8
ddiv $zero,$k0,$s3
@@ -77,8 +94,12 @@
dsrl32 $s3,$6,23 # CHECK: dsrl32 $19, $6, 23 # encoding: [0x00,0x06,0x9d,0xfe]
dsrlv $s3,$6,$s4 # CHECK: dsrlv $19, $6, $20 # encoding: [0x02,0x86,0x98,0x16]
dsub $a3,$s6,$8
- dsubu $a1,$a1,$k0
dsub $a3,$s6,$8
+ dsub $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsub $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubi $sp,$s4,-27705 # CHECK: daddi $sp, $20, 27705 # encoding: [0x62,0x9d,0x6c,0x39]
+ dsubi $sp,-27705 # CHECK: daddi $sp, $sp, 27705 # encoding: [0x63,0xbd,0x6c,0x39]
+ dsubu $a1,$a1,$k0
dsubu $a1,$a1,$k0
ehb # CHECK: ehb # encoding: [0x00,0x00,0x00,0xc0]
ei $14
@@ -87,11 +108,14 @@
floor.l.s $f12,$f5
floor.w.d $f14,$f11
floor.w.s $f8,$f9
+ jr.hb $4 # CHECK: jr.hb $4 # encoding: [0x00,0x80,0x04,0x08]
+ jalr.hb $4 # CHECK: jalr.hb $4 # encoding: [0x00,0x80,0xfc,0x09]
+ jalr.hb $4, $5 # CHECK: jalr.hb $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
lb $24,-14515($10)
lbu $8,30195($v1)
ld $sp,-28645($s1)
ldc1 $f11,16391($s0)
- ldc2 $8,-21181($at)
+ ldc2 $8,-21181($at) # CHECK: ldc2 $8, -21181($1) # encoding: [0xd8,0x28,0xad,0x43]
ldl $24,-4167($24)
ldr $14,-30358($s4)
ldxc1 $f8,$s7($15)
@@ -99,12 +123,12 @@
lhu $s3,-22851($v0)
li $at,-29773
li $zero,-29889
- ll $v0,-7321($s2)
- lld $zero,-14736($ra)
+ ll $v0,-7321($s2) # CHECK: ll $2, -7321($18) # encoding: [0xc2,0x42,0xe3,0x67]
+ lld $zero,-14736($ra) # CHECK: lld $zero, -14736($ra) # encoding: [0xd3,0xe0,0xc6,0x70]
luxc1 $f19,$s6($s5)
lw $8,5674($a1)
lwc1 $f16,10225($k0)
- lwc2 $18,-841($a2)
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0xc8,0xd2,0xfc,0xb7]
lwl $s4,-4231($15)
lwr $zero,-19147($gp)
lwu $s3,-24086($v1)
@@ -163,7 +187,9 @@
nop
nor $a3,$zero,$a3
or $12,$s0,$sp
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
pause # CHECK: pause # encoding: [0x00,0x00,0x01,0x40]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0xcc,0xa1,0x00,0x08]
rdhwr $sp,$11
rotr $1,15 # CHECK: rotr $1, $1, 15 # encoding: [0x00,0x21,0x0b,0xc2]
rotr $1,$14,15 # CHECK: rotr $1, $14, 15 # encoding: [0x00,0x2e,0x0b,0xc2]
@@ -173,11 +199,13 @@
round.w.d $f6,$f4
round.w.s $f27,$f28
sb $s6,-19857($14)
- sc $15,18904($s3)
- scd $15,-8243($sp)
+ sc $15,18904($s3) # CHECK: sc $15, 18904($19) # encoding: [0xe2,0x6f,0x49,0xd8]
+ scd $15,-8243($sp) # CHECK: scd $15, -8243($sp) # encoding: [0xf3,0xaf,0xdf,0xcd]
+ sdbbp # CHECK: sdbbp # encoding: [0x70,0x00,0x00,0x3f]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x70,0x00,0x08,0xbf]
sd $12,5835($10)
sdc1 $f31,30574($13)
- sdc2 $20,23157($s2)
+ sdc2 $20,23157($s2) # CHECK: sdc2 $20, 23157($18) # encoding: [0xfa,0x54,0x5a,0x75]
sdl $a3,-20961($s8)
sdr $11,-20423($12)
sdxc1 $f11,$10($14)
@@ -211,10 +239,12 @@
suxc1 $f12,$k1($13)
sw $ra,-10160($sp)
swc1 $f6,-8465($24)
- swc2 $25,24880($s0)
+ swc2 $25,24880($s0) # CHECK: swc2 $25, 24880($16) # encoding: [0xea,0x19,0x61,0x30]
swl $15,13694($s3)
swr $s1,-26590($14)
swxc1 $f19,$12($k0)
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
teqi $s5,-17504
tgei $s1,5025
tgeiu $sp,-28621
diff --git a/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s
index f7949bb..e914c89 100644
--- a/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s
@@ -5,6 +5,8 @@
# RUN: FileCheck %s < %t1
.set noat
+ bc2f 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
lwl $s4,-4231($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
lwr $zero,-19147($gp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
swl $15,13694($s3) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/test/MC/Mips/mips64r6/invalid-mips1.s b/test/MC/Mips/mips64r6/invalid-mips1.s
index 1225005..6efd8f4 100644
--- a/test/MC/Mips/mips64r6/invalid-mips1.s
+++ b/test/MC/Mips/mips64r6/invalid-mips1.s
@@ -6,3 +6,22 @@
.set noat
addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.ngl.d $f29,$f29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.ngle.d $f0,$f16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.sf.d $f30,$f0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ c.sf.s $f14,$f22 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips64r6/invalid-mips2.s b/test/MC/Mips/mips64r6/invalid-mips2.s
index 0638e78..8a5c50c 100644
--- a/test/MC/Mips/mips64r6/invalid-mips2.s
+++ b/test/MC/Mips/mips64r6/invalid-mips2.s
@@ -6,9 +6,24 @@
.set noat
addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
teqi $s5,-17504 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgei $s1,5025 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgeiu $sp,-28621 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tlti $14,-21059 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tltiu $ra,-5076 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tnei $12,-29647 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips64r6/invalid-mips3.s b/test/MC/Mips/mips64r6/invalid-mips3.s
index 0638e78..322dabd 100644
--- a/test/MC/Mips/mips64r6/invalid-mips3.s
+++ b/test/MC/Mips/mips64r6/invalid-mips3.s
@@ -6,9 +6,28 @@
.set noat
addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dmult $s7,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dmultu $a1,$a2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
teqi $s5,-17504 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgei $s1,5025 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tgeiu $sp,-28621 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tlti $14,-21059 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tltiu $ra,-5076 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
tnei $12,-29647 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# ddiv has been re-encoded. See valid.s
+# ddivu has been re-encoded. See valid.s
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s
new file mode 100644
index 0000000..cc85f18
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s
@@ -0,0 +1,20 @@
+# Instructions that are invalid and are correctly rejected but use the wrong
+# error message at the moment.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ bc1fl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1tl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2f $fcc0,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2f 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2fl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t $fcc0,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2t 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2tl $fcc1,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s
new file mode 100644
index 0000000..f3131a9
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s
@@ -0,0 +1,21 @@
+# Instructions that are invalid and are correctly rejected but use the wrong
+# error message at the moment.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ beql $1,$2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgezall $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgezl $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bgtzl $4,16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ blezl $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bltzall $3,8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bltzl $4,16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bnel $1,$2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2tl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc2fl 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ prefx 0,$2($31) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips64r6/invalid-mips4.s b/test/MC/Mips/mips64r6/invalid-mips4.s
new file mode 100644
index 0000000..706db27
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid-mips4.s
@@ -0,0 +1,14 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ ldxc1 $f8,$s7($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ lwxc1 $f12,$s1($s8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ sdxc1 $f11,$10($14) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ swxc1 $f19,$12($k0) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s
index 6b980e6..4fc94e2 100644
--- a/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s
@@ -1,4 +1,4 @@
-# Instructions that are invalid
+# Instructions that are invalid but currently emit the wrong error message.
#
# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 \
# RUN: 2>%t1
@@ -8,6 +8,10 @@
abs.ps $f22,$f8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
add.ps $f25,$f27,$f13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
alnv.ps $f12,$f18,$f30,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any2f $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any2t $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any4f $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+ bc1any4t $fcc2,4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
c.eq.ps $fcc5,$f0,$f9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
c.f.ps $fcc6,$f11,$f11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
c.le.ps $fcc1,$f7,$f20 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
diff --git a/test/MC/Mips/mips64r6/invalid-mips5.s b/test/MC/Mips/mips64r6/invalid-mips5.s
new file mode 100644
index 0000000..e7fd99a
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid-mips5.s
@@ -0,0 +1,12 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ luxc1 $f19,$s6($s5) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ suxc1 $f12,$k1($13) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips64r6/invalid-mips64.s b/test/MC/Mips/mips64r6/invalid-mips64.s
new file mode 100644
index 0000000..51e5708
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid-mips64.s
@@ -0,0 +1,54 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r6 \
+# RUN: 2>%t1
+# RUN: FileCheck %s < %t1
+
+ .set noat
+ addi $13,$9,26322 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $0, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bgezal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ bltzal $6, 21100 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ daddi $sp,$s4,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ daddi $sp,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dadd $sp,$s4,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dadd $sp,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dmult $s7,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dmultu $a1,$a2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dsubi $sp,$s4,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dsubi $sp,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dsub $sp,$s4,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ dsub $sp,-27705 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ jalx 4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mfhi $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mflo $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf $gp,$8,$fcc7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.d $f6,$f11,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movf.s $f23,$f5,$fcc6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn $v1,$s1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn.d $f27,$f21,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movn.s $f12,$f0,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt $zero,$s4,$fcc5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.d $f0,$f2,$fcc0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movt.s $f30,$f2,$fcc1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz $a1,$s6,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz.d $f12,$f29,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ movz.s $f25,$f7,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mthi $s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mtlo $sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ mult $sp,$v0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $9,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ multu $gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ teqi $s5,-17504 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ tgei $s1,5025 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ tgeiu $sp,-28621 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ tlti $14,-21059 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ tltiu $ra,-5076 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+ tnei $12,-29647 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+# ddiv has been re-encoded. See valid.s
+# ddivu has been re-encoded. See valid.s
+# div has been re-encoded. See valid.s
+# divu has been re-encoded. See valid.s
diff --git a/test/MC/Mips/mips64r6/invalid.s b/test/MC/Mips/mips64r6/invalid.s
new file mode 100644
index 0000000..1b01827
--- /dev/null
+++ b/test/MC/Mips/mips64r6/invalid.s
@@ -0,0 +1,12 @@
+# Instructions that are available for the current ISA but should be rejected by
+# the assembler (e.g. invalid set of operands or operand restrictions not met).
+
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -mcpu=mips64r6 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+ .text
+ .set noreorder
+ .set noat
+ jalr.hb $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ jalr.hb $31, $31 # ASM: :[[@LINE]]:9: error: source and destination must be different
+ ldc2 $8,-21181($at) # ASM: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips64r6/relocations.s b/test/MC/Mips/mips64r6/relocations.s
index db84715..651ebfb 100644
--- a/test/MC/Mips/mips64r6/relocations.s
+++ b/test/MC/Mips/mips64r6/relocations.s
@@ -5,7 +5,10 @@
#------------------------------------------------------------------------------
# Check that the assembler can handle the documented syntax for fixups.
#------------------------------------------------------------------------------
-# CHECK-FIXUP: beqc $5, $6, bar # encoding: [0x20,0xa6,A,A]
+# CHECK-FIXUP: addiupc $2, bar # encoding: [0xec,0b01000AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
+# CHECK-FIXUP: beqc $5, $6, bar # encoding: [0x20,0xa6,A,A]
# CHECK-FIXUP: # fixup A - offset: 0,
# CHECK-FIXUP: value: bar, kind: fixup_Mips_PC16
# CHECK-FIXUP: bnec $5, $6, bar # encoding: [0x60,0xa6,A,A]
@@ -31,20 +34,35 @@
# CHECK-FIXUP: # fixup A - offset: 0,
# CHECK-FIXUP: value: bar@PCREL_LO16,
# CHECK-FIXUP: kind: fixup_MIPS_PCLO16
+# CHECK-FIXUP: ldpc $2, bar # encoding: [0xec,0b010110AA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar,
+# CHECK-FIXUP: kind: fixup_Mips_PC18_S3
+# CHECK-FIXUP: lwpc $2, bar # encoding: [0xec,0b01001AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
+# CHECK-FIXUP: lwupc $2, bar # encoding: [0xec,0b01010AAA,A,A]
+# CHECK-FIXUP: # fixup A - offset: 0,
+# CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2
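+# In the encodings above, 'A' marks bits to be filled in by the fixup. The
+# _Sn suffix on the fixup/relocation names gives the scaling of the
+# PC-relative offset: PC19_S2 is a 19-bit field shifted left by 2 (word
+# aligned) and PC18_S3 an 18-bit field shifted left by 3 (doubleword
+# aligned, for ldpc).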
#------------------------------------------------------------------------------
# Check that the appropriate relocations were created.
#------------------------------------------------------------------------------
# CHECK-ELF: Relocations [
-# CHECK-ELF: 0x0 R_MIPS_PC16 bar 0x0
+# CHECK-ELF: 0x0 R_MIPS_PC19_S2 bar 0x0
# CHECK-ELF: 0x4 R_MIPS_PC16 bar 0x0
-# CHECK-ELF: 0x8 R_MIPS_PC21_S2 bar 0x0
+# CHECK-ELF: 0x8 R_MIPS_PC16 bar 0x0
# CHECK-ELF: 0xC R_MIPS_PC21_S2 bar 0x0
-# CHECK-ELF: 0x10 R_MIPS_PC26_S2 bar 0x0
+# CHECK-ELF: 0x10 R_MIPS_PC21_S2 bar 0x0
# CHECK-ELF: 0x14 R_MIPS_PC26_S2 bar 0x0
-# CHECK-ELF: 0x18 R_MIPS_PCHI16 bar 0x0
-# CHECK-ELF: 0x1C R_MIPS_PCLO16 bar 0x0
+# CHECK-ELF: 0x18 R_MIPS_PC26_S2 bar 0x0
+# CHECK-ELF: 0x1C R_MIPS_PCHI16 bar 0x0
+# CHECK-ELF: 0x20 R_MIPS_PCLO16 bar 0x0
+# CHECK-ELF: 0x24 R_MIPS_PC18_S3 bar 0x0
+# CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar 0x0
+# CHECK-ELF: 0x2C R_MIPS_PC19_S2 bar 0x0
# CHECK-ELF: ]
+ addiupc $2,bar
beqc $5, $6, bar
bnec $5, $6, bar
beqzc $9, bar
@@ -53,3 +71,6 @@
bc bar
aluipc $2, %pcrel_hi(bar)
addiu $2, $2, %pcrel_lo(bar)
+ ldpc $2,bar
+ lwpc $2,bar
+ lwupc $2,bar
diff --git a/test/MC/Mips/mips64r6/valid.s b/test/MC/Mips/mips64r6/valid.s
index efdfc7f..34c1dac 100644
--- a/test/MC/Mips/mips64r6/valid.s
+++ b/test/MC/Mips/mips64r6/valid.s
@@ -10,15 +10,18 @@
# rs > rt
# appropriately for each branch instruction
#
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r6 | FileCheck %s
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r6 2> %t0 | FileCheck %s
+# RUN: FileCheck %s -check-prefix=WARNING < %t0
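+# (Warnings go to stderr, so they are captured in %t0 and checked with the
+# WARNING prefix separately from the encodings on stdout.)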
.set noat
# FIXME: Add the instructions carried forward from older ISA's
+ and $2,4 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
addiupc $4, 100 # CHECK: addiupc $4, 100 # encoding: [0xec,0x80,0x00,0x19]
align $4, $2, $3, 2 # CHECK: align $4, $2, $3, 2 # encoding: [0x7c,0x43,0x22,0xa0]
aluipc $3, 56 # CHECK: aluipc $3, 56 # encoding: [0xec,0x7f,0x00,0x38]
aui $3,$2,-23 # CHECK: aui $3, $2, -23 # encoding: [0x3c,0x62,0xff,0xe9]
auipc $3, -1 # CHECK: auipc $3, -1 # encoding: [0xec,0x7e,0xff,0xff]
+ bal 21100 # CHECK: bal 21100 # encoding: [0x04,0x11,0x14,0x9b]
balc 14572256 # CHECK: balc 14572256 # encoding: [0xe8,0x37,0x96,0xb8]
bc 14572256 # CHECK: bc 14572256 # encoding: [0xc8,0x37,0x96,0xb8]
bc1eqz $f0,4 # CHECK: bc1eqz $f0, 4 # encoding: [0x45,0x20,0x00,0x01]
@@ -38,6 +41,8 @@
bnec $5, $6, 256 # CHECK: bnec $5, $6, 256 # encoding: [0x60,0xa6,0x00,0x40]
bnezalc $2, 1332 # CHECK: bnezalc $2, 1332 # encoding: [0x60,0x02,0x01,0x4d]
beqzc $5, 72256 # CHECK: beqzc $5, 72256 # encoding: [0xd8,0xa0,0x46,0x90]
+ bgec $2, $3, 256 # CHECK: bgec $2, $3, 256 # encoding: [0x58,0x43,0x00,0x40]
+ bgeuc $2, $3, 256 # CHECK: bgeuc $2, $3, 256 # encoding: [0x18,0x43,0x00,0x40]
bgezalc $2, 1332 # CHECK: bgezalc $2, 1332 # encoding: [0x18,0x42,0x01,0x4d]
bnezc $5, 72256 # CHECK: bnezc $5, 72256 # encoding: [0xf8,0xa0,0x46,0x90]
bltzc $5, 256 # CHECK: bltzc $5, 256 # encoding: [0x5c,0xa5,0x00,0x40]
@@ -48,6 +53,8 @@
bgtzc $5, 256 # CHECK: bgtzc $5, 256 # encoding: [0x5c,0x05,0x00,0x40]
bitswap $4, $2 # CHECK: bitswap $4, $2 # encoding: [0x7c,0x02,0x20,0x20]
blezalc $2, 1332 # CHECK: blezalc $2, 1332 # encoding: [0x18,0x02,0x01,0x4d]
+ bltc $5, $6, 256 # CHECK: bltc $5, $6, 256 # encoding: [0x5c,0xa6,0x00,0x40]
+ bltuc $5, $6, 256 # CHECK: bltuc $5, $6, 256 # encoding: [0x1c,0xa6,0x00,0x40]
# bnvc requires that rs >= rt but we accept both. See also bnec
bnvc $0, $0, 4 # CHECK: bnvc $zero, $zero, 4 # encoding: [0x60,0x00,0x00,0x01]
bnvc $2, $0, 4 # CHECK: bnvc $2, $zero, 4 # encoding: [0x60,0x40,0x00,0x01]
@@ -56,38 +63,39 @@
bovc $0, $0, 4 # CHECK: bovc $zero, $zero, 4 # encoding: [0x20,0x00,0x00,0x01]
bovc $2, $0, 4 # CHECK: bovc $2, $zero, 4 # encoding: [0x20,0x40,0x00,0x01]
bovc $4, $2, 4 # CHECK: bovc $4, $2, 4 # encoding: [0x20,0x82,0x00,0x01]
- cmp.f.s $f2,$f3,$f4 # CHECK: cmp.f.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x80]
- cmp.f.d $f2,$f3,$f4 # CHECK: cmp.f.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x80]
+ cache 1, 8($5) # CHECK: cache 1, 8($5) # encoding: [0x7c,0xa1,0x04,0x25]
+ cmp.af.s $f2,$f3,$f4 # CHECK: cmp.af.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x80]
+ cmp.af.d $f2,$f3,$f4 # CHECK: cmp.af.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x80]
cmp.un.s $f2,$f3,$f4 # CHECK: cmp.un.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x81]
cmp.un.d $f2,$f3,$f4 # CHECK: cmp.un.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x81]
cmp.eq.s $f2,$f3,$f4 # CHECK: cmp.eq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x82]
cmp.eq.d $f2,$f3,$f4 # CHECK: cmp.eq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x82]
cmp.ueq.s $f2,$f3,$f4 # CHECK: cmp.ueq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x83]
cmp.ueq.d $f2,$f3,$f4 # CHECK: cmp.ueq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x83]
- cmp.olt.s $f2,$f3,$f4 # CHECK: cmp.olt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x84]
- cmp.olt.d $f2,$f3,$f4 # CHECK: cmp.olt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x84]
+ cmp.lt.s $f2,$f3,$f4 # CHECK: cmp.lt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x84]
+ cmp.lt.d $f2,$f3,$f4 # CHECK: cmp.lt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x84]
cmp.ult.s $f2,$f3,$f4 # CHECK: cmp.ult.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x85]
cmp.ult.d $f2,$f3,$f4 # CHECK: cmp.ult.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x85]
- cmp.ole.s $f2,$f3,$f4 # CHECK: cmp.ole.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x86]
- cmp.ole.d $f2,$f3,$f4 # CHECK: cmp.ole.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x86]
+ cmp.le.s $f2,$f3,$f4 # CHECK: cmp.le.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x86]
+ cmp.le.d $f2,$f3,$f4 # CHECK: cmp.le.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x86]
cmp.ule.s $f2,$f3,$f4 # CHECK: cmp.ule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x87]
cmp.ule.d $f2,$f3,$f4 # CHECK: cmp.ule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x87]
- cmp.sf.s $f2,$f3,$f4 # CHECK: cmp.sf.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x88]
- cmp.sf.d $f2,$f3,$f4 # CHECK: cmp.sf.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x88]
- cmp.ngle.s $f2,$f3,$f4 # CHECK: cmp.ngle.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x89]
- cmp.ngle.d $f2,$f3,$f4 # CHECK: cmp.ngle.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x89]
+ cmp.saf.s $f2,$f3,$f4 # CHECK: cmp.saf.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x88]
+ cmp.saf.d $f2,$f3,$f4 # CHECK: cmp.saf.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x88]
+ cmp.sun.s $f2,$f3,$f4 # CHECK: cmp.sun.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x89]
+ cmp.sun.d $f2,$f3,$f4 # CHECK: cmp.sun.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x89]
cmp.seq.s $f2,$f3,$f4 # CHECK: cmp.seq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8a]
cmp.seq.d $f2,$f3,$f4 # CHECK: cmp.seq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8a]
- cmp.ngl.s $f2,$f3,$f4 # CHECK: cmp.ngl.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8b]
- cmp.ngl.d $f2,$f3,$f4 # CHECK: cmp.ngl.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8b]
- cmp.lt.s $f2,$f3,$f4 # CHECK: cmp.lt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8c]
- cmp.lt.d $f2,$f3,$f4 # CHECK: cmp.lt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8c]
- cmp.nge.s $f2,$f3,$f4 # CHECK: cmp.nge.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8d]
- cmp.nge.d $f2,$f3,$f4 # CHECK: cmp.nge.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8d]
- cmp.le.s $f2,$f3,$f4 # CHECK: cmp.le.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8e]
- cmp.le.d $f2,$f3,$f4 # CHECK: cmp.le.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8e]
- cmp.ngt.s $f2,$f3,$f4 # CHECK: cmp.ngt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
- cmp.ngt.d $f2,$f3,$f4 # CHECK: cmp.ngt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
+ cmp.sueq.s $f2,$f3,$f4 # CHECK: cmp.sueq.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8b]
+ cmp.sueq.d $f2,$f3,$f4 # CHECK: cmp.sueq.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8b]
+ cmp.slt.s $f2,$f3,$f4 # CHECK: cmp.slt.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8c]
+ cmp.slt.d $f2,$f3,$f4 # CHECK: cmp.slt.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8c]
+ cmp.sult.s $f2,$f3,$f4 # CHECK: cmp.sult.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8d]
+ cmp.sult.d $f2,$f3,$f4 # CHECK: cmp.sult.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8d]
+ cmp.sle.s $f2,$f3,$f4 # CHECK: cmp.sle.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8e]
+ cmp.sle.d $f2,$f3,$f4 # CHECK: cmp.sle.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8e]
+ cmp.sule.s $f2,$f3,$f4 # CHECK: cmp.sule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
+ cmp.sule.d $f2,$f3,$f4 # CHECK: cmp.sule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
dalign $4,$2,$3,5 # CHECK: dalign $4, $2, $3, 5 # encoding: [0x7c,0x43,0x23,0x64]
daui $3,$2,0x1234 # CHECK: daui $3, $2, 4660 # encoding: [0x74,0x62,0x12,0x34]
dahi $3,0x5678 # CHECK: dahi $3, 22136 # encoding: [0x04,0x66,0x56,0x78]
@@ -103,20 +111,24 @@
ddivu $2,$3,$4 # CHECK: ddivu $2, $3, $4 # encoding: [0x00,0x64,0x10,0x9f]
dmod $2,$3,$4 # CHECK: dmod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xde]
dmodu $2,$3,$4 # CHECK: dmodu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdf]
+ lsa $2, $3, $4, 3 # CHECK: lsa $2, $3, $4, 3 # encoding: [0x00,0x64,0x10,0xc5]
+ dlsa $2, $3, $4, 3 # CHECK: dlsa $2, $3, $4, 3 # encoding: [0x00,0x64,0x10,0xd5]
+ ldpc $2,123456 # CHECK: ldpc $2, 123456 # encoding: [0xec,0x58,0x3c,0x48]
lwpc $2,268 # CHECK: lwpc $2, 268 # encoding: [0xec,0x48,0x00,0x43]
lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43]
-# mul $2,$3,$4 # CHECK-TODO: mul $2, $3, $4 # encoding: [0x00,0x64,0x10,0x98]
+ mul $2,$3,$4 # CHECK: mul $2, $3, $4 # encoding: [0x00,0x64,0x10,0x98]
muh $2,$3,$4 # CHECK: muh $2, $3, $4 # encoding: [0x00,0x64,0x10,0xd8]
mulu $2,$3,$4 # CHECK: mulu $2, $3, $4 # encoding: [0x00,0x64,0x10,0x99]
muhu $2,$3,$4 # CHECK: muhu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xd9]
- dmul $2,$3,$4 # CHECK: dmul $2, $3, $4 # encoding: [0x00,0x64,0x10,0xb8]
- dmuh $2,$3,$4 # CHECK: dmuh $2, $3, $4 # encoding: [0x00,0x64,0x10,0xf8]
- dmulu $2,$3,$4 # CHECK: dmulu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xb9]
- dmuhu $2,$3,$4 # CHECK: dmuhu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xf9]
+ dmul $2,$3,$4 # CHECK: dmul $2, $3, $4 # encoding: [0x00,0x64,0x10,0x9c]
+ dmuh $2,$3,$4 # CHECK: dmuh $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdc]
+ dmulu $2,$3,$4 # CHECK: dmulu $2, $3, $4 # encoding: [0x00,0x64,0x10,0x9d]
+ dmuhu $2,$3,$4 # CHECK: dmuhu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdd]
maddf.s $f2,$f3,$f4 # CHECK: maddf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x98]
maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98]
msubf.s $f2,$f3,$f4 # CHECK: msubf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x99]
msubf.d $f2,$f3,$f4 # CHECK: msubf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x99]
+ pref 1, 8($5) # CHECK: pref 1, 8($5) # encoding: [0x7c,0xa1,0x04,0x35]
sel.d $f0,$f1,$f2 # CHECK: sel.d $f0, $f1, $f2 # encoding: [0x46,0x22,0x08,0x10]
sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35]
@@ -129,6 +141,7 @@
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14]
seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14]
selnez.s $f0, $f2, $f4 # CHECK: selnez.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x17]
@@ -137,3 +150,24 @@
rint.d $f2, $f4 # CHECK: rint.d $f2, $f4 # encoding: [0x46,0x20,0x20,0x9a]
class.s $f2, $f4 # CHECK: class.s $f2, $f4 # encoding: [0x46,0x00,0x20,0x9b]
class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x46,0x20,0x20,0x9b]
+ jr.hb $4 # CHECK: jr.hb $4 # encoding: [0x00,0x80,0x04,0x09]
+ jalr.hb $4 # CHECK: jalr.hb $4 # encoding: [0x00,0x80,0xfc,0x09]
+ jalr.hb $4, $5 # CHECK: jalr.hb $4, $5 # encoding: [0x00,0xa0,0x24,0x09]
+ ldc2 $8, -701($at) # CHECK: ldc2 $8, -701($1) # encoding: [0x49,0xc8,0x0d,0x43]
+ lwc2 $18,-841($a2) # CHECK: lwc2 $18, -841($6) # encoding: [0x49,0x52,0x34,0xb7]
+ sdc2 $20,629($s2) # CHECK: sdc2 $20, 629($18) # encoding: [0x49,0xf4,0x92,0x75]
+ swc2 $25,304($s0) # CHECK: swc2 $25, 304($16) # encoding: [0x49,0x79,0x81,0x30]
+ ll $v0,-153($s2) # CHECK: ll $2, -153($18) # encoding: [0x7e,0x42,0xb3,0xb6]
+ lld $zero,112($ra) # CHECK: lld $zero, 112($ra) # encoding: [0x7f,0xe0,0x38,0x37]
+ sc $15,-40($s3) # CHECK: sc $15, -40($19) # encoding: [0x7e,0x6f,0xec,0x26]
+ scd $15,-51($sp) # CHECK: scd $15, -51($sp) # encoding: [0x7f,0xaf,0xe6,0xa7]
+ clo $11,$a1 # CHECK: clo $11, $5 # encoding: [0x00,0xa0,0x58,0x51]
+ clz $sp,$gp # CHECK: clz $sp, $gp # encoding: [0x03,0x80,0xe8,0x50]
+ dclo $s2,$a2 # CHECK: dclo $18, $6 # encoding: [0x00,0xc0,0x90,0x53]
+ dclz $s0,$25 # CHECK: dclz $16, $25 # encoding: [0x03,0x20,0x80,0x52]
+ ssnop # WARNING: [[@LINE]]:9: warning: ssnop is deprecated for MIPS64r6 and is equivalent to a nop instruction
+ ssnop # CHECK: ssnop # encoding: [0x00,0x00,0x00,0x40]
+ sdbbp # CHECK: sdbbp # encoding: [0x00,0x00,0x00,0x0e]
+ sdbbp 34 # CHECK: sdbbp 34 # encoding: [0x00,0x00,0x08,0x8e]
+ sync # CHECK: sync # encoding: [0x00,0x00,0x00,0x0f]
+ sync 1 # CHECK: sync 1 # encoding: [0x00,0x00,0x00,0x4f]
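
The compact compare-and-branch instructions added above (bgec, bgeuc, bltc, bltuc) all use the classic I-type word layout: a 6-bit opcode, two 5-bit register fields, and a 16-bit immediate holding the branch offset in words. A small sketch reproducing the checked encodings; the opcode values are simply read off the encodings in the test and should be treated as assumptions:

```python
# Sketch: I-type encoding behind the R6 compact-branch checks above.

def itype(opcode, rs, rt, offset):
    # opcode(6) | rs(5) | rt(5) | imm16, with the offset stored as offset/4
    word = (opcode << 26) | (rs << 21) | (rt << 16) | ((offset >> 2) & 0xFFFF)
    return list(word.to_bytes(4, "big"))

assert itype(0x16, 2, 3, 256) == [0x58, 0x43, 0x00, 0x40]  # bgec  $2, $3, 256
assert itype(0x06, 2, 3, 256) == [0x18, 0x43, 0x00, 0x40]  # bgeuc $2, $3, 256
assert itype(0x17, 5, 6, 256) == [0x5C, 0xA6, 0x00, 0x40]  # bltc  $5, $6, 256
assert itype(0x07, 5, 6, 256) == [0x1C, 0xA6, 0x00, 0x40]  # bltuc $5, $6, 256
```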
diff --git a/test/MC/Mips/mips_abi_flags_xx.s b/test/MC/Mips/mips_abi_flags_xx.s
new file mode 100644
index 0000000..1d65e99
--- /dev/null
+++ b/test/MC/Mips/mips_abi_flags_xx.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=xx
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002001 01010005 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+ .module fp=xx
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
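
The SectionData checked above is a raw dump of the 24-byte .MIPS.abiflags record. A hedged decode of the fp=xx case, assuming the usual Elf_Mips_ABIFlags_v0 field layout and that fp_abi value 5 corresponds to Val_GNU_MIPS_ABI_FP_XX:

```python
import struct

# Sketch: decode the 24-byte .MIPS.abiflags record from SectionData.
data = bytes.fromhex("00002001" "01010005" + "00000000" * 4)

(version, isa_level, isa_rev, gpr_size, cpr1_size, cpr2_size,
 fp_abi, isa_ext, ases, flags1, flags2) = struct.unpack(">HBBBBBBIIII", data)

print(version)               # 0
print(isa_level, isa_rev)    # 32 1  -> MIPS32 release 1 (-mcpu=mips32)
print(gpr_size, cpr1_size)   # 1 1   -> 32-bit GPRs and FPRs
print(fp_abi)                # 5     -> FP_XX, i.e. ".module fp=xx"
```

The msa/abiflags.s dump below differs exactly where one would expect: isa_rev becomes 2, cpr1_size becomes the 128-bit value (3, presumably AFL_REG_128), fp_abi becomes 1 (fp=32), and the ases word gains the MSA bit (0x200).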
diff --git a/test/MC/Mips/mips_abi_flags_xx_set.s b/test/MC/Mips/mips_abi_flags_xx_set.s
new file mode 100644
index 0000000..56f19d3
--- /dev/null
+++ b/test/MC/Mips/mips_abi_flags_xx_set.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=xx
+# CHECK-ASM: .set fp=64
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002001 01010005 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+ .module fp=xx
+ .set fp=64
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/msa/abiflags.s b/test/MC/Mips/msa/abiflags.s
new file mode 100644
index 0000000..83b83cc
--- /dev/null
+++ b/test/MC/Mips/msa/abiflags.s
@@ -0,0 +1,38 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+msa | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+msa -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# CHECK-ASM: .module fp=32
+# CHECK-ASM: .set fp=64
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002002 01030001 00000000 00000200 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+ .module fp=32
+ .set fp=64
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/nacl-mask.s b/test/MC/Mips/nacl-mask.s
index 8205835..22286ac 100644
--- a/test/MC/Mips/nacl-mask.s
+++ b/test/MC/Mips/nacl-mask.s
@@ -283,3 +283,37 @@ test5:
# CHECK-NEXT: and $25, $25, $14
# CHECK-NEXT: jalr $25
# CHECK-NEXT: addiu $4, $zero, 5
+
+
+
+# Test that we can put non-dangerous loads and stores in branch delay slot.
+
+ .align 4
+test6:
+ .set noreorder
+
+ jal func1
+ sw $4, 0($sp)
+
+ bal func2
+ lw $5, 0($t8)
+
+ jalr $t9
+ sw $sp, 0($sp)
+
+# CHECK-LABEL: test6:
+
+# CHECK-NEXT: nop
+# CHECK-NEXT: nop
+# CHECK-NEXT: jal
+# CHECK-NEXT: sw $4, 0($sp)
+
+# CHECK-NEXT: nop
+# CHECK-NEXT: nop
+# CHECK-NEXT: bal
+# CHECK-NEXT: lw $5, 0($24)
+
+# CHECK-NEXT: nop
+# CHECK-NEXT: and $25, $25, $14
+# CHECK-NEXT: jalr
+# CHECK-NEXT: sw $sp, 0($sp)
diff --git a/test/MC/Mips/nooddspreg-cmdarg.s b/test/MC/Mips/nooddspreg-cmdarg.s
new file mode 100644
index 0000000..826db12
--- /dev/null
+++ b/test/MC/Mips/nooddspreg-cmdarg.s
@@ -0,0 +1,43 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64,+nooddspreg | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64,+nooddspreg -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=-n64,+n32,+nooddspreg 2> %t0
+# RUN: FileCheck %s -check-prefix=INVALID < %t0
+#
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=+nooddspreg 2> %t0
+# RUN: FileCheck %s -check-prefix=INVALID < %t0
+#
+# CHECK-ASM-NOT: .module nooddspreg
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002001 01020007 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+# INVALID: ERROR: -mno-odd-spreg requires the O32 ABI
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/nooddspreg-error.s b/test/MC/Mips/nooddspreg-error.s
new file mode 100644
index 0000000..b4aabbe
--- /dev/null
+++ b/test/MC/Mips/nooddspreg-error.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 2> %t0 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+# RUN: FileCheck %s -check-prefix=CHECK-ERROR < %t0
+#
+ .module nooddspreg
+# CHECK-ASM: .module nooddspreg
+
+ add.s $f1, $f2, $f5
+# CHECK-ERROR: :[[@LINE-1]]:15: error: -mno-odd-spreg prohibits the use of odd FPU registers
+# CHECK-ERROR: :[[@LINE-2]]:25: error: -mno-odd-spreg prohibits the use of odd FPU registers
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/nooddspreg.s b/test/MC/Mips/nooddspreg.s
new file mode 100644
index 0000000..5a283f5
--- /dev/null
+++ b/test/MC/Mips/nooddspreg.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ
+
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 -mattr=-n64,n32 2> %t1
+# RUN: FileCheck %s -check-prefix=INVALID < %t1
+#
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips64 2> %t2
+# RUN: FileCheck %s -check-prefix=INVALID < %t2
+#
+# CHECK-ASM: .module nooddspreg
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ: Section {
+# CHECK-OBJ: Index: 5
+# CHECK-OBJ: Name: .MIPS.abiflags (12)
+# CHECK-OBJ: Type: (0x7000002A)
+# CHECK-OBJ: Flags [ (0x2)
+# CHECK-OBJ: SHF_ALLOC (0x2)
+# CHECK-OBJ: ]
+# CHECK-OBJ: Address: 0x0
+# CHECK-OBJ: Offset: 0x50
+# CHECK-OBJ: Size: 24
+# CHECK-OBJ: Link: 0
+# CHECK-OBJ: Info: 0
+# CHECK-OBJ: AddressAlignment: 8
+# CHECK-OBJ: EntrySize: 0
+# CHECK-OBJ: Relocations [
+# CHECK-OBJ: ]
+# CHECK-OBJ: SectionData (
+# CHECK-OBJ: 0000: 00002001 01020007 00000000 00000000 |.. .............|
+# CHECK-OBJ: 0010: 00000000 00000000 |........|
+# CHECK-OBJ: )
+# CHECK-OBJ: }
+
+# INVALID: '.module nooddspreg' requires the O32 ABI
+
+ .module nooddspreg
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
diff --git a/test/MC/Mips/oddspreg.s b/test/MC/Mips/oddspreg.s
new file mode 100644
index 0000000..f5aa9c0
--- /dev/null
+++ b/test/MC/Mips/oddspreg.s
@@ -0,0 +1,56 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-O32
+#
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N32
+
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 | \
+# RUN: FileCheck %s -check-prefix=CHECK-ASM
+#
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
+# RUN: llvm-readobj -sections -section-data -section-relocations - | \
+# RUN: FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N64
+
+# CHECK-ASM: .module oddspreg
+
+# Check that the .MIPS.abiflags section was correctly emitted.
+# CHECK-OBJ-ALL: Section {
+# CHECK-OBJ-ALL: Index: 5
+# CHECK-OBJ-ALL: Name: .MIPS.abiflags ({{[0-9]+}})
+# CHECK-OBJ-ALL: Type: (0x7000002A)
+# CHECK-OBJ-ALL: Flags [ (0x2)
+# CHECK-OBJ-ALL: SHF_ALLOC (0x2)
+# CHECK-OBJ-ALL: ]
+# CHECK-OBJ-ALL: Address: 0x0
+# CHECK-OBJ-ALL: Offset: 0x{{[0-9A-F]+}}
+# CHECK-OBJ-ALL: Size: 24
+# CHECK-OBJ-ALL: Link: 0
+# CHECK-OBJ-ALL: Info: 0
+# CHECK-OBJ-ALL: AddressAlignment: 8
+# CHECK-OBJ-ALL: EntrySize: 0
+# CHECK-OBJ-ALL: Relocations [
+# CHECK-OBJ-ALL: ]
+# CHECK-OBJ-ALL: SectionData (
+# CHECK-OBJ-O32: 0000: 00002001 01020006 00000000 00000000 |.. .............|
+# CHECK-OBJ-O32: 0010: 00000001 00000000 |........|
+# CHECK-OBJ-N32: 0000: 00004001 02020001 00000000 00000000 |..@.............|
+# CHECK-OBJ-N32: 0010: 00000001 00000000 |........|
+# CHECK-OBJ-N64: 0000: 00004001 02020001 00000000 00000000 |..@.............|
+# CHECK-OBJ-N64: 0010: 00000001 00000000 |........|
+# CHECK-OBJ-ALL: )
+# CHECK-OBJ-ALL: }
+
+ .module oddspreg
+ add.s $f3, $f1, $f5
+
+# FIXME: Test should include gnu_attributes directive when implemented.
+# An explicit .gnu_attribute must be checked against the effective
+# command line options and any inconsistencies reported via a warning.
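
Across the oddspreg/nooddspreg tests the interesting bytes are fp_abi (offset 7) and the flags1 word: with odd single-precision registers allowed, the dumps show fp_abi 6 with flags1 bit 0 set, while nooddspreg flips them to 7 and 0. A small decode of the two O32/fp64 dumps, assuming flags1 bit 0 is the odd-spreg flag:

```python
import struct

# Sketch: compare fp_abi and flags1 in the oddspreg/nooddspreg dumps.
def fp_abi_and_flags1(hexdump):
    raw = bytes.fromhex(hexdump)
    fp_abi = raw[7]                              # 8th byte of the record
    flags1 = struct.unpack(">I", raw[16:20])[0]  # fifth 32-bit word
    return fp_abi, flags1

oddspreg   = "00002001010200060000000000000000" "0000000100000000"
nooddspreg = "00002001010200070000000000000000" "0000000000000000"

print(fp_abi_and_flags1(oddspreg))    # (6, 1): fp=64, odd spregs allowed
print(fp_abi_and_flags1(nooddspreg))  # (7, 0): fp=64 no-odd-spreg variant
```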
diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg
index 193ebeb..0913324 100644
--- a/test/MC/PowerPC/lit.local.cfg
+++ b/test/MC/PowerPC/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
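
This same two-line simplification is applied to every per-target lit.local.cfg in the patch. It presupposes that the top-level lit configuration now publishes the built targets once as a set-like config.targets attribute; a minimal simulation of that convention (the Cfg class and the values are stand-ins, not lit's actual objects):

```python
# Sketch: the convention the updated lit.local.cfg files rely on,
# simulated outside of lit.

class Cfg:
    pass

root = Cfg()
root.targets_to_build = "X86 PowerPC Sparc SystemZ"
# Assumed to be done once in the root lit.cfg:
root.targets = frozenset(root.targets_to_build.split())

config = Cfg()
config.root = root
# What each per-target lit.local.cfg now reduces to:
if not 'PowerPC' in config.root.targets:
    config.unsupported = True
print(hasattr(config, "unsupported"))  # False: PowerPC was built
```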
diff --git a/test/MC/PowerPC/ppc64-initial-cfa.s b/test/MC/PowerPC/ppc64-initial-cfa.s
index ca97e1b..d0bc6b3 100644
--- a/test/MC/PowerPC/ppc64-initial-cfa.s
+++ b/test/MC/PowerPC/ppc64-initial-cfa.s
@@ -28,8 +28,8 @@ _proc:
# STATIC-NEXT: Relocations [
# STATIC-NEXT: ]
# STATIC-NEXT: SectionData (
-# STATIC-BE-NEXT: 0000: 00000010 00000000 017A5200 04784101
-# STATIC-LE-NEXT: 0000: 10000000 00000000 017A5200 04784101
+# STATIC-BE-NEXT: 0000: 00000010 00000000 037A5200 04784101
+# STATIC-LE-NEXT: 0000: 10000000 00000000 037A5200 04784101
# STATIC-BE-NEXT: 0010: 1B0C0100 00000010 00000018 00000000
# STATIC-LE-NEXT: 0010: 1B0C0100 10000000 18000000 00000000
# STATIC-BE-NEXT: 0020: 00000004 00000000
@@ -69,8 +69,8 @@ _proc:
# PIC-NEXT: Relocations [
# PIC-NEXT: ]
# PIC-NEXT: SectionData (
-# PIC-BE-NEXT: 0000: 00000010 00000000 017A5200 04784101
-# PIC-LE-NEXT: 0000: 10000000 00000000 017A5200 04784101
+# PIC-BE-NEXT: 0000: 00000010 00000000 037A5200 04784101
+# PIC-LE-NEXT: 0000: 10000000 00000000 037A5200 04784101
# PIC-BE-NEXT: 0010: 1B0C0100 00000010 00000018 00000000
# PIC-LE-NEXT: 0010: 1B0C0100 10000000 18000000 00000000
# PIC-BE-NEXT: 0020: 00000004 00000000
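
The only byte that changes in these dumps is at offset 8 of the section: the version field of the CIE header (0x01 to 0x03). In DWARF, a version 3 CIE encodes the return-address register as a ULEB128 rather than a single ubyte. A hedged decode of the first dump row, with offsets per the standard .eh_frame CIE layout; reading register 65 as the PPC64 link register is the usual mapping, stated here as an assumption:

```python
import struct

# Sketch: parse the start of the CIE whose version byte changed above.
cie = bytes.fromhex("00000010" "00000000" "037A5200" "04784101")  # BE dump

length  = struct.unpack(">I", cie[0:4])[0]  # 0x10: 16 more bytes follow
cie_id  = struct.unpack(">I", cie[4:8])[0]  # 0 marks this entry as a CIE
version = cie[8]                            # was 1, now 3
augment = cie[9:12]                         # b"zR\x00"
code_align = cie[12]                        # ULEB128: 4
data_align = cie[13] - 0x80                 # SLEB128 0x78 -> -8
ra_reg     = cie[14]                        # 65: PPC64 LR (assumed mapping)
print(length, cie_id, version, augment)     # 16 0 3 b'zR\x00'
print(code_align, data_align, ra_reg)       # 4 -8 65
```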
diff --git a/test/MC/Sparc/lit.local.cfg b/test/MC/Sparc/lit.local.cfg
index 4d344fa..fa6a54e 100644
--- a/test/MC/Sparc/lit.local.cfg
+++ b/test/MC/Sparc/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Sparc' in targets:
+if not 'Sparc' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/SystemZ/insn-bad-z196.s b/test/MC/SystemZ/insn-bad-z196.s
index da23a4b..47dbe08 100644
--- a/test/MC/SystemZ/insn-bad-z196.s
+++ b/test/MC/SystemZ/insn-bad-z196.s
@@ -411,6 +411,60 @@
lbh %r0, 524288
#CHECK: error: invalid operand
+#CHECK: ldxbra %f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ldxbra %f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: ldxbra %f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: ldxbra %f0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: ldxbra %f0, 0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: ldxbra %f2, 0, %f0, 0
+
+ ldxbra %f0, 0, %f0, -1
+ ldxbra %f0, 0, %f0, 16
+ ldxbra %f0, -1, %f0, 0
+ ldxbra %f0, 16, %f0, 0
+ ldxbra %f0, 0, %f2, 0
+ ldxbra %f2, 0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: ledbra %f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ledbra %f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: ledbra %f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: ledbra %f0, 16, %f0, 0
+
+ ledbra %f0, 0, %f0, -1
+ ledbra %f0, 0, %f0, 16
+ ledbra %f0, -1, %f0, 0
+ ledbra %f0, 16, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: lexbra %f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: lexbra %f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: lexbra %f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: lexbra %f0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: lexbra %f0, 0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: lexbra %f2, 0, %f0, 0
+
+ lexbra %f0, 0, %f0, -1
+ lexbra %f0, 0, %f0, 16
+ lexbra %f0, -1, %f0, 0
+ lexbra %f0, 16, %f0, 0
+ lexbra %f0, 0, %f2, 0
+ lexbra %f2, 0, %f0, 0
+
+#CHECK: error: invalid operand
#CHECK: lfh %r0, -524289
#CHECK: error: invalid operand
#CHECK: lfh %r0, 524288
diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s
index 8004168..a08cb34 100644
--- a/test/MC/SystemZ/insn-bad.s
+++ b/test/MC/SystemZ/insn-bad.s
@@ -1560,6 +1560,11 @@
ldxbr %f0, %f2
ldxbr %f2, %f0
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: ldxbra %f0, 0, %f0, 0
+
+ ldxbra %f0, 0, %f0, 0
+
#CHECK: error: invalid operand
#CHECK: ldy %f0, -524289
#CHECK: error: invalid operand
@@ -1576,6 +1581,11 @@
le %f0, -1
le %f0, 4096
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: ledbra %f0, 0, %f0, 0
+
+ ledbra %f0, 0, %f0, 0
+
#CHECK: error: invalid register pair
#CHECK: lexbr %f0, %f2
#CHECK: error: invalid register pair
@@ -1584,6 +1594,11 @@
lexbr %f0, %f2
lexbr %f2, %f0
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: lexbra %f0, 0, %f0, 0
+
+ lexbra %f0, 0, %f0, 0
+
#CHECK: error: invalid operand
#CHECK: ley %f0, -524289
#CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s
index 834bdad..db5ecdd 100644
--- a/test/MC/SystemZ/insn-good-z196.s
+++ b/test/MC/SystemZ/insn-good-z196.s
@@ -675,6 +675,48 @@
lbh %r0, 524287(%r15,%r1)
lbh %r15, 0
+#CHECK: ldxbra %f0, 0, %f0, 0 # encoding: [0xb3,0x45,0x00,0x00]
+#CHECK: ldxbra %f0, 0, %f0, 15 # encoding: [0xb3,0x45,0x0f,0x00]
+#CHECK: ldxbra %f0, 0, %f13, 0 # encoding: [0xb3,0x45,0x00,0x0d]
+#CHECK: ldxbra %f0, 15, %f0, 0 # encoding: [0xb3,0x45,0xf0,0x00]
+#CHECK: ldxbra %f4, 5, %f8, 9 # encoding: [0xb3,0x45,0x59,0x48]
+#CHECK: ldxbra %f13, 0, %f0, 0 # encoding: [0xb3,0x45,0x00,0xd0]
+
+ ldxbra %f0, 0, %f0, 0
+ ldxbra %f0, 0, %f0, 15
+ ldxbra %f0, 0, %f13, 0
+ ldxbra %f0, 15, %f0, 0
+ ldxbra %f4, 5, %f8, 9
+ ldxbra %f13, 0, %f0, 0
+
+#CHECK: ledbra %f0, 0, %f0, 0 # encoding: [0xb3,0x44,0x00,0x00]
+#CHECK: ledbra %f0, 0, %f0, 15 # encoding: [0xb3,0x44,0x0f,0x00]
+#CHECK: ledbra %f0, 0, %f15, 0 # encoding: [0xb3,0x44,0x00,0x0f]
+#CHECK: ledbra %f0, 15, %f0, 0 # encoding: [0xb3,0x44,0xf0,0x00]
+#CHECK: ledbra %f4, 5, %f6, 7 # encoding: [0xb3,0x44,0x57,0x46]
+#CHECK: ledbra %f15, 0, %f0, 0 # encoding: [0xb3,0x44,0x00,0xf0]
+
+ ledbra %f0, 0, %f0, 0
+ ledbra %f0, 0, %f0, 15
+ ledbra %f0, 0, %f15, 0
+ ledbra %f0, 15, %f0, 0
+ ledbra %f4, 5, %f6, 7
+ ledbra %f15, 0, %f0, 0
+
+#CHECK: lexbra %f0, 0, %f0, 0 # encoding: [0xb3,0x46,0x00,0x00]
+#CHECK: lexbra %f0, 0, %f0, 15 # encoding: [0xb3,0x46,0x0f,0x00]
+#CHECK: lexbra %f0, 0, %f13, 0 # encoding: [0xb3,0x46,0x00,0x0d]
+#CHECK: lexbra %f0, 15, %f0, 0 # encoding: [0xb3,0x46,0xf0,0x00]
+#CHECK: lexbra %f4, 5, %f8, 9 # encoding: [0xb3,0x46,0x59,0x48]
+#CHECK: lexbra %f13, 0, %f0, 0 # encoding: [0xb3,0x46,0x00,0xd0]
+
+ lexbra %f0, 0, %f0, 0
+ lexbra %f0, 0, %f0, 15
+ lexbra %f0, 0, %f13, 0
+ lexbra %f0, 15, %f0, 0
+ lexbra %f4, 5, %f8, 9
+ lexbra %f13, 0, %f0, 0
+
#CHECK: lfh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0xca]
#CHECK: lfh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xca]
#CHECK: lfh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0xca]
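
The ldxbra/ledbra/lexbra checks above all follow one 4-byte pattern: two opcode bytes, then a byte packing the two rounding-mode fields, then a byte packing the two register fields (an RRF-style layout). A sketch with the opcodes read off the encodings (0xB345/0xB344/0xB346), treated as assumptions:

```python
# Sketch: 4-byte RRF-style layout behind the *bra checks above.
def rrf(opcode, r1, m3, r2, m4):
    return [opcode >> 8, opcode & 0xFF, (m3 << 4) | m4, (r1 << 4) | r2]

assert rrf(0xB345, 4, 5, 8, 9)  == [0xB3, 0x45, 0x59, 0x48]  # ldxbra %f4,5,%f8,9
assert rrf(0xB344, 4, 5, 6, 7)  == [0xB3, 0x44, 0x57, 0x46]  # ledbra %f4,5,%f6,7
assert rrf(0xB346, 13, 0, 0, 0) == [0xB3, 0x46, 0x00, 0xD0]  # lexbra %f13,0,%f0,0
```

The "invalid register pair" diagnostics in the insn-bad tests follow from the extended-precision operands: a 128-bit FP value occupies a register pair, so only every other FP register can name one, which is why %f2 is rejected where ldxbra/lexbra expect a pair.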
diff --git a/test/MC/SystemZ/lit.local.cfg b/test/MC/SystemZ/lit.local.cfg
index b12af09..5c02dd3 100644
--- a/test/MC/SystemZ/lit.local.cfg
+++ b/test/MC/SystemZ/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'SystemZ' in targets:
+if not 'SystemZ' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/X86/AlignedBundling/lit.local.cfg b/test/MC/X86/AlignedBundling/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/MC/X86/AlignedBundling/lit.local.cfg
+++ b/test/MC/X86/AlignedBundling/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 2915b7a..187b512 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -1,4 +1,5 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl --show-encoding %s | FileCheck %s
+// RUN: not llvm-mc -triple x86_64-unknown-unknown -mcpu=knl --show-encoding %s 2> %t.err | FileCheck %s
+// RUN: FileCheck --check-prefix=ERR < %t.err %s
// CHECK: vaddpd %zmm6, %zmm27, %zmm8
// CHECK: encoding: [0x62,0x71,0xa5,0x40,0x58,0xc6]
@@ -3128,6 +3129,14 @@ vpsrad 512(%rdi, %rsi, 4), %zmm12, %zmm25
// CHECK: encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc8]
vpbroadcastd %xmm0, %zmm1 {%k1} {z}
+// CHECK: vbroadcasti32x4 {{.*}} {%k7} {z}
+// CHECK: encoding: [0x67,0x62,0xf2,0x7d,0xcf,0x5a,0x52,0x02]
+vbroadcasti32x4 0x20(%edx), %zmm2 {%k7} {z}
+
+// CHECK: vbroadcasti64x4 {{.*}} %zmm22
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x5b,0x72,0x02]
+vbroadcasti64x4 0x40(%rdx), %zmm22
+
// CHECK: vmovdqu64 {{.*}} {%k3}
// CHECK: encoding: [0x62,0xf1,0xfe,0x4b,0x7f,0x07]
vmovdqu64 %zmm0, (%rdi) {%k3}
@@ -3151,3 +3160,62 @@ vaddpd 512(%rdi, %rsi, 8) {1to8}, %zmm20, %zmm30
// CHECK: vaddps {{.*}}{1to16}
// CHECK: encoding: [0x62,0x61,0x5c,0x50,0x58,0xb4,0xf7,0x00,0x02,0x00,0x00]
vaddps 512(%rdi, %rsi, 8) {1to16}, %zmm20, %zmm30
+
+// CHECK: vmovntdqa
+// CHECK: encoding: [0x62,0x72,0x7d,0x48,0x2a,0xab,0x78,0x56,0x34,0x12]
+vmovntdqa 0x12345678(%rbx), %zmm13
+
+// CHECK: vmovntdqa
+// CHECK: encoding: [0x62,0xc2,0x7d,0x48,0x2a,0x14,0x56]
+vmovntdqa (%r14,%rdx,2), %zmm18
+
+// CHECK: vmovntdqa
+// CHECK: encoding: [0x62,0xc2,0x7d,0x48,0x2a,0x7c,0x14,0x02]
+vmovntdqa 128(%r12,%rdx), %zmm23
+
+// CHECK: vmovntdq
+// CHECK: encoding: [0x62,0x21,0x7d,0x48,0xe7,0x24,0xa9]
+vmovntdq %zmm28, (%rcx,%r13,4)
+
+// CHECK: vmovntpd
+// CHECK: encoding: [0x62,0xf1,0xfd,0x48,0x2b,0xb2,0x04,0x00,0x00,0x00]
+vmovntpd %zmm6, 4(%rdx)
+
+// CHECK: vmovntps
+// CHECK: encoding: [0x62,0x51,0x7c,0x48,0x2b,0x5c,0x8d,0x00]
+vmovntps %zmm11, (%r13,%rcx,4)
+
+// CHECK: vcmpps $14
+// CHECK: encoding: [0x62,0xb1,0x54,0x48,0xc2,0xd1,0x0e]
+vcmpgtps %zmm17, %zmm5, %k2
+
+// CHECK: vcmppd $13
+// CHECK: encoding: [0x62,0xd1,0xf5,0x40,0xc2,0x76,0x02,0x0d]
+vcmpgepd 0x80(%r14), %zmm17, %k6
+
+// CHECK: vpcmpd $1,
+// CHECK: encoding: [0x62,0x93,0x45,0x4c,0x1f,0xe8,0x01]
+vpcmpd $1, %zmm24, %zmm7, %k5{%k4}
+
+// CHECK: vpcmpuq $2,
+// CHECK: encoding: [0x62,0xf3,0xf5,0x47,0x1e,0x72,0x01,0x02]
+vpcmpuq $2, 0x40(%rdx), %zmm17, %k6{%k7}
+
+// ERR: invalid operand for instruction
+vpcmpd $1, %zmm24, %zmm7, %k5{%k0}
+
+// CHECK: vpermi2d
+// CHECK: encoding: [0x62,0x42,0x6d,0x4b,0x76,0xd6]
+vpermi2d %zmm14, %zmm2, %zmm26 {%k3}
+
+// CHECK: vpermt2pd
+// CHECK: encoding: [0x62,0xf2,0xcd,0xc6,0x7f,0xf3]
+vpermt2pd %zmm3, %zmm22, %zmm6 {%k6} {z}
+
+// CHECK: vpermi2q
+// CHECK: encoding: [0x62,0x62,0xed,0x4b,0x76,0x54,0x58,0x02]
+vpermi2q 0x80(%rax,%rbx,2), %zmm2, %zmm26 {%k3}
+
+// CHECK: vpermt2d
+// CHECK: encoding: [0x62,0x32,0x4d,0xc2,0x7e,0x24,0xad,0x05,0x00,0x00,0x00]
+vpermt2d 5(,%r13,4), %zmm22, %zmm12 {%k2} {z}
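
The new ERR check pins down a rule of EVEX write-masking: the 3-bit mask field uses the value 0 to mean "no masking", so %k0 can never be written inside {...}, while %k1-%k7 can. A toy operand check mirroring that rule — the error string matches the ERR line above; everything else is illustrative:

```python
# Sketch: why "{%k0}" is rejected while "{%k1}".."{%k7}" assemble.
def writemask_field(kreg):
    if not 0 <= kreg <= 7:
        raise ValueError("no such mask register")
    if kreg == 0:  # the EVEX field value 0 is reserved for "unmasked"
        raise ValueError("invalid operand for instruction")
    return kreg    # value stored in the EVEX mask bits

print(writemask_field(4))  # 4, as in "vpcmpd $1, %zmm24, %zmm7, %k5{%k4}"
```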
diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s
index 540282a..7968918 100644
--- a/test/MC/X86/intel-syntax.s
+++ b/test/MC/X86/intel-syntax.s
@@ -599,3 +599,11 @@ fxrstor64 opaque ptr [rax]
// CHECK: movq _g0+8, %rcx
mov rbx, qword ptr [_g0]
mov rcx, qword ptr [_g0 + 8]
+
+"?half@?0??bar@@YAXXZ@4NA":
+ .quad 4602678819172646912
+
+fadd "?half@?0??bar@@YAXXZ@4NA"
+fadd "?half@?0??bar@@YAXXZ@4NA"@IMGREL
+// CHECK: fadds "?half@?0??bar@@YAXXZ@4NA"
+// CHECK: fadds "?half@?0??bar@@YAXXZ@4NA"@IMGREL32
diff --git a/test/MC/X86/lit.local.cfg b/test/MC/X86/lit.local.cfg
index 19840aa..c8625f4 100644
--- a/test/MC/X86/lit.local.cfg
+++ b/test/MC/X86/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/MC/X86/no-elf-compact-unwind.s b/test/MC/X86/no-elf-compact-unwind.s
index 017c52a..4e9236b 100644
--- a/test/MC/X86/no-elf-compact-unwind.s
+++ b/test/MC/X86/no-elf-compact-unwind.s
@@ -1,4 +1,5 @@
// RUN: llvm-mc < %s -filetype=obj -triple x86_64-apple-macosx10.8.0 | llvm-readobj -s | FileCheck -check-prefix=MACHO %s
+// RUN: llvm-mc < %s -filetype=obj -triple x86_64-apple-ios7.0.0 | llvm-readobj -s | FileCheck -check-prefix=MACHO %s
// RUN: llvm-mc < %s -filetype=obj -triple x86_64-unknown-linux | llvm-readobj -s | FileCheck -check-prefix=ELF %s
.globl __Z3barv
diff --git a/test/MC/X86/x86_long_nop.s b/test/MC/X86/x86_long_nop.s
index ac1bc08..eee840c 100644
--- a/test/MC/X86/x86_long_nop.s
+++ b/test/MC/X86/x86_long_nop.s
@@ -2,6 +2,7 @@
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=SLM %s
# Ensure alignment directives also emit sequences of 15-byte NOPs on processors
# capable of using long NOPs.
@@ -13,3 +14,12 @@ inc %eax
# CHECK-NEXT: 10: nop
# CHECK-NEXT: 1f: nop
# CHECK-NEXT: 20: inc
+
+# On Silvermont we emit only 7-byte NOPs, since longer NOPs are not profitable.
+# SLM: 0: inc
+# SLM-NEXT: 1: nop
+# SLM-NEXT: 8: nop
+# SLM-NEXT: f: nop
+# SLM-NEXT: 16: nop
+# SLM-NEXT: 1d: nop
+# SLM-NEXT: 20: inc
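
The two prefixes check the same 31 bytes of alignment padding (from just past the inc at offset 0 up to the aligned inc at 0x20), split into NOPs with different maximum lengths: 15 bytes generically, 7 bytes on Silvermont. The printed offsets follow from a greedy split, sketched here:

```python
# Sketch: NOP start offsets for padding [start, end) with NOPs of at
# most max_len bytes, matching the CHECK/SLM offsets above.
def nop_offsets(start, end, max_len):
    offs = []
    while start < end:
        offs.append(start)
        start += min(max_len, end - start)
    return offs

print([hex(o) for o in nop_offsets(0x1, 0x20, 15)])
# ['0x1', '0x10', '0x1f']
print([hex(o) for o in nop_offsets(0x1, 0x20, 7)])
# ['0x1', '0x8', '0xf', '0x16', '0x1d']
```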
diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s
index 059f591..572487b 100644
--- a/test/MC/X86/x86_nop.s
+++ b/test/MC/X86/x86_nop.s
@@ -14,6 +14,7 @@
# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=c3 %s | llvm-objdump -d - | FileCheck %s
# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=c3-2 %s | llvm-objdump -d - | FileCheck %s
# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=core2 %s | llvm-objdump -d - | FileCheck --check-prefix=NOPL %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=slm %s | llvm-objdump -d - | FileCheck --check-prefix=NOPL %s
inc %eax
diff --git a/test/Makefile b/test/Makefile
index dc99fe1..c78c256 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -61,6 +61,15 @@ clang-tools-site-cfg: FORCE
extra-site-cfgs:: clang-tools-site-cfg
endif
+ifeq ($(shell test -f $(PROJ_OBJ_DIR)/../tools/lld/Makefile && echo OK), OK)
+LIT_ALL_TESTSUITES += $(PROJ_OBJ_DIR)/../tools/lld/test
+
+# Force creation of lld's lit.site.cfg.
+lld-site-cfg: FORCE
+ $(MAKE) -C $(PROJ_OBJ_DIR)/../tools/lld/test lit.site.cfg Unit/lit.site.cfg
+extra-site-cfgs:: lld-site-cfg
+endif
+
ifeq ($(shell test -f $(PROJ_OBJ_DIR)/../tools/polly/Makefile && echo OK), OK)
LIT_ALL_TESTSUITES += $(PROJ_OBJ_DIR)/../tools/polly/test
diff --git a/test/Object/ARM/lit.local.cfg b/test/Object/ARM/lit.local.cfg
index 5fc35d8..236e1d3 100644
--- a/test/Object/ARM/lit.local.cfg
+++ b/test/Object/ARM/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Object/Inputs/corrupt-archive.a b/test/Object/Inputs/corrupt-archive.a
new file mode 100644
index 0000000..f8940ff
--- /dev/null
+++ b/test/Object/Inputs/corrupt-archive.a
Binary files differ
diff --git a/test/Object/Inputs/darwin-m-test1.mach0-armv7 b/test/Object/Inputs/darwin-m-test1.mach0-armv7
new file mode 100644
index 0000000..2ce3a18
--- /dev/null
+++ b/test/Object/Inputs/darwin-m-test1.mach0-armv7
Binary files differ
diff --git a/test/Object/Inputs/darwin-m-test2.macho-i386 b/test/Object/Inputs/darwin-m-test2.macho-i386
new file mode 100644
index 0000000..dc0e865
--- /dev/null
+++ b/test/Object/Inputs/darwin-m-test2.macho-i386
Binary files differ
diff --git a/test/Object/Inputs/darwin-m-test3.macho-x86-64 b/test/Object/Inputs/darwin-m-test3.macho-x86-64
new file mode 100755
index 0000000..18960c4
--- /dev/null
+++ b/test/Object/Inputs/darwin-m-test3.macho-x86-64
Binary files differ
diff --git a/test/Object/Inputs/hello-world.macho-x86_64 b/test/Object/Inputs/hello-world.macho-x86_64
new file mode 100755
index 0000000..d004bed
--- /dev/null
+++ b/test/Object/Inputs/hello-world.macho-x86_64
Binary files differ
diff --git a/test/Object/Inputs/macho-archive-x86_64.a b/test/Object/Inputs/macho-archive-x86_64.a
new file mode 100644
index 0000000..9979ba9
--- /dev/null
+++ b/test/Object/Inputs/macho-archive-x86_64.a
Binary files differ
diff --git a/test/Object/Inputs/no-sections.elf-x86-64 b/test/Object/Inputs/no-sections.elf-x86-64
new file mode 100755
index 0000000..9b8ca2e
--- /dev/null
+++ b/test/Object/Inputs/no-sections.elf-x86-64
Binary files differ
diff --git a/test/Object/Inputs/program-headers.mips64 b/test/Object/Inputs/program-headers.mips64
new file mode 100644
index 0000000..ad21c7d
--- /dev/null
+++ b/test/Object/Inputs/program-headers.mips64
Binary files differ
diff --git a/test/Object/Inputs/trivial.ll b/test/Object/Inputs/trivial.ll
index 25ece76..463442e 100644
--- a/test/Object/Inputs/trivial.ll
+++ b/test/Object/Inputs/trivial.ll
@@ -10,3 +10,7 @@ entry:
declare i32 @puts(i8* nocapture) nounwind
declare void @SomeOtherFunction(...)
+
+@var = global i32 0
+@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @var to i8*)], section "llvm.metadata"
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* null, i8* null }]
diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg
index 88262fb..7d12f7a 100644
--- a/test/Object/Mips/lit.local.cfg
+++ b/test/Object/Mips/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Mips' in targets:
+if not 'Mips' in config.root.targets:
config.unsupported = True
diff --git a/test/Object/X86/archive-ir-asm.ll b/test/Object/X86/archive-ir-asm.ll
new file mode 100644
index 0000000..560ac17
--- /dev/null
+++ b/test/Object/X86/archive-ir-asm.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as %s -o=%t1
+; RUN: rm -f %t2
+; RUN: llvm-ar rcs %t2 %t1
+; RUN: llvm-nm -M %t2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".global global_asm_sym"
+module asm "global_asm_sym:"
+module asm "local_asm_sym:"
+module asm ".long undef_asm_sym"
+
+; CHECK: Archive map
+; CHECK-NEXT: global_asm_sym in archive-ir-asm.ll
+
+; CHECK: archive-ir-asm.ll
+; CHECK-NEXT: T global_asm_sym
+; CHECK-NEXT: t local_asm_sym
+; CHECK-NEXT: U undef_asm_sym
diff --git a/test/Object/X86/lit.local.cfg b/test/Object/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Object/X86/lit.local.cfg
+++ b/test/Object/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Object/nm-ir.ll b/test/Object/X86/nm-ir.ll
index ddf4125..6bb7e23 100644
--- a/test/Object/nm-ir.ll
+++ b/test/Object/X86/nm-ir.ll
@@ -10,6 +10,17 @@
; CHECK-NEXT: d g2
; CHECK-NEXT: C g3
; CHECK-NOT: g4
+; CHECK-NEXT: T global_asm_sym
+; CHECK-NEXT: t local_asm_sym
; CHECK-NEXT: U undef_asm_sym
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".global global_asm_sym"
+module asm "global_asm_sym:"
+module asm "local_asm_sym:"
+module asm ".long undef_asm_sym"
@g1 = global i32 42
@g2 = internal global i32 42
diff --git a/test/Object/archive-long-index.test b/test/Object/archive-long-index.test
index f2f4df6..6feb69e 100644
--- a/test/Object/archive-long-index.test
+++ b/test/Object/archive-long-index.test
@@ -1,7 +1,7 @@
#
# Check that the index appears properly in the output file
#
-RUN: llvm-nm -s %p/Inputs/liblong_filenames.a | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm -M %p/Inputs/liblong_filenames.a | FileCheck -check-prefix=CHECKIDX %s
CHECKIDX: Archive map
CHECKIDX: abcdefghijklmnopqrstuvwxyz12345678 in 1.o
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 88c9c98..0899828 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test
@@ -1,6 +1,6 @@
RUN: rm -f %t.a
RUN: llvm-ar rcs %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
-RUN: llvm-nm -s %t.a | FileCheck %s
+RUN: llvm-nm -M %t.a | FileCheck %s
CHECK: Archive map
CHECK-NEXT: main in trivial-object-test.elf-x86-64
@@ -12,24 +12,25 @@ CHECK: trivial-object-test.elf-x86-64:
CHECK-NEXT: U SomeOtherFunction
CHECK-NEXT: 0000000000000000 T main
CHECK-NEXT: U puts
-CHECK-NEXT: trivial-object-test2.elf-x86-64:
+
+CHECK: trivial-object-test2.elf-x86-64:
CHECK-NEXT: 0000000000000000 t bar
CHECK-NEXT: 0000000000000006 T foo
CHECK-NEXT: 0000000000000016 T main
RUN: rm -f %t.a
RUN: llvm-ar rcS %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
-RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=NOMAP
+RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=NOMAP
NOMAP-NOT: Archive map
RUN: llvm-ar s %t.a
-RUN: llvm-nm -s %t.a | FileCheck %s
+RUN: llvm-nm -M %t.a | FileCheck %s
check that the archive does have a corrupt symbol table.
RUN: rm -f %t.a
RUN: cp %p/Inputs/archive-test.a-corrupt-symbol-table %t.a
-RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=CORRUPT
+RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=CORRUPT
CORRUPT: Archive map
CORRUPT-NEXT: mbin in trivial-object-test.elf-x86-64
@@ -40,20 +41,27 @@ CORRUPT: trivial-object-test.elf-x86-64:
CORRUPT-NEXT: U SomeOtherFunction
CORRUPT-NEXT: 0000000000000000 T main
CORRUPT-NEXT: U puts
-CORRUPT-NEXT: trivial-object-test2.elf-x86-64:
+
+CORRUPT: trivial-object-test2.elf-x86-64:
CORRUPT-NEXT: 0000000000000000 t bar
CORRUPT-NEXT: 0000000000000006 T foo
CORRUPT-NEXT: 0000000000000016 T main
check that we *don't* update the symbol table.
RUN: llvm-ar s %t.a
-RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=CORRUPT
+RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=CORRUPT
repeat the test with llvm-ranlib
RUN: rm -f %t.a
RUN: llvm-ar rcS %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
-RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=NOMAP
+RUN: llvm-nm -M %t.a | FileCheck %s --check-prefix=NOMAP
RUN: llvm-ranlib %t.a
-RUN: llvm-nm -s %t.a | FileCheck %s
+RUN: llvm-nm -M %t.a | FileCheck %s
+
+RUN: llvm-nm -M %p/Inputs/macho-archive-x86_64.a | FileCheck %s --check-prefix=BSD-MachO
+
+BSD-MachO: Archive map
+BSD-MachO: _bar in bar.o
+BSD-MachO: _foo in foo.o
diff --git a/test/Object/coff-archive-short.test b/test/Object/coff-archive-short.test
index fa531b3..2aee956 100644
--- a/test/Object/coff-archive-short.test
+++ b/test/Object/coff-archive-short.test
@@ -5,7 +5,7 @@
# than 15 characters, thus, unlike coff_archive.lib, it has no string
# table as the third member.
#
-RUN: llvm-nm --numeric-sort -s %p/Inputs/coff_archive_short.lib | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm --numeric-sort -M %p/Inputs/coff_archive_short.lib | FileCheck -check-prefix=CHECKIDX %s
CHECKIDX: Archive map
CHECKIDX: _shortfn1 in short1.obj
diff --git a/test/Object/coff-archive.test b/test/Object/coff-archive.test
index 768fe1c..3b0aa0c 100644
--- a/test/Object/coff-archive.test
+++ b/test/Object/coff-archive.test
@@ -1,7 +1,7 @@
#
# Check that the index appears properly in the output file
#
-RUN: llvm-nm --numeric-sort -s %p/Inputs/coff_archive.lib | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm --numeric-sort -M %p/Inputs/coff_archive.lib | FileCheck -check-prefix=CHECKIDX %s
CHECKIDX: Archive map
CHECKIDX: ??0invalid_argument@std@@QAE@PBD@Z in Debug\mymath.obj
diff --git a/test/Object/directory.ll b/test/Object/directory.ll
index 48eefcb..c4b0bbf 100644
--- a/test/Object/directory.ll
+++ b/test/Object/directory.ll
@@ -1,6 +1,6 @@
;RUN: rm -f %T/test.a
;RUN: not llvm-ar r %T/test.a . 2>&1 | FileCheck %s
-;CHECK: .: Is a directory
+;CHECK: .: {{I|i}}s a directory
;RUN: rm -f %T/test.a
;RUN: touch %T/a-very-long-file-name
diff --git a/test/Object/mangle-ir.ll b/test/Object/mangle-ir.ll
index 725d788..5b3cd09 100644
--- a/test/Object/mangle-ir.ll
+++ b/test/Object/mangle-ir.ll
@@ -2,7 +2,13 @@
target datalayout = "m:o"
+; CHECK-NOT: memcpy
; CHECK: T _f
+; CHECK-NOT: memcpy
+
define void @f() {
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* null, i64 0, i32 1, i1 false)
ret void
}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/Object/nm-archive.test b/test/Object/nm-archive.test
index fbbf051..7dbc22a 100644
--- a/test/Object/nm-archive.test
+++ b/test/Object/nm-archive.test
@@ -18,6 +18,7 @@ RUN: llvm-nm %t2 | FileCheck %s -check-prefix BITCODE
BITCODE: U SomeOtherFunction
BITCODE-NEXT: T main
BITCODE-NEXT: U puts
+BITCODE-NEXT: D var
Test that we don't error on an archive with no symtab.
@@ -29,7 +30,13 @@ RUN: llvm-nm %p/Inputs/archive-test.a-gnu-minimal
And don't crash when asked to print a non-existing symtab.
-RUN: llvm-nm -s %p/Inputs/archive-test.a-gnu-minimal
+RUN: llvm-nm -M %p/Inputs/archive-test.a-gnu-minimal
Don't reject an empty archive.
RUN: llvm-nm %p/Inputs/archive-test.a-empty
+
+This archive has an unaligned member and a member with an unknown format.
+GNU AR is able to parse the unaligned member and warns about the member with
+the unknown format. We should probably simply warn on both. For now just check
+that we don't produce an error.
+RUN: llvm-nm %p/Inputs/corrupt-archive.a
diff --git a/test/Object/nm-darwin-m.test b/test/Object/nm-darwin-m.test
new file mode 100644
index 0000000..5bb19dc
--- /dev/null
+++ b/test/Object/nm-darwin-m.test
@@ -0,0 +1,53 @@
+RUN: llvm-nm -format darwin %p/Inputs/darwin-m-test1.mach0-armv7 \
+RUN: | FileCheck %s -check-prefix test1
+RUN: llvm-nm -format darwin %p/Inputs/darwin-m-test2.macho-i386 \
+RUN: | FileCheck %s -check-prefix test2
+RUN: llvm-nm -m %p/Inputs/darwin-m-test3.macho-x86-64 \
+RUN: | FileCheck %s -check-prefix test3
+
+# This is testing that the various bits in the n_desc field are correct
+test1: 00000001 (absolute) non-external _a
+test1: 00000008 (common) (alignment 2^2) external _c
+test1: 0000000a (__DATA,__data) non-external [no dead strip] _d
+test1: 00000004 (__TEXT,__text) non-external [alt entry] _e
+test1: 00000000 (__TEXT,__text) non-external [symbol resolver] _r
+test1: 00000008 (__TEXT,__text) non-external [Thumb] _t
+
+# This is testing that an N_INDR symbol gets its alias name, shown as "(for ...)"
+test2: (undefined) external __i
+test2: (indirect) external _i (for __i)
+
+# This test uses darwin-m-test3.macho-x86-64, which is linked with
+# dylibs that have the following set of -install_names:
+# Foo.framework/Foo
+# /System/Library/Frameworks/FooPath.framework/FooPath
+# FooSuffix.framework/FooSuffix_debug
+# /System/Library/Frameworks/FooPathSuffix.framework/FooPathSuffix_profile
+# FooVers.framework/Versions/A/FooVers
+# /System/Library/Frameworks/FooPathVers.framework/Versions/B/FooPathVers
+# libx.dylib
+# libxSuffix_profile.dylib
+# /usr/local/lib/libxPathSuffix_debug.dylib
+# libATS.A_profile.dylib
+# /usr/lib/libPathATS.A_profile.dylib
+# QT.A.qtx
+# /lib/QTPath.qtx
+# /usr/lib/libSystem.B.dylib
+# to test that MachOObjectFile::guessLibraryShortName() correctly parses
+# them into their short names.
+test3: 0000000100000000 (__TEXT,__text) [referenced dynamically] external __mh_execute_header
+test3: (undefined) external _atsPathVersSuffix (from libPathATS)
+test3: (undefined) external _atsVersSuffix (from libATS)
+test3: (undefined) external _foo (from Foo)
+test3: (undefined) external _fooPath (from FooPath)
+test3: (undefined) external _fooPathSuffix (from FooPathSuffix)
+test3: (undefined) external _fooPathVers (from FooPathVers)
+test3: (undefined) external _fooSuffix (from FooSuffix)
+test3: (undefined) external _fooVers (from FooVers)
+test3: 0000000100000e60 (__TEXT,__text) external _main
+test3: (undefined) external _qt (from QT)
+test3: (undefined) external _qtPath (from QTPath)
+test3: (undefined) external _x (from libx)
+test3: (undefined) external _xPathSuffix (from libxPathSuffix)
+test3: (undefined) external _xSuffix (from libxSuffix)
+test3: (undefined) external dyld_stub_binder (from libSystem)
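
The "(from ...)" suffixes in test3 are the short names guessed from the install names listed in the comment. A rough re-implementation of the rules those expectations imply; the real logic lives in MachOObjectFile::guessLibraryShortName() and certainly handles more cases than this sketch:

```python
import re

# Sketch: install_name -> short library name, per the test3 pairs above.
def guess_short_name(name):
    # Frameworks: Foo.framework/Foo, FooVers.framework/Versions/A/FooVers
    m = re.search(r'([^/]+)\.framework/(Versions/[^/]+/)?[^/]+$', name)
    if m:
        return m.group(1)
    base = name.rsplit('/', 1)[-1]
    base = re.sub(r'\.(dylib|qtx)$', '', base)     # drop the extension
    base = re.sub(r'_(debug|profile)$', '', base)  # drop build-style suffix
    base = re.sub(r'\.[A-Z]$', '', base)           # drop ".A"-style version
    return base

for name in ["Foo.framework/Foo",
             "FooVers.framework/Versions/A/FooVers",
             "libxSuffix_profile.dylib",
             "libATS.A_profile.dylib",
             "/lib/QTPath.qtx",
             "/usr/lib/libSystem.B.dylib"]:
    print(name, "->", guess_short_name(name))
# Foo, FooVers, libxSuffix, libATS, QTPath, libSystem
```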
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index 20ac662..656d6b0 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -1,6 +1,6 @@
-RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm \
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm - \
RUN: | FileCheck %s -check-prefix COFF
-RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm \
+RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm - \
RUN: | FileCheck %s -check-prefix COFF
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF
@@ -12,14 +12,27 @@ RUN: llvm-nm %p/Inputs/absolute.elf-x86-64 \
RUN: | FileCheck %s -check-prefix ABSOLUTE-ELF64
RUN: llvm-nm %p/Inputs/trivial-object-test.macho-i386 \
RUN: | FileCheck %s -check-prefix macho
+RUN: llvm-nm -U %p/Inputs/trivial-object-test.macho-i386 \
+RUN: | FileCheck %s -check-prefix macho-U
RUN: llvm-nm %p/Inputs/trivial-object-test.macho-x86-64 \
RUN: | FileCheck %s -check-prefix macho64
+RUN: llvm-nm %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN: | FileCheck %s -check-prefix macho-tdb
+RUN: llvm-nm -j %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN: | FileCheck %s -check-prefix macho-j
+RUN: llvm-nm -r %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN: | FileCheck %s -check-prefix macho-r
RUN: llvm-nm %p/Inputs/common.coff-i386 \
RUN: | FileCheck %s -check-prefix COFF-COMMON
RUN: llvm-nm %p/Inputs/relocatable-with-section-address.elf-x86-64 \
RUN: | FileCheck %s -check-prefix ELF-SEC-ADDR64
RUN: llvm-nm %p/Inputs/thumb-symbols.elf.arm \
RUN: | FileCheck %s -check-prefix ELF-THUMB
+RUN: mkdir -p %t
+RUN: cd %t
+RUN: cp %p/Inputs/trivial-object-test.macho-i386 a.out
+RUN: llvm-nm | FileCheck %s -check-prefix A-OUT
+REQUIRES: shell
COFF: 00000000 d .data
@@ -59,11 +72,32 @@ macho: U _SomeOtherFunction
macho: 00000000 T _main
macho: U _puts
+macho-U-NOT: U _SomeOtherFunction
+macho-U: 00000000 T _main
+macho-U-NOT: U _puts
+
macho64: 0000000000000028 s L_.str
macho64: U _SomeOtherFunction
macho64: 0000000000000000 T _main
macho64: U _puts
+macho-tdb: 0000000000000030 s EH_frame0
+macho-tdb: 0000000000000070 b _b
+macho-tdb: 000000000000000c D _d
+macho-tdb: 0000000000000000 T _t
+macho-tdb: 0000000000000048 S _t.eh
+
+macho-j: EH_frame0
+macho-j: _b
+macho-j: _d
+macho-j: _t
+macho-j: _t.eh
+
+macho-r: 0000000000000048 S _t.eh
+macho-r-NEXT: 0000000000000000 T _t
+macho-r-NEXT: 000000000000000c D _d
+macho-r-NEXT: 0000000000000070 b _b
+macho-r-NEXT: 0000000000000030 s EH_frame0
Test that nm uses addresses even with ELF .o files.
ELF-SEC-ADDR64: 0000000000000058 D a
@@ -76,3 +110,7 @@ ELF-SEC-ADDR64-NEXT: 0000000000000060 D p
Test that we drop the thumb bit only from function addresses.
ELF-THUMB: 00000000 t f
ELF-THUMB: 00000003 t g
+
+A-OUT: U _SomeOtherFunction
+A-OUT: 00000000 T _main
+A-OUT: U _puts
diff --git a/test/Object/nm-universal-binary.test b/test/Object/nm-universal-binary.test
index c20c733..889377b 100644
--- a/test/Object/nm-universal-binary.test
+++ b/test/Object/nm-universal-binary.test
@@ -1,19 +1,31 @@
-RUN: llvm-nm %p/Inputs/macho-universal.x86_64.i386 \
+RUN: llvm-nm -arch all %p/Inputs/macho-universal.x86_64.i386 \
RUN: | FileCheck %s -check-prefix CHECK-OBJ
-RUN: llvm-nm %p/Inputs/macho-universal-archive.x86_64.i386 \
+RUN: llvm-nm -arch x86_64 %p/Inputs/macho-universal.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix CHECK-OBJ-x86_64
+RUN: llvm-nm -arch all %p/Inputs/macho-universal-archive.x86_64.i386 \
RUN: | FileCheck %s -check-prefix CHECK-AR
+RUN: llvm-nm -arch i386 %p/Inputs/macho-universal-archive.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix CHECK-AR-i386
-CHECK-OBJ: macho-universal.x86_64.i386:x86_64
+CHECK-OBJ: macho-universal.x86_64.i386 (for architecture x86_64):
CHECK-OBJ: 0000000100000f60 T _main
-CHECK-OBJ: macho-universal.x86_64.i386:i386
+CHECK-OBJ: macho-universal.x86_64.i386 (for architecture i386):
CHECK-OBJ: 00001fa0 T _main
-CHECK-AR: macho-universal-archive.x86_64.i386:x86_64:hello.o:
+CHECK-OBJ-x86_64: 0000000100000000 T __mh_execute_header
+CHECK-OBJ-x86_64: 0000000100000f60 T _main
+CHECK-OBJ-x86_64: U dyld_stub_binder
+
+CHECK-AR: macho-universal-archive.x86_64.i386(hello.o) (for architecture x86_64):
CHECK-AR: 0000000000000068 s EH_frame0
CHECK-AR: 000000000000003b s L_.str
CHECK-AR: 0000000000000000 T _main
CHECK-AR: 0000000000000080 S _main.eh
CHECK-AR: U _printf
-CHECK-AR: macho-universal-archive.x86_64.i386:i386:foo.o:
-CHECK-AR: 00000008 S _bar
+CHECK-AR: macho-universal-archive.x86_64.i386(foo.o) (for architecture i386):
+CHECK-AR: 00000008 D _bar
CHECK-AR: 00000000 T _foo
+
+CHECK-AR-i386: macho-universal-archive.x86_64.i386(foo.o):
+CHECK-AR-i386: 00000008 D _bar
+CHECK-AR-i386: 00000000 T _foo
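
The new -arch handling walks the slices of a Mach-O universal (fat) file, whose header is just a big-endian table of per-architecture entries. A self-contained sketch of that layout; the two slices here are hand-made sample values, not a real binary:

```python
import struct

# Sketch: the fat header that "-arch all" iterates and "-arch i386" filters.
FAT_MAGIC = 0xCAFEBABE
CPU_X86_64, CPU_I386 = 0x01000007, 0x00000007

fat  = struct.pack(">II", FAT_MAGIC, 2)  # magic, nfat_arch
fat += struct.pack(">iiIII", CPU_X86_64, 3, 0x1000, 0x2000, 12)
fat += struct.pack(">iiIII", CPU_I386,   3, 0x3000, 0x1000, 12)

magic, nfat = struct.unpack(">II", fat[:8])
assert magic == FAT_MAGIC
for i in range(nfat):
    cputype, cpusub, offset, size, align = struct.unpack(
        ">iiIII", fat[8 + 20 * i : 28 + 20 * i])
    arch = {CPU_X86_64: "x86_64", CPU_I386: "i386"}[cputype]
    print(f"slice for architecture {arch}: offset {offset:#x}, size {size:#x}")
```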
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 1c15263..98b40d5 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -201,8 +201,8 @@ ELF-MIPSEL-NEXT: Content: 0000023C00004224E8FFBD271400BFAF1000B0AF21
ELF-MIPSEL-NEXT: - Name: .rel.text
ELF-MIPSEL-NEXT: Type: SHT_REL
ELF-MIPSEL-NEXT: Link: .symtab
-ELF-MIPSEL-NEXT: Info: .text
ELF-MIPSEL-NEXT: AddressAlign: 0x0000000000000004
+ELF-MIPSEL-NEXT: Info: .text
ELF-MIPSEL-NEXT: Relocations:
ELF-MIPSEL-NEXT: - Offset: 0
ELF-MIPSEL-NEXT: Symbol: _gp_disp
@@ -300,8 +300,8 @@ ELF-MIPS64EL-NEXT: Content: '00000000000000000000000000000000'
ELF-MIPS64EL-NEXT: - Name: .rela.data
ELF-MIPS64EL-NEXT: Type: SHT_RELA
ELF-MIPS64EL-NEXT: Link: .symtab
-ELF-MIPS64EL-NEXT: Info: .data
ELF-MIPS64EL-NEXT: AddressAlign: 0x0000000000000008
+ELF-MIPS64EL-NEXT: Info: .data
ELF-MIPS64EL-NEXT: Relocations:
ELF-MIPS64EL-NEXT: - Offset: 0
ELF-MIPS64EL-NEXT: Symbol: zed
@@ -370,8 +370,8 @@ ELF-X86-64-NEXT: - Name: .rela.text
ELF-X86-64-NEXT: Type: SHT_RELA
ELF-X86-64-NEXT: Address: 0x0000000000000038
ELF-X86-64-NEXT: Link: .symtab
-ELF-X86-64-NEXT: Info: .text
ELF-X86-64-NEXT: AddressAlign: 0x0000000000000008
+ELF-X86-64-NEXT: Info: .text
ELF-X86-64-NEXT: Relocations:
ELF-X86-64-NEXT: - Offset: 0x000000000000000D
ELF-X86-64-NEXT: Symbol: .rodata.str1.1
diff --git a/test/Object/objdump-no-sectionheaders.test b/test/Object/objdump-no-sectionheaders.test
new file mode 100644
index 0000000..5130100
--- /dev/null
+++ b/test/Object/objdump-no-sectionheaders.test
@@ -0,0 +1,6 @@
+; RUN: llvm-objdump -h %p/Inputs/no-sections.elf-x86-64 \
+; RUN: | FileCheck %s
+
+; CHECK: Sections:
+; CHECK: Idx Name Size Address Type
+; CHECK-NOT: {{.}}
diff --git a/test/Object/simple-archive.test b/test/Object/simple-archive.test
index 3e6760e..085a91e 100644
--- a/test/Object/simple-archive.test
+++ b/test/Object/simple-archive.test
@@ -1,7 +1,7 @@
#
# Check that the index appears properly in the output file
#
-RUN: llvm-nm -s %p/Inputs/libsimple_archive.a | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm -M %p/Inputs/libsimple_archive.a | FileCheck -check-prefix=CHECKIDX %s
CHECKIDX: Archive map
CHECKIDX: abcdefghijklmnopqrstuvwxyz12345678 in 1.o
diff --git a/test/Object/size-trivial-macho.test b/test/Object/size-trivial-macho.test
index 6ecdf5c..a6d3d1c 100644
--- a/test/Object/size-trivial-macho.test
+++ b/test/Object/size-trivial-macho.test
@@ -2,6 +2,22 @@ RUN: llvm-size -A %p/Inputs/macho-text-data-bss.macho-x86_64 \
RUN: | FileCheck %s -check-prefix A
RUN: llvm-size -B %p/Inputs/macho-text-data-bss.macho-x86_64 \
RUN: | FileCheck %s -check-prefix B
+RUN: llvm-size -format darwin %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN: | FileCheck %s -check-prefix m
+RUN: llvm-size %p/Inputs/macho-archive-x86_64.a \
+RUN: | FileCheck %s -check-prefix AR
+RUN: llvm-size -format darwin %p/Inputs/macho-archive-x86_64.a \
+RUN: | FileCheck %s -check-prefix mAR
+RUN: llvm-size -m -x -l %p/Inputs/hello-world.macho-x86_64 \
+RUN: | FileCheck %s -check-prefix mxl
+RUN: llvm-size -arch all %p/Inputs/macho-universal.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix u
+RUN: llvm-size -arch i386 %p/Inputs/macho-universal.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix u-i386
+RUN: llvm-size -arch all %p/Inputs/macho-universal-archive.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix uAR
+RUN: llvm-size -arch x86_64 %p/Inputs/macho-universal-archive.x86_64.i386 \
+RUN: | FileCheck %s -check-prefix uAR-x86_64
A: section size addr
A: __text 12 0
@@ -11,5 +27,63 @@ A: __compact_unwind 32 16
A: __eh_frame 64 48
A: Total 116
-B: text data bss dec hex filename
-B: 12 100 4 116 74
+B: __TEXT __DATA __OBJC others dec hex
+B: 76 8 0 32 116 74
+
+m: Segment : 116
+m: Section (__TEXT, __text): 12
+m: Section (__DATA, __data): 4
+m: Section (__DATA, __bss): 4
+m: Section (__LD, __compact_unwind): 32
+m: Section (__TEXT, __eh_frame): 64
+m: total 116
+m: total 116
+
+AR: __TEXT __DATA __OBJC others dec hex
+AR: 70 0 0 32 102 66 {{.*}}/macho-archive-x86_64.a(foo.o)
+AR: 0 4 0 0 4 4 {{.*}}/macho-archive-x86_64.a(bar.o)
+
+mAR: {{.*}}/macho-archive-x86_64.a(foo.o):
+mAR: Segment : 104
+mAR: Section (__TEXT, __text): 6
+mAR: Section (__LD, __compact_unwind): 32
+mAR: Section (__TEXT, __eh_frame): 64
+mAR: total 102
+mAR: total 104
+mAR: {{.*}}/macho-archive-x86_64.a(bar.o):
+mAR: Segment : 4
+mAR: Section (__TEXT, __text): 0
+mAR: Section (__DATA, __data): 4
+mAR: total 4
+mAR: total 4
+
+
+mxl: Segment __PAGEZERO: 0x100000000 (vmaddr 0x0 fileoff 0)
+mxl: Segment __TEXT: 0x1000 (vmaddr 0x100000000 fileoff 0)
+mxl: Section __text: 0x3b (addr 0x100000f30 offset 3888)
+mxl: Section __stubs: 0x6 (addr 0x100000f6c offset 3948)
+mxl: Section __stub_helper: 0x1a (addr 0x100000f74 offset 3956)
+mxl: Section __cstring: 0xd (addr 0x100000f8e offset 3982)
+mxl: Section __unwind_info: 0x48 (addr 0x100000f9b offset 3995)
+mxl: Section __eh_frame: 0x18 (addr 0x100000fe8 offset 4072)
+mxl: total 0xc8
+mxl: Segment __DATA: 0x1000 (vmaddr 0x100001000 fileoff 4096)
+mxl: Section __nl_symbol_ptr: 0x10 (addr 0x100001000 offset 4096)
+mxl: Section __la_symbol_ptr: 0x8 (addr 0x100001010 offset 4112)
+mxl: total 0x18
+mxl: Segment __LINKEDIT: 0x1000 (vmaddr 0x100002000 fileoff 8192)
+mxl: total 0x100003000
+
+u: __TEXT __DATA __OBJC others dec hex
+u: 4096 0 0 4294971392 4294975488 100002000 {{.*}}/macho-universal.x86_64.i386 (for architecture x86_64)
+u: 4096 0 0 8192 12288 3000 {{.*}}/macho-universal.x86_64.i386 (for architecture i386)
+
+u-i386: __TEXT __DATA __OBJC others dec hex
+u-i386: 4096 0 0 8192 12288 3000
+
+uAR: __TEXT __DATA __OBJC others dec hex
+uAR: 136 0 0 32 168 a8 {{.*}}/macho-universal-archive.x86_64.i386(hello.o) (for architecture x86_64)
+uAR: 5 4 0 0 9 9 {{.*}}/macho-universal-archive.x86_64.i386(foo.o) (for architecture i386)
+
+uAR-x86_64: __TEXT __DATA __OBJC others dec hex
+uAR-x86_64: 136 0 0 32 168 a8 {{.*}}/macho-universal-archive.x86_64.i386(hello.o)
diff --git a/test/Object/yaml2obj-coff-multi-doc.test b/test/Object/yaml2obj-coff-multi-doc.test
new file mode 100644
index 0000000..1cf7203
--- /dev/null
+++ b/test/Object/yaml2obj-coff-multi-doc.test
@@ -0,0 +1,91 @@
+# RUN: yaml2obj -format=coff -docnum=1 %s \
+# RUN: | llvm-readobj -symbols - | FileCheck -check-prefix=DOC1 %s
+# RUN: yaml2obj -format=coff -docnum=2 %s \
+# RUN: | llvm-readobj -symbols - | FileCheck -check-prefix=DOC2 %s
+# RUN: not yaml2obj -format=coff -docnum=3 %s 2>&1 \
+# RUN: | FileCheck -check-prefix=DOC3 %s
+
+# DOC1: Name: _sym1
+# DOC2: Name: _sym2
+# DOC3: yaml2obj: Cannot find the 3rd document
+
+---
+header:
+ Machine: IMAGE_FILE_MACHINE_I386
+ Characteristics: [ IMAGE_FILE_DEBUG_STRIPPED ]
+
+sections:
+ - Name: .text
+ Alignment: 16
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE,
+ IMAGE_SCN_MEM_READ ]
+ SectionData: "00000000"
+
+symbols:
+ - Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_STATIC
+ SectionDefinition:
+ Length: 36
+ NumberOfRelocations: 3
+ NumberOfLinenumbers: 0
+ CheckSum: 0
+ Number: 1
+
+ - Name: _main
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+ - Name: _sym1
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+---
+header:
+ Machine: IMAGE_FILE_MACHINE_I386
+ Characteristics: [ IMAGE_FILE_DEBUG_STRIPPED ]
+
+sections:
+ - Name: .text
+ Alignment: 16
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE,
+ IMAGE_SCN_MEM_READ ]
+ SectionData: "00000000"
+
+symbols:
+ - Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_STATIC
+ SectionDefinition:
+ Length: 36
+ NumberOfRelocations: 3
+ NumberOfLinenumbers: 0
+ CheckSum: 0
+ Number: 1
+
+ - Name: _main
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+
+ - Name: _sym2
+ Value: 0
+ SectionNumber: 0
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL
+...
diff --git a/test/Object/yaml2obj-elf-multi-doc.test b/test/Object/yaml2obj-elf-multi-doc.test
new file mode 100644
index 0000000..c51f803
--- /dev/null
+++ b/test/Object/yaml2obj-elf-multi-doc.test
@@ -0,0 +1,56 @@
+# RUN: yaml2obj -format=elf -docnum=1 %s \
+# RUN: | llvm-readobj -symbols - | FileCheck -check-prefix=DOC1 %s
+# RUN: yaml2obj -format=elf -docnum=2 %s \
+# RUN: | llvm-readobj -symbols - | FileCheck -check-prefix=DOC2 %s
+# RUN: not yaml2obj -format=elf -docnum=3 %s 2>&1 \
+# RUN: | FileCheck -check-prefix=DOC3 %s
+
+# DOC1: Name: T1 (1)
+# DOC2: Name: T2 (1)
+# DOC3: yaml2obj: Cannot find the 3rd document
+
+--- !ELF
+FileHeader: !FileHeader
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_MIPS
+ Flags: [EF_MIPS_CPIC]
+
+Sections:
+- Name: .text
+ Type: SHT_PROGBITS
+ Content: "0000000000000000"
+ AddressAlign: 16
+ Flags: [SHF_EXECINSTR, SHF_ALLOC]
+
+Symbols:
+ Global:
+ - Name: T1
+ Section: .text
+ Type: STT_FUNC
+ Value: 0x0
+ Size: 8
+
+--- !ELF
+FileHeader: !FileHeader
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_MIPS
+
+Sections:
+- Name: .text
+ Type: SHT_PROGBITS
+ Content: "00000000"
+ AddressAlign: 16
+ Flags: [SHF_EXECINSTR, SHF_ALLOC]
+
+Symbols:
+ Global:
+ - Name: T2
+ Section: .text
+ Type: STT_FUNC
+ Value: 0x0
+ Size: 4
+...
diff --git a/test/Object/yaml2obj-elf-symbol-visibility.yaml b/test/Object/yaml2obj-elf-symbol-visibility.yaml
new file mode 100644
index 0000000..113354a
--- /dev/null
+++ b/test/Object/yaml2obj-elf-symbol-visibility.yaml
@@ -0,0 +1,126 @@
+# RUN: yaml2obj -format=elf %s | llvm-readobj -symbols - | \
+# RUN: FileCheck --check-prefix OBJ %s
+# RUN: yaml2obj -format=elf %s | obj2yaml - | FileCheck --check-prefix YAML %s
+
+# OBJ: Symbol {
+# OBJ: Name: default1 (36)
+# OBJ-NEXT: Value: 0x0
+# OBJ-NEXT: Size: 4
+# OBJ-NEXT: Binding: Global (0x1)
+# OBJ-NEXT: Type: Object (0x1)
+# OBJ-NEXT: Other: 0
+# OBJ-NEXT: Section: .data (0x1)
+# OBJ-NEXT: }
+# OBJ-NEXT: Symbol {
+# OBJ-NEXT: Name: default2 (27)
+# OBJ-NEXT: Value: 0x4
+# OBJ-NEXT: Size: 4
+# OBJ-NEXT: Binding: Global (0x1)
+# OBJ-NEXT: Type: Object (0x1)
+# OBJ-NEXT: Other: 0
+# OBJ-NEXT: Section: .data (0x1)
+# OBJ-NEXT: }
+# OBJ-NEXT: Symbol {
+# OBJ-NEXT: Name: internal (8)
+# OBJ-NEXT: Value: 0x8
+# OBJ-NEXT: Size: 4
+# OBJ-NEXT: Binding: Global (0x1)
+# OBJ-NEXT: Type: Object (0x1)
+# OBJ-NEXT: Other: 1
+# OBJ-NEXT: Section: .data (0x1)
+# OBJ-NEXT: }
+# OBJ-NEXT: Symbol {
+# OBJ-NEXT: Name: hidden (1)
+# OBJ-NEXT: Value: 0xC
+# OBJ-NEXT: Size: 4
+# OBJ-NEXT: Binding: Global (0x1)
+# OBJ-NEXT: Type: Object (0x1)
+# OBJ-NEXT: Other: 2
+# OBJ-NEXT: Section: .data (0x1)
+# OBJ-NEXT: }
+# OBJ-NEXT: Symbol {
+# OBJ-NEXT: Name: protected (17)
+# OBJ-NEXT: Value: 0x10
+# OBJ-NEXT: Size: 4
+# OBJ-NEXT: Binding: Global (0x1)
+# OBJ-NEXT: Type: Object (0x1)
+# OBJ-NEXT: Other: 3
+# OBJ-NEXT: Section: .data (0x1)
+# OBJ-NEXT: }
+
+# YAML: Symbols:
+# YAML-NEXT: Global:
+# YAML-NEXT: - Name: default1
+# YAML-NEXT: Type: STT_OBJECT
+# YAML-NEXT: Section: .data
+# YAML-NEXT: Size: 0x0000000000000004
+# YAML-NEXT: - Name: default2
+# YAML-NEXT: Type: STT_OBJECT
+# YAML-NEXT: Section: .data
+# YAML-NEXT: Value: 0x0000000000000004
+# YAML-NEXT: Size: 0x0000000000000004
+# YAML-NEXT: - Name: internal
+# YAML-NEXT: Type: STT_OBJECT
+# YAML-NEXT: Section: .data
+# YAML-NEXT: Value: 0x0000000000000008
+# YAML-NEXT: Size: 0x0000000000000004
+# YAML-NEXT: Visibility: STV_INTERNAL
+# YAML-NEXT: - Name: hidden
+# YAML-NEXT: Type: STT_OBJECT
+# YAML-NEXT: Section: .data
+# YAML-NEXT: Value: 0x000000000000000C
+# YAML-NEXT: Size: 0x0000000000000004
+# YAML-NEXT: Visibility: STV_HIDDEN
+# YAML-NEXT: - Name: protected
+# YAML-NEXT: Type: STT_OBJECT
+# YAML-NEXT: Section: .data
+# YAML-NEXT: Value: 0x0000000000000010
+# YAML-NEXT: Size: 0x0000000000000004
+# YAML-NEXT: Visibility: STV_PROTECTED
+
+---
+FileHeader:
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_MIPS
+ Flags: [ EF_MIPS_ABI_O32, EF_MIPS_ARCH_32 ]
+
+Sections:
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_WRITE ]
+ AddressAlign: 0x04
+ Size: 0x14
+
+Symbols:
+ Global:
+ - Name: default1
+ Type: STT_OBJECT
+ Visibility: STV_DEFAULT
+ Section: .data
+ Value: 0x00
+ Size: 0x04
+ - Name: default2
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x04
+ Size: 0x04
+ - Name: internal
+ Type: STT_OBJECT
+ Visibility: STV_INTERNAL
+ Section: .data
+ Value: 0x08
+ Size: 0x04
+ - Name: hidden
+ Type: STT_OBJECT
+ Visibility: STV_HIDDEN
+ Section: .data
+ Value: 0x0C
+ Size: 0x04
+ - Name: protected
+ Type: STT_OBJECT
+ Visibility: STV_PROTECTED
+ Section: .data
+ Value: 0x10
+ Size: 0x04
diff --git a/test/Other/X86/lit.local.cfg b/test/Other/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Other/X86/lit.local.cfg
+++ b/test/Other/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Other/constant-fold-gep.ll b/test/Other/constant-fold-gep.ll
index aed4145..3874898 100644
--- a/test/Other/constant-fold-gep.ll
+++ b/test/Other/constant-fold-gep.ll
@@ -457,7 +457,7 @@ define i8* @different_addrspace() nounwind noinline {
%p = getelementptr inbounds i8* addrspacecast ([4 x i8] addrspace(12)* @p12 to i8*),
i32 2
ret i8* %p
-; OPT: ret i8* getelementptr (i8* addrspacecast ([4 x i8] addrspace(12)* @p12 to i8*), i32 2)
+; OPT: ret i8* getelementptr (i8* addrspacecast (i8 addrspace(12)* getelementptr inbounds ([4 x i8] addrspace(12)* @p12, i32 0, i32 0) to i8*), i32 2)
}
define i8* @same_addrspace() nounwind noinline {
@@ -467,4 +467,21 @@ define i8* @same_addrspace() nounwind noinline {
; OPT: ret i8* getelementptr inbounds ([4 x i8]* @p0, i32 0, i32 2)
}
+@gv1 = internal global i32 1
+@gv2 = internal global [1 x i32] [ i32 2 ]
+@gv3 = internal global [1 x i32] [ i32 2 ]
+
+; Handled by the target-independent constant folder
+define i1 @gv_gep_vs_gv() {
+ ret i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @gv2, i32 0, i32 0), i32* @gv1)
+}
+; PLAIN: gv_gep_vs_gv
+; PLAIN: ret i1 false
+
+define i1 @gv_gep_vs_gv_gep() {
+ ret i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @gv2, i32 0, i32 0), i32* getelementptr inbounds ([1 x i32]* @gv3, i32 0, i32 0))
+}
+; PLAIN: gv_gep_vs_gv_gep
+; PLAIN: ret i1 false
+
; CHECK: attributes #0 = { nounwind }
diff --git a/test/Other/llvm-nm-without-aliases.ll b/test/Other/llvm-nm-without-aliases.ll
index 9d9408c..6ef72c7 100644
--- a/test/Other/llvm-nm-without-aliases.ll
+++ b/test/Other/llvm-nm-without-aliases.ll
@@ -1,6 +1,6 @@
; RUN: llvm-as < %s > %t
-; RUN: llvm-nm -without-aliases < %t | FileCheck %s
-; RUN: llvm-nm < %t | FileCheck --check-prefix=WITH %s
+; RUN: llvm-nm -without-aliases - < %t | FileCheck %s
+; RUN: llvm-nm - < %t | FileCheck --check-prefix=WITH %s
; CHECK-NOT: T a0bar
; CHECK-NOT: T a0foo
diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td
index 4aacc74..25208fa 100644
--- a/test/TableGen/ForeachLoop.td
+++ b/test/TableGen/ForeachLoop.td
@@ -51,8 +51,10 @@ foreach i = [0, 1, 2, 3, 4, 5, 6, 7] in
// CHECK: string Name = "R7";
// CHECK: int Index = 7;
-foreach i = {0-3,9-7} in
+foreach i = {0-3,9-7} in {
def S#i : Register<"Q"#i, i>;
+ def : Register<"T"#i, i>;
+}
// CHECK: def S0
// CHECK: def S1
@@ -61,3 +63,25 @@ foreach i = {0-3,9-7} in
// CHECK: def S7
// CHECK: def S8
// CHECK: def S9
+
+// CHECK: def
+// CHECK: string Name = "T0";
+
+// CHECK: def
+// CHECK: string Name = "T1";
+
+// CHECK: def
+// CHECK: string Name = "T2";
+
+// CHECK: def
+// CHECK: string Name = "T3";
+
+// CHECK: def
+// CHECK: string Name = "T9";
+
+// CHECK: def
+// CHECK: string Name = "T8";
+
+// CHECK: def
+// CHECK: string Name = "T7";
+
diff --git a/test/TableGen/if-empty-list-arg.td b/test/TableGen/if-empty-list-arg.td
new file mode 100644
index 0000000..39edf58
--- /dev/null
+++ b/test/TableGen/if-empty-list-arg.td
@@ -0,0 +1,7 @@
+// RUN: llvm-tblgen %s
+// XFAIL: vg_leak
+
+class C<bit cond> {
+ list<int> X = !if(cond, [1, 2, 3], []);
+ list<int> Y = !if(cond, [], [4, 5, 6]);
+}
diff --git a/test/Transforms/ArgumentPromotion/basictest.ll b/test/Transforms/ArgumentPromotion/basictest.ll
index d3d21fc..8f78b98 100644
--- a/test/Transforms/ArgumentPromotion/basictest.ll
+++ b/test/Transforms/ArgumentPromotion/basictest.ll
@@ -1,23 +1,29 @@
-; RUN: opt < %s -basicaa -argpromotion -mem2reg -S | not grep alloca
+; RUN: opt < %s -basicaa -argpromotion -mem2reg -S | FileCheck %s
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
define internal i32 @test(i32* %X, i32* %Y) {
- %A = load i32* %X ; <i32> [#uses=1]
- %B = load i32* %Y ; <i32> [#uses=1]
- %C = add i32 %A, %B ; <i32> [#uses=1]
- ret i32 %C
+; CHECK-LABEL: define internal i32 @test(i32 %X.val, i32 %Y.val)
+ %A = load i32* %X
+ %B = load i32* %Y
+ %C = add i32 %A, %B
+ ret i32 %C
}
define internal i32 @caller(i32* %B) {
- %A = alloca i32 ; <i32*> [#uses=2]
- store i32 1, i32* %A
- %C = call i32 @test( i32* %A, i32* %B ) ; <i32> [#uses=1]
- ret i32 %C
+; CHECK-LABEL: define internal i32 @caller(i32 %B.val1)
+ %A = alloca i32
+ store i32 1, i32* %A
+ %C = call i32 @test(i32* %A, i32* %B)
+; CHECK: call i32 @test(i32 1, i32 %B.val1)
+ ret i32 %C
}
define i32 @callercaller() {
- %B = alloca i32 ; <i32*> [#uses=2]
- store i32 2, i32* %B
- %X = call i32 @caller( i32* %B ) ; <i32> [#uses=1]
- ret i32 %X
+; CHECK-LABEL: define i32 @callercaller()
+ %B = alloca i32
+ store i32 2, i32* %B
+ %X = call i32 @caller(i32* %B)
+; CHECK: call i32 @caller(i32 2)
+ ret i32 %X
}
diff --git a/test/Transforms/ArgumentPromotion/byval-2.ll b/test/Transforms/ArgumentPromotion/byval-2.ll
index 368c689..b412f5e 100644
--- a/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -1,26 +1,31 @@
-; RUN: opt < %s -argpromotion -S | grep -F "i32* byval" | count 2
-; Argpromote + scalarrepl should change this to passing the two integers by value.
+; RUN: opt < %s -argpromotion -S | FileCheck %s
- %struct.ss = type { i32, i64 }
+; Arg promotion eliminates the struct argument.
+; FIXME: Should it eliminate the i32* argument?
+
+%struct.ss = type { i32, i64 }
define internal void @f(%struct.ss* byval %b, i32* byval %X) nounwind {
+; CHECK-LABEL: define internal void @f(i32 %b.0, i64 %b.1, i32* byval %X)
entry:
- %tmp = getelementptr %struct.ss* %b, i32 0, i32 0
- %tmp1 = load i32* %tmp, align 4
- %tmp2 = add i32 %tmp1, 1
- store i32 %tmp2, i32* %tmp, align 4
+ %tmp = getelementptr %struct.ss* %b, i32 0, i32 0
+ %tmp1 = load i32* %tmp, align 4
+ %tmp2 = add i32 %tmp1, 1
+ store i32 %tmp2, i32* %tmp, align 4
- store i32 0, i32* %X
- ret void
+ store i32 0, i32* %X
+ ret void
}
define i32 @test(i32* %X) {
+; CHECK-LABEL: define i32 @test
entry:
- %S = alloca %struct.ss ; <%struct.ss*> [#uses=4]
- %tmp1 = getelementptr %struct.ss* %S, i32 0, i32 0 ; <i32*> [#uses=1]
- store i32 1, i32* %tmp1, align 8
- %tmp4 = getelementptr %struct.ss* %S, i32 0, i32 1 ; <i64*> [#uses=1]
- store i64 2, i64* %tmp4, align 4
- call void @f( %struct.ss* byval %S, i32* byval %X)
- ret i32 0
+ %S = alloca %struct.ss
+ %tmp1 = getelementptr %struct.ss* %S, i32 0, i32 0
+ store i32 1, i32* %tmp1, align 8
+ %tmp4 = getelementptr %struct.ss* %S, i32 0, i32 1
+ store i64 2, i64* %tmp4, align 4
+ call void @f( %struct.ss* byval %S, i32* byval %X)
+; CHECK: call void @f(i32 %{{.*}}, i64 %{{.*}}, i32* byval %{{.*}})
+ ret i32 0
}
diff --git a/test/Transforms/ArgumentPromotion/byval.ll b/test/Transforms/ArgumentPromotion/byval.ll
index 44b26fc..27305e9 100644
--- a/test/Transforms/ArgumentPromotion/byval.ll
+++ b/test/Transforms/ArgumentPromotion/byval.ll
@@ -1,25 +1,28 @@
-; RUN: opt < %s -argpromotion -scalarrepl -S | not grep load
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+
target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
-; Argpromote + scalarrepl should change this to passing the two integers by value.
- %struct.ss = type { i32, i64 }
+%struct.ss = type { i32, i64 }
define internal void @f(%struct.ss* byval %b) nounwind {
+; CHECK-LABEL: define internal void @f(i32 %b.0, i64 %b.1)
entry:
- %tmp = getelementptr %struct.ss* %b, i32 0, i32 0 ; <i32*> [#uses=2]
- %tmp1 = load i32* %tmp, align 4 ; <i32> [#uses=1]
- %tmp2 = add i32 %tmp1, 1 ; <i32> [#uses=1]
- store i32 %tmp2, i32* %tmp, align 4
- ret void
+ %tmp = getelementptr %struct.ss* %b, i32 0, i32 0 ; <i32*> [#uses=2]
+ %tmp1 = load i32* %tmp, align 4 ; <i32> [#uses=1]
+ %tmp2 = add i32 %tmp1, 1 ; <i32> [#uses=1]
+ store i32 %tmp2, i32* %tmp, align 4
+ ret void
}
define i32 @main() nounwind {
+; CHECK-LABEL: define i32 @main
entry:
- %S = alloca %struct.ss ; <%struct.ss*> [#uses=4]
- %tmp1 = getelementptr %struct.ss* %S, i32 0, i32 0 ; <i32*> [#uses=1]
- store i32 1, i32* %tmp1, align 8
- %tmp4 = getelementptr %struct.ss* %S, i32 0, i32 1 ; <i64*> [#uses=1]
- store i64 2, i64* %tmp4, align 4
- call void @f( %struct.ss* byval %S ) nounwind
- ret i32 0
+ %S = alloca %struct.ss ; <%struct.ss*> [#uses=4]
+ %tmp1 = getelementptr %struct.ss* %S, i32 0, i32 0 ; <i32*> [#uses=1]
+ store i32 1, i32* %tmp1, align 8
+ %tmp4 = getelementptr %struct.ss* %S, i32 0, i32 1 ; <i64*> [#uses=1]
+ store i64 2, i64* %tmp4, align 4
+ call void @f( %struct.ss* byval %S ) nounwind
+; CHECK: call void @f(i32 %{{.*}}, i64 %{{.*}})
+ ret i32 0
}
diff --git a/test/Transforms/ArgumentPromotion/dbg.ll b/test/Transforms/ArgumentPromotion/dbg.ll
new file mode 100644
index 0000000..70503af
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/dbg.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; CHECK: call void @test(), !dbg [[DBG_LOC:![0-9]]]
+; CHECK: [[TEST_FN:.*]] = {{.*}} void ()* @test
+; CHECK: [[DBG_LOC]] = metadata !{i32 8, i32 0, metadata [[TEST_FN]], null}
+
+define internal void @test(i32* %X) {
+ ret void
+}
+
+define void @caller() {
+ call void @test(i32* null), !dbg !1
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!3}
+
+!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!1 = metadata !{i32 8, i32 0, metadata !2, null}
+!2 = metadata !{i32 786478, null, null, metadata !"test", metadata !"test", metadata !"", i32 3, null, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @test, null, null, null, i32 3}
+!3 = metadata !{i32 786449, null, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, null, null, metadata !4, null, null, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr20038/reduce/<stdin>] [DW_LANG_C_plus_plus]
+!4 = metadata !{metadata !2}
diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll
index ac9fc1f..6a93016 100644
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll
+++ b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll
@@ -80,8 +80,8 @@ define i16 @test_atomic_nand_i16(i16* %ptr, i16 %nandend) {
; CHECK: [[LOOP]]:
; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
-; CHECK: [[NEWVAL_TMP:%.*]] = xor i16 %nandend, -1
-; CHECK: [[NEWVAL:%.*]] = and i16 [[OLDVAL]], [[NEWVAL_TMP]]
+; CHECK: [[NEWVAL_TMP:%.*]] = and i16 [[OLDVAL]], %nandend
+; CHECK: [[NEWVAL:%.*]] = xor i16 [[NEWVAL_TMP]], -1
; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
@@ -229,22 +229,28 @@ define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i8
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[BARRIER:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
+; CHECK: [[SUCCESS_BB]]:
; CHECK: fence seq_cst
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK: fence seq_cst
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i8 [[OLDVAL]]
- %old = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+ %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+ %old = extractvalue { i8, i1 } %pairold, 0
ret i8 %old
}
@@ -257,22 +263,28 @@ define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newv
; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i16
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
+; CHECK: [[SUCCESS_BB]]:
; CHECK: fence seq_cst
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i16 [[OLDVAL]]
- %old = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+ %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+ %old = extractvalue { i16, i1 } %pairold, 0
ret i16 %old
}
@@ -284,21 +296,27 @@ define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newva
; CHECK: [[LOOP]]:
; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %ptr)
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
+; CHECK: [[SUCCESS_BB]]:
; CHECK: fence acquire
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK: fence acquire
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i32 [[OLDVAL]]
- %old = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+ %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+ %old = extractvalue { i32, i1 } %pairold, 0
ret i32 %old
}
@@ -317,7 +335,7 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %n
; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
@@ -325,16 +343,22 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %n
; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
+; CHECK: [[SUCCESS_BB]]:
; CHECK-NOT: fence
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i64 [[OLDVAL]]
- %old = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+ %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+ %old = extractvalue { i64, i1 } %pairold, 0
ret i64 %old
}
\ No newline at end of file
diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll
index bec5bef..8092c10 100644
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll
+++ b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll
@@ -91,22 +91,28 @@ define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %ptr)
; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i8
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[BARRIER:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
-; CHECK-NOT: fence
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i8 [[OLDVAL]]
- %old = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+ %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+ %old = extractvalue { i8, i1 } %pairold, 0
ret i8 %old
}
@@ -119,22 +125,28 @@ define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newv
; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* %ptr)
; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i16
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
+; CHECK: [[SUCCESS_BB]]:
; CHECK-NOT: fence
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i16 [[OLDVAL]]
- %old = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+ %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+ %old = extractvalue { i16, i1 } %pairold, 0
ret i16 %old
}
@@ -146,21 +158,27 @@ define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newva
; CHECK: [[LOOP]]:
; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* %ptr)
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
-; CHECK-NOT: fence
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i32 [[OLDVAL]]
- %old = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+ %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+ %old = extractvalue { i32, i1 } %pairold, 0
ret i32 %old
}
@@ -179,7 +197,7 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %n
; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[DONE:.*]]
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
; CHECK: [[TRY_STORE]]:
; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
@@ -187,16 +205,22 @@ define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %n
; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[BARRIER:.*]]
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-; CHECK: [[BARRIER]]:
-; CHECK-NOT: fence
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
; CHECK: br label %[[DONE:.*]]
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
; CHECK: ret i64 [[OLDVAL]]
- %old = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+ %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+ %old = extractvalue { i64, i1 } %pairold, 0
ret i64 %old
}
\ No newline at end of file
diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll
new file mode 100644
index 0000000..07a4a7f
--- /dev/null
+++ b/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll
@@ -0,0 +1,97 @@
+; RUN: opt -atomic-ll-sc -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s
+
+define i32 @test_cmpxchg_seq_cst(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_seq_cst
+; CHECK: fence release
+; CHECK: br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK: [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK: [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK: fence seq_cst
+; CHECK: br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK: fence seq_cst
+; CHECK: br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i32 [[LOADED]]
+
+ %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %oldval = extractvalue { i32, i1 } %pair, 0
+ ret i32 %oldval
+}
+
+define i1 @test_cmpxchg_weak_fail(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_weak_fail
+; CHECK: fence release
+; CHECK: br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK: [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK: [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK: fence seq_cst
+; CHECK: br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i1 [[SUCCESS]]
+
+ %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+ %oldval = extractvalue { i32, i1 } %pair, 1
+ ret i1 %oldval
+}
+
+define i32 @test_cmpxchg_monotonic(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_monotonic
+; CHECK-NOT: fence
+; CHECK: br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK: [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK: [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK: br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i32 [[LOADED]]
+
+ %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new monotonic monotonic
+ %oldval = extractvalue { i32, i1 } %pair, 0
+ ret i32 %oldval
+}
diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg b/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg
+++ b/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/BBVectorize/lit.local.cfg
+++ b/test/Transforms/BBVectorize/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/CodeGenPrepare/X86/lit.local.cfg b/test/Transforms/CodeGenPrepare/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/CodeGenPrepare/X86/lit.local.cfg
+++ b/test/Transforms/CodeGenPrepare/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/ConstantHoisting/AArch64/lit.local.cfg b/test/Transforms/ConstantHoisting/AArch64/lit.local.cfg
index c420349..7184443 100644
--- a/test/Transforms/ConstantHoisting/AArch64/lit.local.cfg
+++ b/test/Transforms/ConstantHoisting/AArch64/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg b/test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg
+++ b/test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/ConstantHoisting/X86/large-immediate.ll b/test/Transforms/ConstantHoisting/X86/large-immediate.ll
index e0af9c9..b8c04f3 100644
--- a/test/Transforms/ConstantHoisting/X86/large-immediate.ll
+++ b/test/Transforms/ConstantHoisting/X86/large-immediate.ll
@@ -25,3 +25,12 @@ define i196 @test3(i196 %a) nounwind {
%2 = mul i196 %1, 2
ret i196 %2
}
+
+; Check that we don't hoist immediates with small values.
+define i96 @test4(i96 %a) nounwind {
+; CHECK-LABEL: test4
+; CHECK-NOT: %const = bitcast i96 2 to i96
+ %1 = mul i96 %a, 2
+ %2 = add i96 %1, 2
+ ret i96 %2
+}
diff --git a/test/Transforms/ConstantHoisting/X86/lit.local.cfg b/test/Transforms/ConstantHoisting/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/ConstantHoisting/X86/lit.local.cfg
+++ b/test/Transforms/ConstantHoisting/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/ConstantHoisting/X86/stackmap.ll b/test/Transforms/ConstantHoisting/X86/stackmap.ll
index cef022e..9df4417 100644
--- a/test/Transforms/ConstantHoisting/X86/stackmap.ll
+++ b/test/Transforms/ConstantHoisting/X86/stackmap.ll
@@ -6,11 +6,11 @@ target triple = "x86_64-apple-macosx10.9.0"
; Test if the 3rd argument of a stackmap is hoisted.
define i128 @test1(i128 %a) {
; CHECK-LABEL: @test1
-; CHECK: %const = bitcast i128 13464618275673403322 to i128
+; CHECK: %const = bitcast i128 134646182756734033220 to i128
; CHECK: tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 24, i128 %const)
entry:
- %0 = add i128 %a, 13464618275673403322
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 24, i128 13464618275673403322)
+ %0 = add i128 %a, 134646182756734033220
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 24, i128 134646182756734033220)
ret i128 %0
}
diff --git a/test/Transforms/FunctionAttrs/nocapture.ll b/test/Transforms/FunctionAttrs/nocapture.ll
index d2460c0..d3842c8 100644
--- a/test/Transforms/FunctionAttrs/nocapture.ll
+++ b/test/Transforms/FunctionAttrs/nocapture.ll
@@ -68,7 +68,7 @@ define i1* @lookup_bit(i32* %q, i32 %bitno) readnone nounwind {
ret i1* %lookup
}
-; CHECK: define i1 @c7(i32* readnone %q, i32 %bitno)
+; CHECK: define i1 @c7(i32* readonly %q, i32 %bitno)
define i1 @c7(i32* %q, i32 %bitno) {
%ptr = call i1* @lookup_bit(i32* %q, i32 %bitno)
%val = load i1* %ptr
diff --git a/test/Transforms/FunctionAttrs/readattrs.ll b/test/Transforms/FunctionAttrs/readattrs.ll
index 7ae38bb..b4e904c 100644
--- a/test/Transforms/FunctionAttrs/readattrs.ll
+++ b/test/Transforms/FunctionAttrs/readattrs.ll
@@ -51,3 +51,17 @@ define void @test6_2(i8** %p, i8* %q) {
define void @test7_1(i32* inalloca %a) {
ret void
}
+
+; CHECK: define i32* @test8_1(i32* readnone %p)
+define i32* @test8_1(i32* %p) {
+entry:
+ ret i32* %p
+}
+
+; CHECK: define void @test8_2(i32* %p)
+define void @test8_2(i32* %p) {
+entry:
+ %call = call i32* @test8_1(i32* %p)
+ store i32 10, i32* %call, align 4
+ ret void
+}
diff --git a/test/Transforms/GCOVProfiling/global-ctor.ll b/test/Transforms/GCOVProfiling/global-ctor.ll
new file mode 100644
index 0000000..722a096
--- /dev/null
+++ b/test/Transforms/GCOVProfiling/global-ctor.ll
@@ -0,0 +1,58 @@
+; RUN: echo '!16 = metadata !{metadata !"%T/global-ctor.ll", metadata !0}' > %t1
+; RUN: cat %s %t1 > %t2
+; RUN: opt -insert-gcov-profiling -disable-output < %t2
+; RUN: not grep '_GLOBAL__sub_I_global-ctor' %T/global-ctor.gcno
+; RUN: rm %T/global-ctor.gcno
+
+; REQUIRES: shell
+
+@x = global i32 0, align 4
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global-ctor.ll, i8* null }]
+
+; Function Attrs: nounwind
+define internal void @__cxx_global_var_init() #0 section ".text.startup" {
+entry:
+ br label %0
+
+; <label>:0 ; preds = %entry
+ %call = call i32 @_Z1fv(), !dbg !13
+ store i32 %call, i32* @x, align 4, !dbg !13
+ ret void, !dbg !13
+}
+
+declare i32 @_Z1fv() #1
+
+; Function Attrs: nounwind
+define internal void @_GLOBAL__sub_I_global-ctor.ll() #0 section ".text.startup" {
+entry:
+ br label %0
+
+; <label>:0 ; preds = %entry
+ call void @__cxx_global_var_init(), !dbg !14
+ ret void, !dbg !14
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+!llvm.gcov = !{!16}
+!llvm.ident = !{!12}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 210217)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/home/nlewycky/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/home/nlewycky"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 2, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__cxx_global_var_init]
+!5 = metadata !{metadata !"global-ctor.ll", metadata !"/home/nlewycky"}
+!6 = metadata !{i32 786473, metadata !5} ; [ DW_TAG_file_type ] [/home/nlewycky/global-ctor.ll]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"", metadata !"", metadata !"_GLOBAL__sub_I_global-ctor.ll", i32 0, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__sub_I_global-ctor.ll, null, null, metadata !2, i32 0} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!9 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/home/nlewycky/<stdin>]
+!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{metadata !"clang version 3.5.0 (trunk 210217)"}
+!13 = metadata !{i32 2, i32 0, metadata !4, null}
+!14 = metadata !{i32 0, i32 0, metadata !15, null}
+!15 = metadata !{i32 786443, metadata !5, metadata !8} ; [ DW_TAG_lexical_block ] [/home/nlewycky/global-ctor.ll]
diff --git a/test/Transforms/GCOVProfiling/linezero.ll b/test/Transforms/GCOVProfiling/linezero.ll
new file mode 100644
index 0000000..e2f8324
--- /dev/null
+++ b/test/Transforms/GCOVProfiling/linezero.ll
@@ -0,0 +1,143 @@
+; RUN: sed -e 's@PATTERN@\%T@g' < %s > %t1
+; RUN: opt -insert-gcov-profiling -disable-output < %t1
+; RUN: rm %T/linezero.gcno %t1
+; REQUIRES: shell
+
+; This is a crash test.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.vector = type { i8 }
+
+; Function Attrs: nounwind
+define i32 @_Z4testv() #0 {
+entry:
+ %retval = alloca i32, align 4
+ %__range = alloca %struct.vector*, align 8
+ %ref.tmp = alloca %struct.vector, align 1
+ %undef.agg.tmp = alloca %struct.vector, align 1
+ %__begin = alloca i8*, align 8
+ %__end = alloca i8*, align 8
+ %spec = alloca i8, align 1
+ call void @llvm.dbg.declare(metadata !{%struct.vector** %__range}, metadata !27), !dbg !30
+ br label %0
+
+; <label>:0 ; preds = %entry
+ call void @_Z13TagFieldSpecsv(), !dbg !31
+ store %struct.vector* %ref.tmp, %struct.vector** %__range, align 8, !dbg !31
+ call void @llvm.dbg.declare(metadata !{i8** %__begin}, metadata !32), !dbg !30
+ %1 = load %struct.vector** %__range, align 8, !dbg !31
+ %call = call i8* @_ZN6vector5beginEv(%struct.vector* %1), !dbg !31
+ store i8* %call, i8** %__begin, align 8, !dbg !31
+ call void @llvm.dbg.declare(metadata !{i8** %__end}, metadata !33), !dbg !30
+ %2 = load %struct.vector** %__range, align 8, !dbg !31
+ %call1 = call i8* @_ZN6vector3endEv(%struct.vector* %2), !dbg !31
+ store i8* %call1, i8** %__end, align 8, !dbg !31
+ br label %for.cond, !dbg !31
+
+for.cond: ; preds = %for.inc, %0
+ %3 = load i8** %__begin, align 8, !dbg !34
+ %4 = load i8** %__end, align 8, !dbg !34
+ %cmp = icmp ne i8* %3, %4, !dbg !34
+ br i1 %cmp, label %for.body, label %for.end, !dbg !34
+
+for.body: ; preds = %for.cond
+ call void @llvm.dbg.declare(metadata !{i8* %spec}, metadata !37), !dbg !31
+ %5 = load i8** %__begin, align 8, !dbg !38
+ %6 = load i8* %5, align 1, !dbg !38
+ store i8 %6, i8* %spec, align 1, !dbg !38
+ br label %for.inc, !dbg !38
+
+for.inc: ; preds = %for.body
+ %7 = load i8** %__begin, align 8, !dbg !40
+ %incdec.ptr = getelementptr inbounds i8* %7, i32 1, !dbg !40
+ store i8* %incdec.ptr, i8** %__begin, align 8, !dbg !40
+ br label %for.cond, !dbg !40
+
+for.end: ; preds = %for.cond
+ call void @llvm.trap(), !dbg !42
+ unreachable, !dbg !42
+
+return: ; No predecessors!
+ %8 = load i32* %retval, !dbg !44
+ ret i32 %8, !dbg !44
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+declare void @_Z13TagFieldSpecsv() #2
+
+declare i8* @_ZN6vector5beginEv(%struct.vector*) #2
+
+declare i8* @_ZN6vector3endEv(%struct.vector*) #2
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap() #3
+
+; Function Attrs: nounwind
+define void @_Z2f1v() #0 {
+entry:
+ br label %0
+
+; <label>:0 ; preds = %entry
+ ret void, !dbg !45
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noreturn nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!23, !24}
+!llvm.gcov = !{!25}
+!llvm.ident = !{!26}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 209871)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"PATTERN"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"vector", i32 21, i64 8, i64 8, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_structure_type ] [vector] [line 21, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !"linezero.cc", metadata !"PATTERN"}
+!6 = metadata !{metadata !7, metadata !13}
+!7 = metadata !{i32 786478, metadata !5, metadata !"_ZTS6vector", metadata !"begin", metadata !"begin", metadata !"_ZN6vector5beginEv", i32 25, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 25} ; [ DW_TAG_subprogram ] [line 25] [begin]
+!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !10, metadata !12}
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS6vector"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS6vector]
+!13 = metadata !{i32 786478, metadata !5, metadata !"_ZTS6vector", metadata !"end", metadata !"end", metadata !"_ZN6vector3endEv", i32 26, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 26} ; [ DW_TAG_subprogram ] [line 26] [end]
+!14 = metadata !{metadata !15, metadata !20}
+!15 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"test", metadata !"test", metadata !"_Z4testv", i32 50, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4testv, null, null, metadata !2, i32 50} ; [ DW_TAG_subprogram ] [line 50] [def] [test]
+!16 = metadata !{i32 786473, metadata !5} ; [ DW_TAG_file_type ] [./linezero.cc]
+!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"f1", metadata !"f1", metadata !"_Z2f1v", i32 54, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z2f1v, null, null, metadata !2, i32 54} ; [ DW_TAG_subprogram ] [line 54] [def] [f1]
+!21 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{null}
+!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!25 = metadata !{metadata !"PATTERN/linezero.o", metadata !0}
+!26 = metadata !{metadata !"clang version 3.5.0 (trunk 209871)"}
+!27 = metadata !{i32 786688, metadata !28, metadata !"__range", null, i32 0, metadata !29, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__range] [line 0]
+!28 = metadata !{i32 786443, metadata !5, metadata !15, i32 51, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!29 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS6vector"} ; [ DW_TAG_rvalue_reference_type ] [line 0, size 0, align 0, offset 0] [from _ZTS6vector]
+!30 = metadata !{i32 0, i32 0, metadata !28, null}
+!31 = metadata !{i32 51, i32 0, metadata !28, null}
+!32 = metadata !{i32 786688, metadata !28, metadata !"__begin", null, i32 0, metadata !10, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__begin] [line 0]
+!33 = metadata !{i32 786688, metadata !28, metadata !"__end", null, i32 0, metadata !10, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__end] [line 0]
+!34 = metadata !{i32 51, i32 0, metadata !35, null}
+!35 = metadata !{i32 786443, metadata !5, metadata !36, i32 51, i32 0, i32 5, i32 5} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!36 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!37 = metadata !{i32 786688, metadata !28, metadata !"spec", metadata !16, i32 51, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [spec] [line 51]
+!38 = metadata !{i32 51, i32 0, metadata !39, null}
+!39 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 2, i32 2} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!40 = metadata !{i32 51, i32 0, metadata !41, null}
+!41 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 4, i32 4} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!42 = metadata !{i32 51, i32 0, metadata !43, null}
+!43 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 3, i32 3} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!44 = metadata !{i32 52, i32 0, metadata !15, null}
+!45 = metadata !{i32 54, i32 0, metadata !20, null}
diff --git a/test/Transforms/GVN/calloc-load-removal.ll b/test/Transforms/GVN/calloc-load-removal.ll
new file mode 100644
index 0000000..2dde5b7
--- /dev/null
+++ b/test/Transforms/GVN/calloc-load-removal.ll
@@ -0,0 +1,25 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+; RUN: opt -S -basicaa -gvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
+; Check that loads from calloc are recognized as being zero.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1() {
+ %1 = tail call noalias i8* @calloc(i64 1, i64 4)
+ %2 = bitcast i8* %1 to i32*
+ ; This load is trivially constant zero
+ %3 = load i32* %2, align 4
+ ret i32 %3
+
+; CHECK-LABEL: @test1(
+; CHECK-NOT: %3 = load i32* %2, align 4
+; CHECK: ret i32 0
+
+; CHECK_NO_LIBCALLS-LABEL: @test1(
+; CHECK_NO_LIBCALLS: load
+; CHECK_NO_LIBCALLS: ret i32 %
+
+}
+
+declare noalias i8* @calloc(i64, i64)
diff --git a/test/Transforms/GVN/invariant-load.ll b/test/Transforms/GVN/invariant-load.ll
new file mode 100644
index 0000000..80e2226
--- /dev/null
+++ b/test/Transforms/GVN/invariant-load.ll
@@ -0,0 +1,31 @@
+; Test if the !invariant.load metadata is maintained by GVN.
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+define i32 @test1(i32* nocapture %p, i8* nocapture %q) {
+; CHECK-LABEL: test1
+; CHECK: %x = load i32* %p, align 4, !invariant.load !0
+; CHECK-NOT: %y = load
+entry:
+ %x = load i32* %p, align 4, !invariant.load !0
+ %conv = trunc i32 %x to i8
+ store i8 %conv, i8* %q, align 1
+ %y = load i32* %p, align 4, !invariant.load !0
+ %add = add i32 %y, 1
+ ret i32 %add
+}
+
+define i32 @test2(i32* nocapture %p, i8* nocapture %q) {
+; CHECK-LABEL: test2
+; CHECK-NOT: !invariant.load
+; CHECK-NOT: %y = load
+entry:
+ %x = load i32* %p, align 4
+ %conv = trunc i32 %x to i8
+ store i8 %conv, i8* %q, align 1
+ %y = load i32* %p, align 4, !invariant.load !0
+ %add = add i32 %y, 1
+ ret i32 %add
+}
+
+!0 = metadata !{ }
+
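Aside: !invariant.load promises that the loaded location does not change, so GVN may forward %x to %y across the store in @test1. The subtler point is @test2: GVN still merges the two loads, but because the surviving (first) load was not annotated, the metadata must be dropped rather than hoisted onto it. A sketch of the expected @test2 body, per the CHECKs:

    %x = load i32* %p, align 4        ; no !invariant.load on the kept load
    %conv = trunc i32 %x to i8
    store i8 %conv, i8* %q, align 1
    %add = add i32 %x, 1
    ret i32 %add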
diff --git a/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll b/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll
index 4b96799..0bdced5 100644
--- a/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll
+++ b/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll
@@ -11,8 +11,8 @@
@L1 = alias i32* @A
; CHECK: @L1 = alias i32* @A
-@L2 = alias internal i32* @A
-; DEAD-NOT: @L2
+@L2 = alias internal i32* @L1
+; CHECK: @L2 = alias internal i32* @L1
-@L3 = alias i32* @A
-; CHECK: @L3 = alias i32* @A
+@L3 = alias i32* @L2
+; CHECK: @L3 = alias i32* @L2
diff --git a/test/Transforms/GlobalMerge/AArch64/lit.local.cfg b/test/Transforms/GlobalMerge/AArch64/lit.local.cfg
deleted file mode 100644
index 9a66a00..0000000
--- a/test/Transforms/GlobalMerge/AArch64/lit.local.cfg
+++ /dev/null
@@ -1,4 +0,0 @@
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
- config.unsupported = True
-
diff --git a/test/Transforms/GlobalMerge/ARM/lit.local.cfg b/test/Transforms/GlobalMerge/ARM/lit.local.cfg
deleted file mode 100644
index 8a3ba96..0000000
--- a/test/Transforms/GlobalMerge/ARM/lit.local.cfg
+++ /dev/null
@@ -1,4 +0,0 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
- config.unsupported = True
-
diff --git a/test/Transforms/GlobalOpt/2009-02-15-BitcastAlias.ll b/test/Transforms/GlobalOpt/2009-02-15-BitcastAlias.ll
index 03d6ee4..d6a565a 100644
--- a/test/Transforms/GlobalOpt/2009-02-15-BitcastAlias.ll
+++ b/test/Transforms/GlobalOpt/2009-02-15-BitcastAlias.ll
@@ -2,7 +2,7 @@
@g = global i32 0
-@a = alias i8, i32* @g
+@a = alias bitcast (i32* @g to i8*)
define void @f() {
%tmp = load i8* @a
diff --git a/test/Transforms/GlobalOpt/2009-03-06-Anonymous.ll b/test/Transforms/GlobalOpt/2009-03-06-Anonymous.ll
index 62f75e1..930a96e 100644
--- a/test/Transforms/GlobalOpt/2009-03-06-Anonymous.ll
+++ b/test/Transforms/GlobalOpt/2009-03-06-Anonymous.ll
@@ -1,11 +1,23 @@
-; RUN: opt < %s -globalopt -S | grep internal | count 2
+; RUN: opt < %s -globalopt -S | FileCheck %s
global i32 0
-define i32* @1() {
+; CHECK-DAG: @0 = internal global i32 0
+
+private global i32 0
+; CHECK-DAG: @1 = private global i32 0
+
+define i32* @2() {
ret i32* @0
}
+; CHECK-DAG: define internal fastcc i32* @2()
+
define i32* @f() {
entry:
- call i32* @1()
+ call i32* @2()
ret i32* %0
}
+
+define i32* @g() {
+entry:
+ ret i32* @1
+}
diff --git a/test/Transforms/GlobalOpt/alias-resolve.ll b/test/Transforms/GlobalOpt/alias-resolve.ll
index bd07b31..9d70c70 100644
--- a/test/Transforms/GlobalOpt/alias-resolve.ll
+++ b/test/Transforms/GlobalOpt/alias-resolve.ll
@@ -1,9 +1,9 @@
; RUN: opt < %s -globalopt -S | FileCheck %s
-@foo1 = alias void ()* @bar2
+@foo1 = alias void ()* @foo2
; CHECK: @foo1 = alias void ()* @bar2
-@foo2 = alias void()* @bar2
+@foo2 = alias void()* @bar1
; CHECK: @foo2 = alias void ()* @bar2
@bar1 = alias void ()* @bar2
@@ -12,6 +12,10 @@
@weak1 = alias weak void ()* @bar2
; CHECK: @weak1 = alias weak void ()* @bar2
+@bar4 = private unnamed_addr constant [2 x i8*] zeroinitializer
+@foo4 = unnamed_addr alias linkonce_odr getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
+; CHECK: @foo4 = unnamed_addr alias linkonce_odr getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
+
define void @bar2() {
ret void
}
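Aside: GlobalOpt walks alias chains to their ultimate aliasee, which is why both CHECKs above expect @bar2 even though the inputs point at intermediate aliases. Sketched as a before/after pair:

    ; input                              ; after -globalopt
    @foo2 = alias void ()* @bar1         @foo2 = alias void ()* @bar2
    @bar1 = alias void ()* @bar2         @bar1 = alias void ()* @bar2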
diff --git a/test/Transforms/GlobalOpt/constantfold-initializers.ll b/test/Transforms/GlobalOpt/constantfold-initializers.ll
index ce6e2c4..4a25d66 100644
--- a/test/Transforms/GlobalOpt/constantfold-initializers.ll
+++ b/test/Transforms/GlobalOpt/constantfold-initializers.ll
@@ -50,7 +50,41 @@ entry:
ret void
}
+; PR19955
+
+@dllimportptr = global i32* null, align 4
+; CHECK: @dllimportptr = global i32* null, align 4
+@dllimportvar = external dllimport global i32
+define internal void @test3() {
+entry:
+ store i32* @dllimportvar, i32** @dllimportptr, align 4
+ ret void
+}
+
+@dllexportptr = global i32* null, align 4
+; CHECK: @dllexportptr = global i32* @dllexportvar, align 4
+@dllexportvar = dllexport global i32 0, align 4
+; CHECK: @dllexportvar = dllexport global i32 20, align 4
+define internal void @test4() {
+entry:
+ store i32 20, i32* @dllexportvar, align 4
+ store i32* @dllexportvar, i32** @dllexportptr, align 4
+ ret void
+}
+
+@threadlocalptr = global i32* null, align 4
+; CHECK: @threadlocalptr = global i32* null, align 4
+@threadlocalvar = external thread_local global i32
+define internal void @test5() {
+entry:
+ store i32* @threadlocalvar, i32** @threadlocalptr, align 4
+ ret void
+}
+
@llvm.global_ctors = appending constant
- [2 x { i32, void ()* }]
+ [5 x { i32, void ()* }]
[{ i32, void ()* } { i32 65535, void ()* @test1 },
- { i32, void ()* } { i32 65535, void ()* @test2 }]
+ { i32, void ()* } { i32 65535, void ()* @test2 },
+ { i32, void ()* } { i32 65535, void ()* @test3 },
+ { i32, void ()* } { i32 65535, void ()* @test4 },
+ { i32, void ()* } { i32 65535, void ()* @test5 }]
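Aside (PR19955): the three new ctors probe which stores may be folded into global initializers. A dllimport global's address is resolved only at load time and a thread_local global's address differs per thread, so neither is a relocatable constant and @dllimportptr/@threadlocalptr must stay null. A dllexport global is defined in this module, so both the stored address and the stored value 20 fold. Expected initializers, per the CHECKs:

    @dllimportptr   = global i32* null, align 4            ; not folded
    @dllexportptr   = global i32* @dllexportvar, align 4   ; folded
    @dllexportvar   = dllexport global i32 20, align 4     ; folded
    @threadlocalptr = global i32* null, align 4            ; not folded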
diff --git a/test/Transforms/IndVarSimplify/2014-06-21-congruent-constant.ll b/test/Transforms/IndVarSimplify/2014-06-21-congruent-constant.ll
new file mode 100644
index 0000000..2c738de
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/2014-06-21-congruent-constant.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -loop-unswitch -instcombine -indvars < %s | FileCheck %s
+
+; This used to crash in SCEVExpander when there were congruent phis with an
+; undef incoming value from the loop header. The -loop-unswitch -instcombine
+; pipeline is necessary to create just this pattern, which is essentially a nop
+; and gets folded away aggressively if spelled out in IR directly.
+; PR20093
+
+@c = external global i32**, align 8
+
+define void @test1() {
+entry:
+ br i1 undef, label %for.end12, label %for.cond.preheader
+
+for.cond.preheader: ; preds = %entry
+ %0 = load i32*** @c, align 8
+ %1 = load i32** %0, align 8
+ %2 = load i32* %1, align 4
+ br label %for.body
+
+for.body: ; preds = %for.cond.backedge, %for.body9.us, %for.cond.preheader
+ %3 = phi i32* [ %1, %for.cond.preheader ], [ %3, %for.cond.backedge ], [ %6, %for.body9.us ]
+ %4 = phi i32 [ %2, %for.cond.preheader ], [ undef, %for.cond.backedge ], [ %7, %for.body9.us ]
+ %i.024 = phi i32 [ 0, %for.cond.preheader ], [ %inc, %for.cond.backedge ], [ 0, %for.body9.us ]
+ %tobool1 = icmp eq i32 %4, 0
+ br i1 %tobool1, label %if.end, label %for.cond.backedge
+
+if.end: ; preds = %for.body
+ %5 = load i32* %3, align 4
+ %tobool4 = icmp eq i32 %5, 0
+ br i1 %tobool4, label %for.cond3, label %for.body9.preheader
+
+for.body9.preheader: ; preds = %if.end
+ %tobool8 = icmp eq i32 %i.024, 1
+ br i1 %tobool8, label %for.body9.us, label %for.body9
+
+for.body9.us: ; preds = %for.body9.preheader
+ %6 = load i32** undef, align 8
+ %7 = load i32* %6, align 4
+ br label %for.body
+
+for.cond3: ; preds = %for.cond3, %if.end
+ br label %for.cond3
+
+for.body9: ; preds = %for.body9, %for.body9.preheader
+ br label %for.body9
+
+for.cond.backedge: ; preds = %for.body
+ %inc = add nsw i32 %i.024, 1
+ br i1 false, label %for.body, label %for.end12
+
+for.end12: ; preds = %for.cond.backedge, %entry
+ ret void
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: phi
+}
diff --git a/test/Transforms/Inline/blockaddress.ll b/test/Transforms/Inline/blockaddress.ll
index 4206312..8eb3072 100644
--- a/test/Transforms/Inline/blockaddress.ll
+++ b/test/Transforms/Inline/blockaddress.ll
@@ -1,8 +1,9 @@
; RUN: opt -inline -S < %s | FileCheck %s
; PR10162
-; Make sure the blockaddress is mapped correctly when doit is inlined
-; CHECK: store i8* blockaddress(@f, %here.i), i8** @ptr1, align 8
+; Make sure doit is not inlined, since its blockaddress is taken,
+; which could be unsafe.
+; CHECK: store i8* blockaddress(@doit, %here), i8** %pptr, align 8
@i = global i32 1, align 4
@ptr1 = common global i8* null, align 8
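Aside: a blockaddress names a basic block of one specific function, so inlining @doit would leave the captured address pointing into a function that is no longer called. The updated expectation is therefore that the inliner backs off and the stored constant keeps naming the original function and block:

    ; expected to survive -inline unchanged (per the CHECK above)
    store i8* blockaddress(@doit, %here), i8** %pptr, align 8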
diff --git a/test/Transforms/Inline/debug-invoke.ll b/test/Transforms/Inline/debug-invoke.ll
new file mode 100644
index 0000000..41d6074
--- /dev/null
+++ b/test/Transforms/Inline/debug-invoke.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -always-inline -S | FileCheck %s
+
+; Test that the debug location is preserved when rewriting an inlined call as an invoke
+
+; CHECK: invoke void @test()
+; CHECK-NEXT: to label {{.*}} unwind label {{.*}}, !dbg [[INL_LOC:!.*]]
+; CHECK: [[EMPTY:.*]] = metadata !{}
+; CHECK: [[INL_LOC]] = metadata !{i32 1, i32 0, metadata [[EMPTY]], metadata [[INL_AT:.*]]}
+; CHECK: [[INL_AT]] = metadata !{i32 2, i32 0, metadata [[EMPTY]], null}
+
+declare void @test()
+declare i32 @__gxx_personality_v0(...)
+
+attributes #0 = { alwaysinline }
+define void @inl() #0 {
+ call void @test(), !dbg !3
+ ret void
+}
+
+define void @caller() {
+ invoke void @inl()
+ to label %cont unwind label %lpad, !dbg !4
+
+cont:
+ ret void
+
+lpad:
+ landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ ret void
+}
+
+!llvm.module.flags = !{!1}
+!1 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!2 = metadata !{}
+!3 = metadata !{i32 1, i32 0, metadata !2, null}
+!4 = metadata !{i32 2, i32 0, metadata !2, null}
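Aside: inlining @inl at an invoke site forces the interior call to @test to be rewritten as an invoke so exceptions reach @caller's landing pad; the CHECKs pin down that the rewritten instruction keeps its original line-1 location (!3) with an inlined-at link to the line-2 call site (!4). A sketch of the expected result in @caller (block names are illustrative; the CHECKs only constrain the !dbg):

    invoke void @test()
            to label %cont1 unwind label %lpad, !dbg [[INL_LOC]]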
diff --git a/test/Transforms/Inline/null-function.ll b/test/Transforms/Inline/null-function.ll
new file mode 100644
index 0000000..2aecfa8
--- /dev/null
+++ b/test/Transforms/Inline/null-function.ll
@@ -0,0 +1,9 @@
+; RUN: opt -print-before=always-inline -always-inline < %s -o /dev/null 2>&1 | FileCheck %s
+
+define i32 @main() #0 {
+entry:
+ ret i32 0
+}
+
+; CHECK: *** IR Dump Before Inliner for always_inline functions ***
+; CHECK: Printing <null> Function
diff --git a/test/Transforms/InstCombine/2010-03-03-ExtElim.ll b/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
index b1384ec..e0def99 100644
--- a/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
+++ b/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
@@ -22,11 +22,11 @@ define i1 @PR6486() nounwind {
define i1 @PR16462_1() nounwind {
; CHECK-LABEL: @PR16462_1(
ret i1 icmp sgt (i32 sext (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16) to i32), i32 65535)
-; CHECK: ret i1 icmp sgt (i32 sext (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16) to i32), i32 65535)
+; CHECK: ret i1 false
}
define i1 @PR16462_2() nounwind {
; CHECK-LABEL: @PR16462_2(
ret i1 icmp sgt (i32 sext (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16) to i32), i32 42)
-; CHECK: ret i1 icmp sgt (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16), i16 42)
+; CHECK: ret i1 false
}
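Aside: both folds to false follow from value ranges, not from comparing the constants textually. The inner select can only produce 0 or 1, and trunc-then-sext of 0 or 1 is still 0 or 1, which is signed-greater than neither 65535 nor 42. As a sketch with illustrative names:

    %v = select i1 %c, i32 0, i32 1   ; %v in {0, 1}
    %t = trunc i32 %v to i16          ; still {0, 1}
    %s = sext i16 %t to i32           ; still {0, 1}
    %r = icmp sgt i32 %s, 42          ; always false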
diff --git a/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
index 4d185bf..ac9c555 100644
--- a/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
+++ b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
; CHECK: addrspacecast
-@base = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 16
+@base = internal unnamed_addr addrspace(3) global [16 x i32] zeroinitializer, align 16
declare void @foo(i32*)
define void @test() nounwind {
diff --git a/test/Transforms/InstCombine/AddOverFlow.ll b/test/Transforms/InstCombine/AddOverFlow.ll
new file mode 100644
index 0000000..8f3d429
--- /dev/null
+++ b/test/Transforms/InstCombine/AddOverFlow.ll
@@ -0,0 +1,118 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @oppositesign
+; CHECK: add nsw i16 %a, %b
+define i16 @oppositesign(i16 %x, i16 %y) {
+; %a is negative, %b is positive
+ %a = or i16 %x, 32768
+ %b = and i16 %y, 32767
+ %c = add i16 %a, %b
+ ret i16 %c
+}
+
+define i16 @zero_sign_bit(i16 %a) {
+; CHECK-LABEL: @zero_sign_bit(
+; CHECK-NEXT: and
+; CHECK-NEXT: add nuw
+; CHECK-NEXT: ret
+ %1 = and i16 %a, 32767
+ %2 = add i16 %1, 512
+ ret i16 %2
+}
+
+define i16 @zero_sign_bit2(i16 %a, i16 %b) {
+; CHECK-LABEL: @zero_sign_bit2(
+; CHECK-NEXT: and
+; CHECK-NEXT: and
+; CHECK-NEXT: add nuw
+; CHECK-NEXT: ret
+ %1 = and i16 %a, 32767
+ %2 = and i16 %b, 32767
+ %3 = add i16 %1, %2
+ ret i16 %3
+}
+
+declare i16 @bounded(i16 %input);
+declare i32 @__gxx_personality_v0(...);
+!0 = metadata !{i16 0, i16 32768} ; [0, 32767]
+!1 = metadata !{i16 0, i16 32769} ; [0, 32768]
+
+define i16 @add_bounded_values(i16 %a, i16 %b) {
+; CHECK-LABEL: @add_bounded_values(
+entry:
+ %c = call i16 @bounded(i16 %a), !range !0
+ %d = invoke i16 @bounded(i16 %b) to label %cont unwind label %lpad, !range !0
+cont:
+; %c and %d are in [0, 32767]. Therefore, %c + %d cannot overflow unsigned arithmetic.
+ %e = add i16 %c, %d
+; CHECK: add nuw i16 %c, %d
+ ret i16 %e
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret i16 42
+}
+
+define i16 @add_bounded_values_2(i16 %a, i16 %b) {
+; CHECK-LABEL: @add_bounded_values_2(
+entry:
+ %c = call i16 @bounded(i16 %a), !range !1
+ %d = invoke i16 @bounded(i16 %b) to label %cont unwind label %lpad, !range !1
+cont:
+; Similar to add_bounded_values, but %c and %d are in [0, 32768]. Therefore,
+; %c + %d may overflow in unsigned arithmetic, and we cannot add nuw.
+ %e = add i16 %c, %d
+; CHECK: add i16 %c, %d
+ ret i16 %e
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret i16 42
+}
+
+; CHECK-LABEL: @ripple_nsw1
+; CHECK: add nsw i16 %a, %b
+define i16 @ripple_nsw1(i16 %x, i16 %y) {
+; %a has at most one bit set
+ %a = and i16 %y, 1
+
+; %b has a 0 bit other than the sign bit
+ %b = and i16 %x, 49151
+
+ %c = add i16 %a, %b
+ ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+; CHECK-LABEL: @ripple_nsw2
+; CHECK: add nsw i16 %b, %a
+define i16 @ripple_nsw2(i16 %x, i16 %y) {
+ %a = and i16 %y, 1
+ %b = and i16 %x, 49151
+ %c = add i16 %b, %a
+ ret i16 %c
+}
+
+; CHECK-LABEL: @ripple_no_nsw1
+; CHECK: add i32 %a, %x
+define i32 @ripple_no_nsw1(i32 %x, i32 %y) {
+; We know nothing about %x
+ %a = and i32 %y, 1
+ %b = add i32 %a, %x
+ ret i32 %b
+}
+
+; CHECK-LABEL: @ripple_no_nsw2
+; CHECK: add nuw i16 %a, %b
+define i16 @ripple_no_nsw2(i16 %x, i16 %y) {
+; %a has at most one bit set
+ %a = and i16 %y, 1
+
+; %b has a 0 bit, but it is the sign bit
+ %b = and i16 %x, 32767
+
+ %c = add i16 %a, %b
+ ret i16 %c
+}
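Aside: the nsw/nuw conclusions above are pure known-bits range arithmetic. For example:

    ; @oppositesign: %a in [-32768, -1], %b in [0, 32767]
    ;   sum in [-32768, 32766]  -> fits signed i16  -> nsw
    ; @zero_sign_bit2: both operands in [0, 32767]
    ;   sum <= 65534 < 2^16     -> no unsigned wrap -> nuw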
diff --git a/test/Transforms/InstCombine/abs_abs.ll b/test/Transforms/InstCombine/abs_abs.ll
new file mode 100644
index 0000000..de10fd1
--- /dev/null
+++ b/test/Transforms/InstCombine/abs_abs.ll
@@ -0,0 +1,961 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @abs_abs_x01(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x01(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x02(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x02(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x03(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x03(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x04(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x04(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x05(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x05(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x06(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x06(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x07(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x07(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x08(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x08(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x09(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x09(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x10(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x10(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x11(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x11(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x12(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x12(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x13(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x13(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x14(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x14(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x15(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x15(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_abs_x16(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_abs_x16(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x01(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x01(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x02(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x02(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x03(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x03(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x04(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x04(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x05(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x05(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x06(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x06(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x07(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x07(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x08(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x08(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x09(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x09(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x10(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x10(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x11(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x11(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x12(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x12(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x13(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x13(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x14(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x14(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x15(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x15(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_nabs_x16(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_nabs_x16(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x01(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x01(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x02(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x02(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x03(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x03(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x04(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x04(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x05(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x05(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x06(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x06(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x07(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x07(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x08(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x08(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x09(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x09(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x10(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x10(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x11(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x11(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x12(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x12(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x13(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x13(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x14(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x14(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x15(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x15(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @abs_nabs_x16(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @abs_nabs_x16(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x01(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x01(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x02(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x02(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x03(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x03(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x04(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, -1
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x04(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x05(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x05(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x06(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x06(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x07(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x07(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x08(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp sgt i32 %cond, 0
+ %sub9 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %sub9, i32 %cond
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x08(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x09(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x09(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x10(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x10(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x11(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x11(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x12(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 0
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x12(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x13(i32 %x) {
+ %cmp = icmp sgt i32 %x, -1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x13(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, -1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x14(i32 %x) {
+ %cmp = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %x, i32 %sub
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x14(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[NEG]], i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x15(i32 %x) {
+ %cmp = icmp slt i32 %x, 0
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x15(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 0
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+define i32 @nabs_abs_x16(i32 %x) {
+ %cmp = icmp slt i32 %x, 1
+ %sub = sub nsw i32 0, %x
+ %cond = select i1 %cmp, i32 %sub, i32 %x
+ %cmp1 = icmp slt i32 %cond, 1
+ %sub16 = sub nsw i32 0, %cond
+ %cond18 = select i1 %cmp1, i32 %cond, i32 %sub16
+ ret i32 %cond18
+; CHECK-LABEL: @nabs_abs_x16(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 1
+; CHECK-NEXT: [[NEG:%[a-z0-9]+]] = sub nsw i32 0, %x
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 %x, i32 [[NEG]]
+; CHECK-NEXT: ret i32 [[SEL]]
+}
\ No newline at end of file
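Aside: all of the variants above reduce identically because abs and nabs are idempotent, and each absorbs the other: once %cond carries a known sign, the second compare-negate-select pair is redundant and instcombine folds it into the first. Only the inner triple survives, e.g.:

    %cmp  = icmp sgt i32 %x, -1
    %sub  = sub nsw i32 0, %x
    %cond = select i1 %cmp, i32 %x, i32 %sub   ; abs(%x); the outer abs is folded away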
diff --git a/test/Transforms/InstCombine/add-shrink.ll b/test/Transforms/InstCombine/add-shrink.ll
index 3edb392..67a990f 100644
--- a/test/Transforms/InstCombine/add-shrink.ll
+++ b/test/Transforms/InstCombine/add-shrink.ll
@@ -1,9 +1,11 @@
-; RUN: opt < %s -instcombine -S | grep "add nsw i32"
-; RUN: opt < %s -instcombine -S | grep sext | count 1
-
-; Should only have one sext and the add should be i32 instead of i64.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-LABEL: define i64 @test1
define i64 @test1(i32 %A) {
+; CHECK: %[[ADD:.*]] = add nsw i32 %B, %C
+; CHECK: %F = sext i32 %[[ADD]] to i64
+; CHECK: ret i64 %F
+
%B = ashr i32 %A, 7 ; <i32> [#uses=1]
%C = ashr i32 %A, 9 ; <i32> [#uses=1]
%D = sext i32 %B to i64 ; <i64> [#uses=1]
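Aside: the CHECKs encode the narrowing fold sext(a) + sext(b) -> sext(a + b), legal here because %B and %C are ashr results with enough sign bits that the i32 add cannot overflow. Expected tail of @test1 (the %tmp name is illustrative):

    %tmp = add nsw i32 %B, %C
    %F = sext i32 %tmp to i64
    ret i64 %F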
diff --git a/test/Transforms/InstCombine/add-sitofp.ll b/test/Transforms/InstCombine/add-sitofp.ll
index 40edf71..3b5485e 100644
--- a/test/Transforms/InstCombine/add-sitofp.ll
+++ b/test/Transforms/InstCombine/add-sitofp.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep "add nsw i32"
+; RUN: opt < %s -instcombine -S | grep "add nuw nsw i32"
define double @x(i32 %a, i32 %b) nounwind {
%m = lshr i32 %a, 24
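Aside: the stronger expectation (nuw added alongside nsw) follows because %m = lshr i32 %a, 24 lies in [0, 255]; assuming the add's other operand (cut off by this hunk) is shifted the same way, the sum is at most 510, which can wrap neither signed nor unsigned:

    %m = lshr i32 %a, 24            ; [0, 255]
    %n = lshr i32 %b, 24            ; [0, 255] (illustrative; not shown in the hunk)
    %o = add nuw nsw i32 %m, %n     ; <= 510: both flags justified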
diff --git a/test/Transforms/InstCombine/add2.ll b/test/Transforms/InstCombine/add2.ll
index 67d560e..d7eac4b 100644
--- a/test/Transforms/InstCombine/add2.ll
+++ b/test/Transforms/InstCombine/add2.ll
@@ -76,3 +76,240 @@ define <2 x i64> @test8(<2 x i64> %A) {
; CHECK-NEXT: %add = sub <2 x i64> <i64 1, i64 2>, %A
; CHECK-NEXT: ret <2 x i64> %add
}
+
+define i16 @test9(i16 %a) {
+ %b = mul i16 %a, 2
+ %c = mul i16 %a, 32767
+ %d = add i16 %b, %c
+ ret i16 %d
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: %d = mul i16 %a, -32767
+; CHECK-NEXT: ret i16 %d
+}
+
+; y + (~((x >> 3) & 0x55555555) + 1) -> y - ((x >> 3) & 0x55555555)
+define i32 @test10(i32 %x, i32 %y) {
+ %shr = ashr i32 %x, 3
+ %shr.not = or i32 %shr, -1431655766
+ %neg = xor i32 %shr.not, 1431655765
+ %add = add i32 %y, 1
+ %add1 = add i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: [[SHR:%[a-z0-9]+]] = ashr i32 %x, 3
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[SHR]], 1431655765
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; y + (~(x & 0x55555555) + 1) -> y - (x & 0x55555555)
+define i32 @test11(i32 %x, i32 %y) {
+ %x.not = or i32 %x, -1431655766
+ %neg = xor i32 %x.not, 1431655765
+ %add = add i32 %y, 1
+ %add1 = add i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 1431655765
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; (y + 1) + ~(x & 0x55555555) -> y - (x & 0x55555555)
+define i32 @test12(i32 %x, i32 %y) {
+ %add = add nsw i32 %y, 1
+ %x.not = or i32 %x, -1431655766
+ %neg = xor i32 %x.not, 1431655765
+ %add1 = add nsw i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 1431655765
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; y + (~(x & 0x55555556) + 1) -> y - (x & 0x55555556)
+define i32 @test13(i32 %x, i32 %y) {
+ %x.not = or i32 %x, -1431655767
+ %neg = xor i32 %x.not, 1431655766
+ %add = add i32 %y, 1
+ %add1 = add i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test13(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 1431655766
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; (y + 1) + ~(x & 0x55555556) -> y - (x & 0x55555556)
+define i32 @test14(i32 %x, i32 %y) {
+ %add = add nsw i32 %y, 1
+ %x.not = or i32 %x, -1431655767
+ %neg = xor i32 %x.not, 1431655766
+ %add1 = add nsw i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test14(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 1431655766
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; y + (~(x | 0x55555556) + 1) -> y - (x | 0x55555556)
+define i32 @test15(i32 %x, i32 %y) {
+ %x.not = and i32 %x, -1431655767
+ %neg = xor i32 %x.not, -1431655767
+ %add = add i32 %y, 1
+ %add1 = add i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test15(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = or i32 %x, 1431655766
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; (y + 1) + ~(x | 0x55555556) -> y - (x | 0x55555556)
+define i32 @test16(i32 %x, i32 %y) {
+ %add = add nsw i32 %y, 1
+ %x.not = and i32 %x, -1431655767
+ %neg = xor i32 %x.not, -1431655767
+ %add1 = add nsw i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test16(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = or i32 %x, 1431655766
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; y + (~(x | 0x55555555) + 1) -> y - (x | 0x55555555)
+define i32 @test17(i32 %x, i32 %y) {
+ %x.not = and i32 %x, -1431655766
+ %add2 = xor i32 %x.not, -1431655765
+ %add1 = add nsw i32 %add2, %y
+ ret i32 %add1
+; CHECK-LABEL: @test17(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = or i32 %x, 1431655765
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+; (y + 1) + ~(x | 0x55555555) -> y - (x | 0x55555555)
+define i32 @test18(i32 %x, i32 %y) {
+ %add = add nsw i32 %y, 1
+ %x.not = and i32 %x, -1431655766
+ %neg = xor i32 %x.not, -1431655766
+ %add1 = add nsw i32 %add, %neg
+ ret i32 %add1
+; CHECK-LABEL: @test18(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = or i32 %x, 1431655765
+; CHECK-NEXT: [[SUB:%[a-z0-9]+]] = sub i32 %y, [[AND]]
+; CHECK-NEXT: ret i32 [[SUB]]
+}
+
+define i16 @add_nsw_mul_nsw(i16 %x) {
+ %add1 = add nsw i16 %x, %x
+ %add2 = add nsw i16 %add1, %x
+ ret i16 %add2
+; CHECK-LABEL: @add_nsw_mul_nsw(
+; CHECK-NEXT: %add2 = mul nsw i16 %x, 3
+; CHECK-NEXT: ret i16 %add2
+}
+
+define i16 @mul_add_to_mul_1(i16 %x) {
+ %mul1 = mul nsw i16 %x, 8
+ %add2 = add nsw i16 %x, %mul1
+ ret i16 %add2
+; CHECK-LABEL: @mul_add_to_mul_1(
+; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
+; CHECK-NEXT: ret i16 %add2
+}
+
+define i16 @mul_add_to_mul_2(i16 %x) {
+ %mul1 = mul nsw i16 %x, 8
+ %add2 = add nsw i16 %mul1, %x
+ ret i16 %add2
+; CHECK-LABEL: @mul_add_to_mul_2(
+; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
+; CHECK-NEXT: ret i16 %add2
+}
+
+define i16 @mul_add_to_mul_3(i16 %a) {
+ %mul1 = mul i16 %a, 2
+ %mul2 = mul i16 %a, 3
+ %add = add nsw i16 %mul1, %mul2
+ ret i16 %add
+; CHECK-LABEL: @mul_add_to_mul_3(
+; CHECK-NEXT: %add = mul i16 %a, 5
+; CHECK-NEXT: ret i16 %add
+}
+
+define i16 @mul_add_to_mul_4(i16 %a) {
+ %mul1 = mul nsw i16 %a, 2
+ %mul2 = mul nsw i16 %a, 7
+ %add = add nsw i16 %mul1, %mul2
+ ret i16 %add
+; CHECK-LABEL: @mul_add_to_mul_4(
+; CHECK-NEXT: %add = mul nsw i16 %a, 9
+; CHECK-NEXT: ret i16 %add
+}
+
+define i16 @mul_add_to_mul_5(i16 %a) {
+ %mul1 = mul nsw i16 %a, 3
+ %mul2 = mul nsw i16 %a, 7
+ %add = add nsw i16 %mul1, %mul2
+ ret i16 %add
+; CHECK-LABEL: @mul_add_to_mul_5(
+; CHECK-NEXT: %add = mul nsw i16 %a, 10
+; CHECK-NEXT: ret i16 %add
+}
+
+define i32 @mul_add_to_mul_6(i32 %x, i32 %y) {
+ %mul1 = mul nsw i32 %x, %y
+ %mul2 = mul nsw i32 %mul1, 5
+ %add = add nsw i32 %mul1, %mul2
+ ret i32 %add
+; CHECK-LABEL: @mul_add_to_mul_6(
+; CHECK-NEXT: %mul1 = mul nsw i32 %x, %y
+; CHECK-NEXT: %add = mul nsw i32 %mul1, 6
+; CHECK-NEXT: ret i32 %add
+}
+
+; This test and the next one verify that when !range metadata is attached to
+; llvm.cttz, ValueTracking correctly intersects the range specified by the
+; metadata with the range implied by the intrinsic.
+;
+; In this test, the range specified by the metadata is more strict. Therefore,
+; ValueTracking uses that range.
+define i16 @add_cttz(i16 %a) {
+; CHECK-LABEL: @add_cttz(
+ ; llvm.cttz.i16(..., /*is_zero_undefined=*/true) implies the value returned
+ ; is in [0, 16). The range metadata indicates the value returned is in [0, 8).
+ ; Intersecting these ranges, we know the value returned is in [0, 8).
+ ; Therefore, InstCombine will transform
+ ; add %cttz, 1111 1111 1111 1000 ; decimal -8
+ ; to
+ ; or %cttz, 1111 1111 1111 1000
+ %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true), !range !0
+ %b = add i16 %cttz, -8
+; CHECK: or i16 %cttz, -8
+ ret i16 %b
+}
+declare i16 @llvm.cttz.i16(i16, i1)
+!0 = metadata !{i16 0, i16 8}
+
+; Similar to @add_cttz, but in this test, the range implied by the
+; intrinsic is more strict. Therefore, ValueTracking uses that range.
+define i16 @add_cttz_2(i16 %a) {
+; CHECK-LABEL: @add_cttz_2(
+ ; llvm.cttz.i16(..., /*is_zero_undefined=*/true) implies the value returned
+ ; is in [0, 16). The range metadata indicates the value returned is in
+ ; [0, 32). Intersecting these ranges, we know the value returned is in
+ ; [0, 16). Therefore, InstCombine will transform
+ ; add %cttz, 1111 1111 1111 0000 ; decimal -16
+ ; to
+ ; or %cttz, 1111 1111 1111 0000
+ %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 true), !range !1
+ %b = add i16 %cttz, -16
+; CHECK: or i16 %cttz, -16
+ ret i16 %b
+}
+!1 = metadata !{i16 0, i16 32}
diff --git a/test/Transforms/InstCombine/addrspacecast.ll b/test/Transforms/InstCombine/addrspacecast.ll
index d908b55..c168436 100644
--- a/test/Transforms/InstCombine/addrspacecast.ll
+++ b/test/Transforms/InstCombine/addrspacecast.ll
@@ -28,13 +28,91 @@ define <4 x i32*> @combine_redundant_addrspacecast_vector(<4 x i32 addrspace(1)*
define float* @combine_redundant_addrspacecast_types(i32 addrspace(1)* %x) nounwind {
; CHECK-LABEL: @combine_redundant_addrspacecast_types(
-; CHECK: addrspacecast i32 addrspace(1)* %x to float*
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float*
; CHECK-NEXT: ret
%y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(3)*
%z = addrspacecast i32 addrspace(3)* %y to float*
ret float* %z
}
+define <4 x float*> @combine_redundant_addrspacecast_types_vector(<4 x i32 addrspace(1)*> %x) nounwind {
+; CHECK-LABEL: @combine_redundant_addrspacecast_types_vector(
+; CHECK-NEXT: bitcast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(1)*>
+; CHECK-NEXT: addrspacecast <4 x float addrspace(1)*> %1 to <4 x float*>
+; CHECK-NEXT: ret
+ %y = addrspacecast <4 x i32 addrspace(1)*> %x to <4 x i32 addrspace(3)*>
+ %z = addrspacecast <4 x i32 addrspace(3)*> %y to <4 x float*>
+ ret <4 x float*> %z
+}
+
+define float addrspace(2)* @combine_addrspacecast_bitcast_1(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_bitcast_1(
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+ %y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+ %z = bitcast i32 addrspace(2)* %y to float addrspace(2)*
+ ret float addrspace(2)* %z
+}
+
+define i32 addrspace(2)* @combine_addrspacecast_bitcast_2(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_bitcast_2(
+; CHECK: addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+; CHECK-NEXT: ret
+ %y = addrspacecast i32 addrspace(1)* %x to float addrspace(2)*
+ %z = bitcast float addrspace(2)* %y to i32 addrspace(2)*
+ ret i32 addrspace(2)* %z
+}
+
+define i32 addrspace(2)* @combine_bitcast_addrspacecast_1(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_bitcast_addrspacecast_1(
+; CHECK: addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+; CHECK-NEXT: ret
+ %y = bitcast i32 addrspace(1)* %x to i8 addrspace(1)*
+ %z = addrspacecast i8 addrspace(1)* %y to i32 addrspace(2)*
+ ret i32 addrspace(2)* %z
+}
+
+define float addrspace(2)* @combine_bitcast_addrspacecast_2(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_bitcast_addrspacecast_2(
+; CHECK: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+ %y = bitcast i32 addrspace(1)* %x to i8 addrspace(1)*
+ %z = addrspacecast i8 addrspace(1)* %y to float addrspace(2)*
+ ret float addrspace(2)* %z
+}
+
+define float addrspace(2)* @combine_addrspacecast_types(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_types(
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+ %y = addrspacecast i32 addrspace(1)* %x to float addrspace(2)*
+ ret float addrspace(2)* %y
+}
+
+define <4 x float addrspace(2)*> @combine_addrspacecast_types_vector(<4 x i32 addrspace(1)*> %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_types_vector(
+; CHECK-NEXT: bitcast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(1)*>
+; CHECK-NEXT: addrspacecast <4 x float addrspace(1)*> %1 to <4 x float addrspace(2)*>
+; CHECK-NEXT: ret
+ %y = addrspacecast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(2)*>
+ ret <4 x float addrspace(2)*> %y
+}
+
+define i32 @canonicalize_addrspacecast([16 x i32] addrspace(1)* %arr) {
+; CHECK-LABEL: @canonicalize_addrspacecast(
+; CHECK-NEXT: getelementptr inbounds [16 x i32] addrspace(1)* %arr, i32 0, i32 0
+; CHECK-NEXT: addrspacecast i32 addrspace(1)* %{{[a-zA-Z0-9]+}} to i32*
+; CHECK-NEXT: load i32*
+; CHECK-NEXT: ret i32
+ %p = addrspacecast [16 x i32] addrspace(1)* %arr to i32*
+ %v = load i32* %p
+ ret i32 %v
+}
+
@const_array = addrspace(2) constant [60 x i8] [i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
diff --git a/test/Transforms/InstCombine/align-2d-gep.ll b/test/Transforms/InstCombine/align-2d-gep.ll
index 5bca46d..f6a8776 100644
--- a/test/Transforms/InstCombine/align-2d-gep.ll
+++ b/test/Transforms/InstCombine/align-2d-gep.ll
@@ -31,7 +31,7 @@ bb1:
store <2 x double><double 0.0, double 0.0>, <2 x double>* %r, align 8
%indvar.next = add i64 %j, 2
- %exitcond = icmp eq i64 %indvar.next, 557
+ %exitcond = icmp eq i64 %indvar.next, 556
br i1 %exitcond, label %bb11, label %bb1
bb11:
diff --git a/test/Transforms/InstCombine/bitcast-alias-function.ll b/test/Transforms/InstCombine/bitcast-alias-function.ll
index 284960b..a6b56f9 100644
--- a/test/Transforms/InstCombine/bitcast-alias-function.ll
+++ b/test/Transforms/InstCombine/bitcast-alias-function.ll
@@ -6,46 +6,46 @@ target datalayout = "e-p:32:32:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16
; Cases that should be bitcast
; Test cast between scalars with same bit sizes
-@alias_i32_to_f32 = alias float (float), i32 (i32)* @func_i32
+@alias_i32_to_f32 = alias bitcast (i32 (i32)* @func_i32 to float (float)*)
; Test cast between vectors with same number of elements and bit sizes
-@alias_v2i32_to_v2f32 = alias <2 x float> (<2 x float>), <2 x i32> (<2 x i32>)* @func_v2i32
+@alias_v2i32_to_v2f32 = alias bitcast (<2 x i32> (<2 x i32>)* @func_v2i32 to <2 x float> (<2 x float>)*)
; Test cast from vector to scalar with same number of bits
-@alias_v2f32_to_i64 = alias <2 x float> (<2 x float>), i64 (i64)* @func_i64
+@alias_v2f32_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <2 x float> (<2 x float>)*)
; Test cast from scalar to vector with same number of bits
-@alias_i64_to_v2f32 = alias i64 (i64), <2 x float> (<2 x float>)* @func_v2f32
+@alias_i64_to_v2f32 = alias bitcast (<2 x float> (<2 x float>)* @func_v2f32 to i64 (i64)*)
; Test cast between vectors of pointers
-@alias_v2i32p_to_v2i64p = alias <2 x i64*> (<2 x i64*>), <2 x i32*> (<2 x i32*>)* @func_v2i32p
+@alias_v2i32p_to_v2i64p = alias bitcast (<2 x i32*> (<2 x i32*>)* @func_v2i32p to <2 x i64*> (<2 x i64*>)*)
; Cases that should be invalid and unchanged
; Test cast between scalars with different bit sizes
-@alias_i64_to_f32 = alias float (float), i64 (i64)* @func_i64
+@alias_i64_to_f32 = alias bitcast (i64 (i64)* @func_i64 to float (float)*)
; Test cast between vectors with different bit sizes but the
; same number of elements
-@alias_v2i64_to_v2f32 = alias <2 x float> (<2 x float>), <2 x i64> (<2 x i64>)* @func_v2i64
+@alias_v2i64_to_v2f32 = alias bitcast (<2 x i64> (<2 x i64>)* @func_v2i64 to <2 x float> (<2 x float>)*)
; Test cast between vectors with same number of bits and different
; numbers of elements
-@alias_v2i32_to_v4f32 = alias <4 x float> (<4 x float>), <2 x i32> (<2 x i32>)* @func_v2i32
+@alias_v2i32_to_v4f32 = alias bitcast (<2 x i32> (<2 x i32>)* @func_v2i32 to <4 x float> (<4 x float>)*)
; Test cast between scalar and vector with different number of bits
-@alias_i64_to_v4f32 = alias i64 (i64), <4 x float> (<4 x float>)* @func_v4f32
+@alias_i64_to_v4f32 = alias bitcast (<4 x float> (<4 x float>)* @func_v4f32 to i64 (i64)*)
; Test cast between vector and scalar with different number of bits
-@alias_v4f32_to_i64 = alias <4 x float> (<4 x float>), i64 (i64)* @func_i64
+@alias_v4f32_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <4 x float> (<4 x float>)*)
; Test cast from scalar to vector of pointers with same number of bits
; We don't know the pointer size at this point, so this can't be done
-@alias_i64_to_v2i32p = alias i64 (i64), <2 x i32*> (<2 x i32*>)* @func_v2i32p
+@alias_i64_to_v2i32p = alias bitcast (<2 x i32*> (<2 x i32*>)* @func_v2i32p to i64 (i64)*)
; Test cast between vector of pointers and scalar with different number of bits
-@alias_v4i32p_to_i64 = alias <4 x i32*> (<4 x i32*>), i64 (i64)* @func_i64
+@alias_v4i32p_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <4 x i32*> (<4 x i32*>)*)
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 4fab92f..0cbfbb0 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -370,7 +370,7 @@ define zeroext i64 @test43(i8 zeroext %on_off) nounwind readonly {
ret i64 %C ;; Should be (add (zext i8 -> i64), -1)
; CHECK-LABEL: @test43(
; CHECK-NEXT: %A = zext i8 %on_off to i64
-; CHECK-NEXT: %B = add i64 %A, -1
+; CHECK-NEXT: %B = add nsw i64 %A, -1
; CHECK-NEXT: ret i64 %B
}
diff --git a/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
index 9f21d54..7fac78a 100644
--- a/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
+++ b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
@@ -230,3 +230,13 @@ define i32 @constant_through_array_as_ptrs() {
%b = load i32 addrspace(1)* %a, align 4
ret i32 %b
}
+
+@shared_mem = external addrspace(3) global [0 x i8]
+
+define float @canonicalize_addrspacecast(i32 %i) {
+; CHECK-LABEL: @canonicalize_addrspacecast
+; CHECK-NEXT: getelementptr inbounds float* addrspacecast (float addrspace(3)* bitcast ([0 x i8] addrspace(3)* @shared_mem to float addrspace(3)*) to float*), i32 %i
+ %p = getelementptr inbounds float* addrspacecast ([0 x i8] addrspace(3)* @shared_mem to float*), i32 %i
+ %v = load float* %p
+ ret float %v
+}
diff --git a/test/Transforms/InstCombine/descale-zero.ll b/test/Transforms/InstCombine/descale-zero.ll
new file mode 100644
index 0000000..7990fdb
--- /dev/null
+++ b/test/Transforms/InstCombine/descale-zero.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define internal i8* @descale_zero() {
+entry:
+; CHECK: load i16** inttoptr (i64 48 to i16**), align 16
+; CHECK-NEXT: bitcast i16*
+; CHECK-NEXT: ret i8*
+ %i16_ptr = load i16** inttoptr (i64 48 to i16**), align 16
+ %num = load i64* inttoptr (i64 64 to i64*), align 64
+ %num_times_2 = shl i64 %num, 1
+ %num_times_2_plus_4 = add i64 %num_times_2, 4
+ %i8_ptr = bitcast i16* %i16_ptr to i8*
+ %i8_ptr_num_times_2_plus_4 = getelementptr i8* %i8_ptr, i64 %num_times_2_plus_4
+ %num_times_neg2 = mul i64 %num, -2
+ %num_times_neg2_minus_4 = add i64 %num_times_neg2, -4
+ %addr = getelementptr i8* %i8_ptr_num_times_2_plus_4, i64 %num_times_neg2_minus_4
+ ret i8* %addr
+}
diff --git a/test/Transforms/InstSimplify/2010-12-20-Distribute.ll b/test/Transforms/InstCombine/distribute.ll
index 9ea0a5e..e6360f8 100644
--- a/test/Transforms/InstSimplify/2010-12-20-Distribute.ll
+++ b/test/Transforms/InstCombine/distribute.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instsimplify -S | FileCheck %s
+; RUN: opt < %s -instcombine -S | FileCheck %s
define i32 @factorize(i32 %x, i32 %y) {
; CHECK-LABEL: @factorize(
@@ -28,27 +28,32 @@ define i32 @factorize3(i32 %x, i32 %a, i32 %b) {
%r = or i32 %x, %b
%z = and i32 %l, %r
ret i32 %z
-; CHECK: ret i32 %r
+; CHECK: %z = or i32 %b, %x
+; CHECK: ret i32 %z
}
define i32 @factorize4(i32 %x, i32 %y) {
; CHECK-LABEL: @factorize4(
+; ((Y << 1) * X) - (X * Y) -> (X * (Y * 2 - Y)) -> (X * Y)
%sh = shl i32 %y, 1
%ml = mul i32 %sh, %x
%mr = mul i32 %x, %y
%s = sub i32 %ml, %mr
ret i32 %s
-; CHECK: ret i32 %mr
+; CHECK: %s = mul i32 %y, %x
+; CHECK: ret i32 %s
}
define i32 @factorize5(i32 %x, i32 %y) {
; CHECK-LABEL: @factorize5(
+; ((Y * 2) * X) - (X * Y) -> (X * Y)
%sh = mul i32 %y, 2
%ml = mul i32 %sh, %x
%mr = mul i32 %x, %y
%s = sub i32 %ml, %mr
ret i32 %s
-; CHECK: ret i32 %mr
+; CHECK: %s = mul i32 %y, %x
+; CHECK: ret i32 %s
}
define i32 @expand(i32 %x) {
@@ -58,5 +63,6 @@ define i32 @expand(i32 %x) {
%b = or i32 %a, 2
%c = and i32 %b, 1
ret i32 %c
+; CHECK: %a = and i32 %x, 1
; CHECK: ret i32 %a
}
diff --git a/test/Transforms/InstCombine/ffs-1.ll b/test/Transforms/InstCombine/ffs-1.ll
index 1dec11d..c8763dc 100644
--- a/test/Transforms/InstCombine/ffs-1.ll
+++ b/test/Transforms/InstCombine/ffs-1.ll
@@ -103,7 +103,7 @@ define i32 @test_simplify13(i32 %x) {
; CHECK-LABEL: @test_simplify13(
%ret = call i32 @ffs(i32 %x)
; CHECK-NEXT: [[CTTZ:%[a-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-NEXT: [[INC:%[a-z0-9]+]] = add i32 [[CTTZ]], 1
+; CHECK-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i32 [[CTTZ]], 1
; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 %x, 0
; CHECK-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[INC]], i32 0
ret i32 %ret
@@ -114,7 +114,7 @@ define i32 @test_simplify14(i32 %x) {
; CHECK-LINUX-LABEL: @test_simplify14(
%ret = call i32 @ffsl(i32 %x)
; CHECK-LINUX-NEXT: [[CTTZ:%[a-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %x, i1 false)
-; CHECK-LINUX-NEXT: [[INC:%[a-z0-9]+]] = add i32 [[CTTZ]], 1
+; CHECK-LINUX-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i32 [[CTTZ]], 1
; CHECK-LINUX-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 %x, 0
; CHECK-LINUX-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[INC]], i32 0
ret i32 %ret
@@ -125,7 +125,7 @@ define i32 @test_simplify15(i64 %x) {
; CHECK-LINUX-LABEL: @test_simplify15(
%ret = call i32 @ffsll(i64 %x)
; CHECK-LINUX-NEXT: [[CTTZ:%[a-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %x, i1 false)
-; CHECK-LINUX-NEXT: [[INC:%[a-z0-9]+]] = add i64 [[CTTZ]], 1
+; CHECK-LINUX-NEXT: [[INC:%[a-z0-9]+]] = add nuw nsw i64 [[CTTZ]], 1
; CHECK-LINUX-NEXT: [[TRUNC:%[a-z0-9]+]] = trunc i64 [[INC]] to i32
; CHECK-LINUX-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i64 %x, 0
; CHECK-LINUX-NEXT: [[RET:%[a-z0-9]+]] = select i1 [[CMP]], i32 [[TRUNC]], i32 0
diff --git a/test/Transforms/InstCombine/gepphigep.ll b/test/Transforms/InstCombine/gepphigep.ll
new file mode 100644
index 0000000..9aab609
--- /dev/null
+++ b/test/Transforms/InstCombine/gepphigep.ll
@@ -0,0 +1,56 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+%struct1 = type { %struct2*, i32, i32, i32 }
+%struct2 = type { i32, i32 }
+
+define i32 @test1(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+ %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+ %tmp1 = load %struct2** %tmp, align 8
+ br i1 %tmp4, label %bb1, label %bb2
+
+bb1:
+ %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+ %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+ store i32 0, i32* %tmp11, align 4
+ br label %bb3
+
+bb2:
+ %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+ %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+ store i32 0, i32* %tmp21, align 4
+ br label %bb3
+
+bb3:
+ %phi = phi %struct2* [ %tmp10, %bb1 ], [ %tmp20, %bb2 ]
+ %tmp24 = getelementptr inbounds %struct2* %phi, i64 0, i32 1
+ %tmp25 = load i32* %tmp24, align 4
+ ret i32 %tmp25
+
+; CHECK-LABEL: @test1(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: %[[PHI:[0-9A-Za-z]+]] = phi i64 [ %tmp9, %bb1 ], [ %tmp19, %bb2 ]
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %[[PHI]], i32 1
+
+}
+
+define i32 @test2(%struct1* %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) {
+bb:
+ %tmp = getelementptr inbounds %struct1* %dm, i64 0, i32 0
+ %tmp1 = load %struct2** %tmp, align 8
+ %tmp10 = getelementptr inbounds %struct2* %tmp1, i64 %tmp9
+ %tmp11 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 0
+ store i32 0, i32* %tmp11, align 4
+ %tmp20 = getelementptr inbounds %struct2* %tmp1, i64 %tmp19
+ %tmp21 = getelementptr inbounds %struct2* %tmp20, i64 0, i32 0
+ store i32 0, i32* %tmp21, align 4
+ %tmp24 = getelementptr inbounds %struct2* %tmp10, i64 0, i32 1
+ %tmp25 = load i32* %tmp24, align 4
+ ret i32 %tmp25
+
+; CHECK-LABEL: @test2(
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp19, i32 0
+; CHECK: getelementptr inbounds %struct2* %tmp1, i64 %tmp9, i32 1
+}
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index ef0cb29..3240c6d 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -732,7 +732,8 @@ define i64 @test_gep_bitcast_array_same_size_element([100 x double]* %arr, i64 %
define i64 @test_gep_bitcast_array_same_size_element_addrspacecast([100 x double]* %arr, i64 %N) {
; CHECK-LABEL: @test_gep_bitcast_array_same_size_element_addrspacecast(
; CHECK: getelementptr [100 x double]* %arr, i64 0, i64 %V
-; CHECK-NEXT: %t = addrspacecast double*
+; CHECK-NEXT: bitcast double*
+; CHECK-NEXT: %t = addrspacecast i64*
; CHECK: load i64 addrspace(3)* %t
%cast = addrspacecast [100 x double]* %arr to i64 addrspace(3)*
%V = mul i64 %N, 8
@@ -802,10 +803,22 @@ define i16 @test41([3 x i32] addrspace(1)* %array) {
; CHECK-NEXT: ret i16 8
}
-define i32 addrspace(1)* @ascast_0_gep([128 x i32]* %p) nounwind {
+define i32 addrspace(1)* @ascast_0_gep(i32* %p) nounwind {
; CHECK-LABEL: @ascast_0_gep(
; CHECK-NOT: getelementptr
; CHECK: ret
+ %gep = getelementptr i32* %p, i32 0
+ %x = addrspacecast i32* %gep to i32 addrspace(1)*
+ ret i32 addrspace(1)* %x
+}
+
+; Do not merge the GEP and the addrspacecast, because it would undo the
+; addrspacecast canonicalization.
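+; (InstCombine canonicalizes an addrspacecast of an aggregate pointer into a
+; GEP in the source address space followed by the cast; see
+; @canonicalize_addrspacecast in addrspacecast.ll earlier in this patch.)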
+define i32 addrspace(1)* @ascast_0_0_gep([128 x i32]* %p) nounwind {
+; CHECK-LABEL: @ascast_0_0_gep(
+; CHECK-NEXT: getelementptr [128 x i32]
+; CHECK-NEXT: addrspacecast i32*
+; CHECK-NEXT: ret i32 addrspace(1)*
%gep = getelementptr [128 x i32]* %p, i32 0, i32 0
%x = addrspacecast i32* %gep to i32 addrspace(1)*
ret i32 addrspace(1)* %x
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index f45897c..26e144f 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1,7 +1,6 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout =
-"e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32:32-p3:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define i32 @test1(i32 %X) {
entry:
@@ -166,6 +165,14 @@ define i1 @test17(i32 %x) nounwind {
; CHECK-NEXT: %cmp = icmp ne i32 %x, 3
}
+define i1 @test17a(i32 %x) nounwind {
+ %shl = shl i32 1, %x
+ %and = and i32 %shl, 7
+ %cmp = icmp eq i32 %and, 0
+ ret i1 %cmp
+; CHECK-LABEL: @test17a(
+; CHECK-NEXT: %cmp = icmp ugt i32 %x, 2
+}
define i1 @test18(i32 %x) nounwind {
%sh = lshr i32 8, %x
@@ -194,6 +201,15 @@ define i1 @test20(i32 %x) nounwind {
; CHECK-NEXT: %cmp = icmp eq i32 %x, 3
}
+define i1 @test20a(i32 %x) nounwind {
+ %shl = shl i32 1, %x
+ %and = and i32 %shl, 7
+ %cmp = icmp ne i32 %and, 0
+ ret i1 %cmp
+; CHECK-LABEL: @test20a(
+; CHECK-NEXT: %cmp = icmp ult i32 %x, 3
+}
+
define i1 @test21(i8 %x, i8 %y) {
; CHECK-LABEL: @test21(
; CHECK-NOT: or i8
@@ -657,6 +673,49 @@ define i1 @test60_as1(i8 addrspace(1)* %foo, i64 %i, i64 %j) {
; CHECK-NEXT: ret i1
}
+; Same as test60, but look through an addrspacecast instead of a
+; bitcast. The new address space has the same pointer size.
+define i1 @test60_addrspacecast(i8* %foo, i64 %i, i64 %j) {
+ %bit = addrspacecast i8* %foo to i32 addrspace(3)*
+ %gep1 = getelementptr inbounds i32 addrspace(3)* %bit, i64 %i
+ %gep2 = getelementptr inbounds i8* %foo, i64 %j
+ %cast1 = addrspacecast i32 addrspace(3)* %gep1 to i8*
+ %cmp = icmp ult i8* %cast1, %gep2
+ ret i1 %cmp
+; CHECK-LABEL: @test60_addrspacecast(
+; CHECK-NEXT: %gep1.idx = shl nuw i64 %i, 2
+; CHECK-NEXT: icmp slt i64 %gep1.idx, %j
+; CHECK-NEXT: ret i1
+}
+
+define i1 @test60_addrspacecast_smaller(i8* %foo, i16 %i, i64 %j) {
+ %bit = addrspacecast i8* %foo to i32 addrspace(1)*
+ %gep1 = getelementptr inbounds i32 addrspace(1)* %bit, i16 %i
+ %gep2 = getelementptr inbounds i8* %foo, i64 %j
+ %cast1 = addrspacecast i32 addrspace(1)* %gep1 to i8*
+ %cmp = icmp ult i8* %cast1, %gep2
+ ret i1 %cmp
+; CHECK-LABEL: @test60_addrspacecast_smaller(
+; CHECK-NEXT: %gep1.idx = shl nuw i16 %i, 2
+; CHECK-NEXT: trunc i64 %j to i16
+; CHECK-NEXT: icmp sgt i16 %1, %gep1.idx
+; CHECK-NEXT: ret i1
+}
+
+define i1 @test60_addrspacecast_larger(i8 addrspace(1)* %foo, i32 %i, i16 %j) {
+ %bit = addrspacecast i8 addrspace(1)* %foo to i32 addrspace(2)*
+ %gep1 = getelementptr inbounds i32 addrspace(2)* %bit, i32 %i
+ %gep2 = getelementptr inbounds i8 addrspace(1)* %foo, i16 %j
+ %cast1 = addrspacecast i32 addrspace(2)* %gep1 to i8 addrspace(1)*
+ %cmp = icmp ult i8 addrspace(1)* %cast1, %gep2
+ ret i1 %cmp
+; CHECK-LABEL: @test60_addrspacecast_larger(
+; CHECK-NEXT: %gep1.idx = shl nuw i32 %i, 2
+; CHECK-NEXT: trunc i32 %gep1.idx to i16
+; CHECK-NEXT: icmp slt i16 %1, %j
+; CHECK-NEXT: ret i1
+}
+
define i1 @test61(i8* %foo, i64 %i, i64 %j) {
%bit = bitcast i8* %foo to i32*
%gep1 = getelementptr i32* %bit, i64 %i
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 91c4470..9b58d93 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -3,6 +3,7 @@
%overflow.result = type {i8, i1}
declare %overflow.result @llvm.uadd.with.overflow.i8(i8, i8)
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
declare %overflow.result @llvm.umul.with.overflow.i8(i8, i8)
declare double @llvm.powi.f64(double, i32) nounwind readonly
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
@@ -89,6 +90,18 @@ define i8 @uaddtest7(i8 %A, i8 %B) {
; CHECK-NEXT: ret i8 %z
}
+; PR20194
+define { i32, i1 } @saddtest1(i8 %a, i8 %b) {
+ %A = sext i8 %a to i32
+ %B = sext i8 %b to i32
+ %x = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %A, i32 %B)
+ ret { i32, i1 } %x
+; CHECK-LABEL: @saddtest1
+; CHECK: %x = add nsw i32 %A, %B
+; CHECK-NEXT: %1 = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %x, 0
+; CHECK-NEXT: ret { i32, i1 } %1
+}
+
define i8 @umultest1(i8 %A, i1* %overflowPtr) {
%x = call %overflow.result @llvm.umul.with.overflow.i8(i8 0, i8 %A)
diff --git a/test/Transforms/InstCombine/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll
index b5a0ab8..3bc1d36 100644
--- a/test/Transforms/InstCombine/memcpy-from-global.ll
+++ b/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -78,7 +78,8 @@ define void @test2_addrspacecast() {
; %A alloca is deleted
; This doesn't exactly match what test2 does, because folding the type
; cast into the alloca doesn't work for the addrspacecast yet.
-; CHECK-NEXT: alloca %T
+; CHECK-NEXT: alloca [124 x i8]
+; CHECK-NEXT: getelementptr
; CHECK-NEXT: addrspacecast
; use @G instead of %A
diff --git a/test/Transforms/InstCombine/overflow-mul.ll b/test/Transforms/InstCombine/overflow-mul.ll
index 04019ae..cbb2f5f 100644
--- a/test/Transforms/InstCombine/overflow-mul.ll
+++ b/test/Transforms/InstCombine/overflow-mul.ll
@@ -162,3 +162,14 @@ entry:
ret i32 %retval
}
+define <4 x i32> @pr20113(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: @pr20113
+; CHECK-NOT: mul.with.overflow
+; CHECK: ret
+ %vmovl.i.i726 = zext <4 x i16> %a to <4 x i32>
+ %vmovl.i.i712 = zext <4 x i16> %b to <4 x i32>
+ %mul.i703 = mul <4 x i32> %vmovl.i.i712, %vmovl.i.i726
+ %tmp = icmp sge <4 x i32> %mul.i703, zeroinitializer
+ %vcgez.i = sext <4 x i1> %tmp to <4 x i32>
+ ret <4 x i32> %vcgez.i
+}
diff --git a/test/Transforms/InstCombine/pr20059.ll b/test/Transforms/InstCombine/pr20059.ll
new file mode 100644
index 0000000..0ef3159
--- /dev/null
+++ b/test/Transforms/InstCombine/pr20059.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; In PR20059 ( http://llvm.org/pr20059 ), shufflevector operations are reordered/removed
+; for an srem operation. This is not a valid optimization because it may cause a trap
+; on div-by-zero.
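+; For example, with %p2 = <i32 1, i32 0, i32 0, i32 0> the original code only
+; ever divides by the splatted lane 0, but srem on the unsplatted operands
+; would divide by zero in the other lanes.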
+
+; CHECK-LABEL: @do_not_reorder
+; CHECK: %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: %retval = srem <4 x i32> %splat1, %splat2
+define <4 x i32> @do_not_reorder(<4 x i32> %p1, <4 x i32> %p2) {
+ %splat1 = shufflevector <4 x i32> %p1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %splat2 = shufflevector <4 x i32> %p2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %retval = srem <4 x i32> %splat1, %splat2
+ ret <4 x i32> %retval
+}
diff --git a/test/Transforms/InstCombine/pr20079.ll b/test/Transforms/InstCombine/pr20079.ll
new file mode 100644
index 0000000..ce9c4de
--- /dev/null
+++ b/test/Transforms/InstCombine/pr20079.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+@b = internal global [1 x i32] zeroinitializer, align 4
+@c = internal global i32 0, align 4
+
+; CHECK-LABEL: @fn1
+; CHECK-NEXT: ret i32 0
+define i32 @fn1(i32 %a) {
+ ret i32 0
+}
diff --git a/test/Transforms/InstCombine/r600-intrinsics.ll b/test/Transforms/InstCombine/r600-intrinsics.ll
new file mode 100644
index 0000000..1db6b0d
--- /dev/null
+++ b/test/Transforms/InstCombine/r600-intrinsics.ll
@@ -0,0 +1,47 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_1
+; CHECK-NEXT: ret float 1.000000e+00
+define float @test_constant_fold_rcp_f32_1() nounwind {
+ %val = call float @llvm.AMDGPU.rcp.f32(float 1.0) nounwind readnone
+ ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_1
+; CHECK-NEXT: ret double 1.000000e+00
+define double @test_constant_fold_rcp_f64_1() nounwind {
+ %val = call double @llvm.AMDGPU.rcp.f64(double 1.0) nounwind readnone
+ ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_half
+; CHECK-NEXT: ret float 2.000000e+00
+define float @test_constant_fold_rcp_f32_half() nounwind {
+ %val = call float @llvm.AMDGPU.rcp.f32(float 0.5) nounwind readnone
+ ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_half
+; CHECK-NEXT: ret double 2.000000e+00
+define double @test_constant_fold_rcp_f64_half() nounwind {
+ %val = call double @llvm.AMDGPU.rcp.f64(double 0.5) nounwind readnone
+ ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_43
+; CHECK-NEXT: call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01)
+define float @test_constant_fold_rcp_f32_43() nounwind {
+ %val = call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) nounwind readnone
+ ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_43
+; CHECK-NEXT: call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01)
+define double @test_constant_fold_rcp_f64_43() nounwind {
+ %val = call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) nounwind readnone
+ ret double %val
+}
+
diff --git a/test/Transforms/InstCombine/rem.ll b/test/Transforms/InstCombine/rem.ll
index 9f07702..0595a67 100644
--- a/test/Transforms/InstCombine/rem.ll
+++ b/test/Transforms/InstCombine/rem.ll
@@ -127,7 +127,7 @@ define i64 @test14(i64 %x, i32 %y) {
; CHECK-LABEL: @test14(
; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, %y
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL]] to i64
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT]], -1
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[ZEXT]], -1
; CHECK-NEXT: [[AND:%.*]] = and i64 [[ADD]], %x
; CHECK-NEXT: ret i64 [[AND]]
%shl = shl i32 1, %y
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 2213be1..d625f3b 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -281,7 +281,7 @@ define i32 @test15i(i32 %X) {
; CHECK-NEXT: %t1 = shl i32 %X, 8
; CHECK-NEXT: %1 = and i32 %t1, 512
; CHECK-NEXT: %2 = xor i32 %1, 512
-; CHECK-NEXT: %3 = add i32 %2, 577
+; CHECK-NEXT: %3 = add nuw nsw i32 %2, 577
; CHECK-NEXT: ret i32 %3
}
@@ -294,7 +294,7 @@ define i32 @test15j(i32 %X) {
; CHECK-LABEL: @test15j(
; CHECK-NEXT: %t1 = shl i32 %X, 8
; CHECK-NEXT: %1 = and i32 %t1, 512
-; CHECK-NEXT: %2 = add i32 %1, 577
+; CHECK-NEXT: %2 = add nuw nsw i32 %1, 577
; CHECK-NEXT: ret i32 %2
}
@@ -521,7 +521,7 @@ define i32 @test35(i32 %x) {
; CHECK-LABEL: @test35(
; CHECK: ashr i32 %x, 31
; CHECK: and i32 {{.*}}, 40
-; CHECK: add i32 {{.*}}, 60
+; CHECK: add nuw nsw i32 {{.*}}, 60
; CHECK: ret
}
@@ -532,7 +532,7 @@ define i32 @test36(i32 %x) {
; CHECK-LABEL: @test36(
; CHECK: ashr i32 %x, 31
; CHECK: and i32 {{.*}}, -40
-; CHECK: add i32 {{.*}}, 100
+; CHECK: add nsw i32 {{.*}}, 100
; CHECK: ret
}
@@ -996,6 +996,111 @@ define <2 x i32> @select_icmp_eq_and_1_0_or_vector_of_2s(i32 %x, <2 x i32> %y) {
ret <2 x i32> %select
}
+; CHECK-LABEL: @select_icmp_and_8_eq_0_or_8(
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, 8
+; CHECK-NEXT: ret i32 [[OR]]
+define i32 @select_icmp_and_8_eq_0_or_8(i32 %x) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %or = or i32 %x, 8
+ %or.x = select i1 %cmp, i32 %or, i32 %x
+ ret i32 %or.x
+}
+
+; CHECK-LABEL: @select_icmp_and_8_ne_0_xor_8(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9
+; CHECK-NEXT: ret i32 [[AND]]
+define i32 @select_icmp_and_8_ne_0_xor_8(i32 %x) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i32 %x, 8
+ %x.xor = select i1 %cmp, i32 %x, i32 %xor
+ ret i32 %x.xor
+}
+
+; CHECK-LABEL: @select_icmp_and_8_eq_0_xor_8(
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, 8
+; CHECK-NEXT: ret i32 [[OR]]
+define i32 @select_icmp_and_8_eq_0_xor_8(i32 %x) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i32 %x, 8
+ %xor.x = select i1 %cmp, i32 %xor, i32 %x
+ ret i32 %xor.x
+}
+
+; CHECK-LABEL: @select_icmp_and_8_ne_0_and_not_8(
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9
+; CHECK-NEXT: ret i32 [[AND]]
+define i32 @select_icmp_and_8_ne_0_and_not_8(i32 %x) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %and1 = and i32 %x, -9
+ %x.and1 = select i1 %cmp, i32 %x, i32 %and1
+ ret i32 %x.and1
+}
+
+; CHECK-LABEL: @select_icmp_and_8_eq_0_and_not_8(
+; CHECK-NEXT: ret i32 %x
+define i32 @select_icmp_and_8_eq_0_and_not_8(i32 %x) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %and1 = and i32 %x, -9
+ %and1.x = select i1 %cmp, i32 %and1, i32 %x
+ ret i32 %and1.x
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_xor_8(
+; CHECK: select i1 %cmp, i64 %y, i64 %xor
+define i64 @select_icmp_x_and_8_eq_0_y_xor_8(i32 %x, i64 %y) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i64 %y, 8
+ %y.xor = select i1 %cmp, i64 %y, i64 %xor
+ ret i64 %y.xor
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_and_not_8(
+; CHECK: select i1 %cmp, i64 %y, i64 %and1
+define i64 @select_icmp_x_and_8_eq_0_y_and_not_8(i32 %x, i64 %y) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %and1 = and i64 %y, -9
+ %y.and1 = select i1 %cmp, i64 %y, i64 %and1
+ ret i64 %y.and1
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_xor_8(
+; CHECK: select i1 %cmp, i64 %xor, i64 %y
+define i64 @select_icmp_x_and_8_ne_0_y_xor_8(i32 %x, i64 %y) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i64 %y, 8
+ %xor.y = select i1 %cmp, i64 %xor, i64 %y
+ ret i64 %xor.y
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_and_not_8(
+; CHECK: select i1 %cmp, i64 %and1, i64 %y
+define i64 @select_icmp_x_and_8_ne_0_y_and_not_8(i32 %x, i64 %y) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %and1 = and i64 %y, -9
+ %and1.y = select i1 %cmp, i64 %and1, i64 %y
+ ret i64 %and1.y
+}
+
+; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_or_8(
+; CHECK: xor i64 %1, 8
+; CHECK: or i64 %2, %y
+define i64 @select_icmp_x_and_8_ne_0_y_or_8(i32 %x, i64 %y) {
+ %and = and i32 %x, 8
+ %cmp = icmp eq i32 %and, 0
+ %or = or i64 %y, 8
+ %or.y = select i1 %cmp, i64 %or, i64 %y
+ ret i64 %or.y
+}
+
define i32 @test65(i64 %x) {
%1 = and i64 %x, 16
%2 = icmp ne i64 %1, 0
@@ -1130,4 +1235,4 @@ define i32 @test75(i32 %x) {
; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ult i32 %x, 68
; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 68, i32 %x
; CHECK-NEXT: ret i32 [[SEL]]
-}
\ No newline at end of file
+}
diff --git a/test/Transforms/InstCombine/sext.ll b/test/Transforms/InstCombine/sext.ll
index b8dfe22..f04afcc 100644
--- a/test/Transforms/InstCombine/sext.ll
+++ b/test/Transforms/InstCombine/sext.ll
@@ -145,7 +145,7 @@ define i32 @test13(i32 %x) nounwind {
; CHECK-LABEL: @test13(
; CHECK-NEXT: %and = lshr i32 %x, 3
; CHECK-NEXT: %1 = and i32 %and, 1
-; CHECK-NEXT: %sext = add i32 %1, -1
+; CHECK-NEXT: %sext = add nsw i32 %1, -1
; CHECK-NEXT: ret i32 %sext
}
@@ -157,7 +157,7 @@ define i32 @test14(i16 %x) nounwind {
; CHECK-LABEL: @test14(
; CHECK-NEXT: %and = lshr i16 %x, 4
; CHECK-NEXT: %1 = and i16 %and, 1
-; CHECK-NEXT: %sext = add i16 %1, -1
+; CHECK-NEXT: %sext = add nsw i16 %1, -1
; CHECK-NEXT: %ext = sext i16 %sext to i32
; CHECK-NEXT: ret i32 %ext
}
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 41d803c8..67b7c49 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -444,3 +444,23 @@ define <2 x i64> @test36(<2 x i64> %A) {
; CHECK-NEXT: %sub = mul <2 x i64> %A, <i64 7, i64 15>
; CHECK-NEXT: ret <2 x i64> %sub
}
+
+define <2 x i32> @test37(<2 x i32> %A) {
+ %div = sdiv <2 x i32> %A, <i32 -2147483648, i32 -2147483648>
+ %sub = sub nsw <2 x i32> zeroinitializer, %div
+ ret <2 x i32> %sub
+; CHECK-LABEL: @test37(
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq <2 x i32> %A, <i32 -2147483648, i32 -2147483648>
+; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[ICMP]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[SEXT]]
+}
+
+define i32 @test38(i32 %A) {
+ %div = sdiv i32 %A, -2147483648
+ %sub = sub nsw i32 0, %div
+ ret i32 %sub
+; CHECK-LABEL: @test38(
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 %A, -2147483648
+; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[ICMP]] to i32
+; CHECK-NEXT: ret i32 [[SEXT]]
+}
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index fc0f8bd..eb4e9d6 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -405,3 +405,12 @@ define i32 @pr19737(<4 x i32> %in0) {
%rv = extractelement <4 x i32> %and.i, i32 0
ret i32 %rv
}
+
+define <4 x i32> @pr20114(<4 x i32> %__mask) {
+; CHECK-LABEL: @pr20114
+; CHECK: shufflevector
+; CHECK: and
+ %mask01.i = shufflevector <4 x i32> %__mask, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+ %masked_new.i.i.i = and <4 x i32> bitcast (<2 x i64> <i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @pr20114 to i64)> to <4 x i32>), %mask01.i
+ ret <4 x i32> %masked_new.i.i.i
+}
diff --git a/test/Transforms/InstCombine/zext-bool-add-sub.ll b/test/Transforms/InstCombine/zext-bool-add-sub.ll
index d7f338b..6fa4d70 100644
--- a/test/Transforms/InstCombine/zext-bool-add-sub.ll
+++ b/test/Transforms/InstCombine/zext-bool-add-sub.ll
@@ -6,7 +6,7 @@ entry:
; CHECK-LABEL: @a(
; CHECK: [[TMP1:%.*]] = sext i1 %y to i32
; CHECK: [[TMP2:%.*]] = select i1 %x, i32 2, i32 1
-; CHECK-NEXT: add i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: add nsw i32 [[TMP2]], [[TMP1]]
%conv = zext i1 %x to i32
%conv3 = zext i1 %y to i32
%conv3.neg = sub i32 0, %conv3
diff --git a/test/Transforms/InstSimplify/apint-or.ll b/test/Transforms/InstSimplify/apint-or.ll
new file mode 100644
index 0000000..5d314db
--- /dev/null
+++ b/test/Transforms/InstSimplify/apint-or.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -instsimplify -S | not grep or
+
+; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0.
+define i39 @test1(i39 %V, i39 %M) {
+ ;; If we have: ((V + N) & C1) | (V & C2)
+ ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ ;; replace with V+N.
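+ ;; (Since (N & C2) == 0, adding N cannot change the low C2 bits of V, so
+ ;; (V & C2) == ((V+N) & C2) and or-ing the two masked halves rebuilds V+N.)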
+ %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943
+ %N = and i39 %M, 274877906944
+ %A = add i39 %V, %N
+ %B = and i39 %A, %C1
+ %D = and i39 %V, 274877906943
+ %R = or i39 %B, %D
+ ret i39 %R
+; CHECK-LABEL: @test1
+; CHECK-NEXT: and {{.*}}, -274877906944
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+; Test the case where integer BitWidth > 64 && BitWidth <= 1024.
+define i399 @test2(i399 %V, i399 %M) {
+ ;; If we have: ((V + N) & C1) | (V & C2)
+ ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ ;; replace with V+N.
+ %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943
+ %N = and i399 %M, 18446742974197923840
+ %A = add i399 %V, %N
+ %B = and i399 %A, %C1
+ %D = and i399 %V, 274877906943
+ %R = or i399 %B, %D
+ ret i399 %R
+; CHECK-LABEL: @test2
+; CHECK-NEXT: and {{.*}}, 18446742974197923840
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 105e244..7d0cd9c 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -883,3 +883,33 @@ define i1 @returns_nonnull() {
; CHECK: ret i1 false
}
+; If a bit is known to be zero for A and known to be one for B,
+; then A and B cannot be equal.
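+; Here %b = mul nsw i32 %a, -2 is always even (low bit known zero), while the
+; constant 1 has its low bit set, so eq folds to false (and ne, below, to true).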
+define i1 @icmp_eq_const(i32 %a) nounwind {
+ %b = mul nsw i32 %a, -2
+ %c = icmp eq i32 %b, 1
+ ret i1 %c
+
+; CHECK-LABEL: @icmp_eq_const
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @icmp_ne_const(i32 %a) nounwind {
+ %b = mul nsw i32 %a, -2
+ %c = icmp ne i32 %b, 1
+ ret i1 %c
+
+; CHECK-LABEL: @icmp_ne_const
+; CHECK-NEXT: ret i1 true
+}
+
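+; No fold is possible here: sdiv -2147483648, 2 is exactly -1073741824, so
+; the compare is not always true, and other divisors make it true, so it is
+; not always false either.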
+define i1 @icmp_sdiv_int_min(i32 %a) {
+ %div = sdiv i32 -2147483648, %a
+ %cmp = icmp ne i32 %div, -1073741824
+ ret i1 %cmp
+
+; CHECK-LABEL: @icmp_sdiv_int_min
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 -2147483648, %a
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[DIV]], -1073741824
+; CHECK-NEXT: ret i1 [[CMP]]
+}
diff --git a/test/Transforms/JumpThreading/pr15851_hang.ll b/test/Transforms/JumpThreading/pr15851_hang.ll
new file mode 100644
index 0000000..0484bc9
--- /dev/null
+++ b/test/Transforms/JumpThreading/pr15851_hang.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -jump-threading < %s | FileCheck %s
+
+; CHECK-LABEL: @f(
+; CHECK-LABEL: entry
+; CHECK: ret void
+; CHECK-NOT: for.cond1
+; CHECK-NOT: for.body
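+; (The blocks below are unreachable from entry and form a dead cycle;
+; threading through them is what appears to have made jump threading hang in
+; PR15851, so they should simply be removed.)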
+
+define void @f() {
+entry:
+ ret void
+
+for.cond1:
+ %i.025 = phi i32 [ %inc, %for.body ], [ %inc, %for.body ], [ 1, %for.cond1 ]
+ %cmp = icmp slt i32 %i.025, 2
+ br i1 %cmp, label %for.body, label %for.cond1
+
+for.body:
+ %inc = add nsw i32 %i.025, 0
+ %a = icmp ugt i32 %inc, 2
+ br i1 %a, label %for.cond1, label %for.cond1
+}
diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll
index 201e604..545e86c 100644
--- a/test/Transforms/JumpThreading/select.ll
+++ b/test/Transforms/JumpThreading/select.ll
@@ -127,7 +127,7 @@ L4:
; CHECK: test_switch_default
; CHECK: entry:
; CHECK: load
-; CHECK: switch
+; CHECK: icmp
; CHECK: [[THREADED:[A-Za-z.0-9]+]]:
; CHECK: store
; CHECK: br
diff --git a/test/Transforms/LICM/extra-copies.ll b/test/Transforms/LICM/extra-copies.ll
new file mode 100644
index 0000000..ef52f9f
--- /dev/null
+++ b/test/Transforms/LICM/extra-copies.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; PR19835
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @f(i32 %x) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %storemerge4 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %mul = mul nsw i32 %x, %x
+ %add2 = add nsw i32 %mul, %x
+ %mul3 = add nsw i32 %add2, %mul
+ %inc = add nsw i32 %storemerge4, 1
+ %cmp = icmp slt i32 %inc, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %a9.0.lcssa = phi i32 [ %mul3, %for.body ]
+ ret i32 %a9.0.lcssa
+}
+
+; Test that there is exactly one copy of mul nsw i32 %x, %x in the exit block.
+; CHECK: define i32 @f(i32 [[X:%.*]])
+; CHECK: for.end:
+; CHECK-NOT: mul nsw i32 [[X]], [[X]]
+; CHECK: mul nsw i32 [[X]], [[X]]
+; CHECK-NOT: mul nsw i32 [[X]], [[X]]
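+; (The CHECK-NOT / CHECK / CHECK-NOT sandwich is what pins the match count to
+; exactly one.)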
diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll
new file mode 100644
index 0000000..639dca5
--- /dev/null
+++ b/test/Transforms/LICM/hoist-bitcast-load.ll
@@ -0,0 +1,201 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure the basic alloca pointer hoisting works:
+; CHECK-LABEL: @test1
+; CHECK: load i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test1(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ %c = alloca i32
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %1 = load i32* %c, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %2 = load i32* %arrayidx3, align 4
+ %mul = mul nsw i32 %2, %1
+ store i32 %mul, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ ret void
+}
+
+; Make sure the basic alloca pointer hoisting works through a bitcast to a
+; pointer to a smaller type:
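+; (An i32 load is still within the bounds of the i64 alloca, so speculating
+; it into the preheader is safe.)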
+; CHECK-LABEL: @test2
+; CHECK: load i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test2(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ %ca = alloca i64
+ %c = bitcast i64* %ca to i32*
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %1 = load i32* %c, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %2 = load i32* %arrayidx3, align 4
+ %mul = mul nsw i32 %2, %1
+ store i32 %mul, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ ret void
+}
+
+; Make sure the basic alloca pointer hoisting works through a bitcast to a
+; pointer to a smaller type (where the bitcast also needs to be hoisted):
+; CHECK-LABEL: @test3
+; CHECK: load i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test3(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ %ca = alloca i64
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %c = bitcast i64* %ca to i32*
+ %1 = load i32* %c, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %2 = load i32* %arrayidx3, align 4
+ %mul = mul nsw i32 %2, %1
+ store i32 %mul, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ ret void
+}
+
+; Make sure the basic alloca pointer hoisting does not happen through a bitcast
+; to a pointer to a larger type:
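+; (Here an i32 load would read past the end of the i16 alloca, so the load
+; cannot safely be speculated out of the guarded block.)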
+; CHECK-LABEL: @test4
+; CHECK: for.body:
+; CHECK: load i32* %c, align 4
+
+; Function Attrs: nounwind uwtable
+define void @test4(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ %ca = alloca i16
+ %c = bitcast i16* %ca to i32*
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %1 = load i32* %c, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %2 = load i32* %arrayidx3, align 4
+ %mul = mul nsw i32 %2, %1
+ store i32 %mul, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ ret void
+}
+
+; Don't crash on bitcasts to unsized types.
+; CHECK-LABEL: @test5
+; CHECK: for.body:
+; CHECK: load i32* %c, align 4
+
+%atype = type opaque
+
+; Function Attrs: nounwind uwtable
+define void @test5(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ %ca = alloca i16
+ %cab = bitcast i16* %ca to %atype*
+ %c = bitcast %atype* %cab to i32*
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %1 = load i32* %c, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %2 = load i32* %arrayidx3, align 4
+ %mul = mul nsw i32 %2, %1
+ store i32 %mul, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/Transforms/LoadCombine/load-combine.ll b/test/Transforms/LoadCombine/load-combine.ll
new file mode 100644
index 0000000..c4d9241
--- /dev/null
+++ b/test/Transforms/LoadCombine/load-combine.ll
@@ -0,0 +1,190 @@
+; RUN: opt < %s -load-combine -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Combine read from char* idiom.
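+; (The eight byte-wide loads below, assembled with shl/or, should collapse
+; into the single unaligned i64 load the CHECK lines expect.)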
+define i64 @LoadU64_x64_0(i64* %pData) {
+ %1 = bitcast i64* %pData to i8*
+ %2 = load i8* %1, align 1
+ %3 = zext i8 %2 to i64
+ %4 = shl nuw i64 %3, 56
+ %5 = getelementptr inbounds i8* %1, i64 1
+ %6 = load i8* %5, align 1
+ %7 = zext i8 %6 to i64
+ %8 = shl nuw nsw i64 %7, 48
+ %9 = or i64 %8, %4
+ %10 = getelementptr inbounds i8* %1, i64 2
+ %11 = load i8* %10, align 1
+ %12 = zext i8 %11 to i64
+ %13 = shl nuw nsw i64 %12, 40
+ %14 = or i64 %9, %13
+ %15 = getelementptr inbounds i8* %1, i64 3
+ %16 = load i8* %15, align 1
+ %17 = zext i8 %16 to i64
+ %18 = shl nuw nsw i64 %17, 32
+ %19 = or i64 %14, %18
+ %20 = getelementptr inbounds i8* %1, i64 4
+ %21 = load i8* %20, align 1
+ %22 = zext i8 %21 to i64
+ %23 = shl nuw nsw i64 %22, 24
+ %24 = or i64 %19, %23
+ %25 = getelementptr inbounds i8* %1, i64 5
+ %26 = load i8* %25, align 1
+ %27 = zext i8 %26 to i64
+ %28 = shl nuw nsw i64 %27, 16
+ %29 = or i64 %24, %28
+ %30 = getelementptr inbounds i8* %1, i64 6
+ %31 = load i8* %30, align 1
+ %32 = zext i8 %31 to i64
+ %33 = shl nuw nsw i64 %32, 8
+ %34 = or i64 %29, %33
+ %35 = getelementptr inbounds i8* %1, i64 7
+ %36 = load i8* %35, align 1
+ %37 = zext i8 %36 to i64
+ %38 = or i64 %34, %37
+ ret i64 %38
+; CHECK-LABEL: @LoadU64_x64_0(
+; CHECK: load i64* %{{.*}}, align 1
+; CHECK-NOT: load
+}
+
+; Combine simple adjacent loads.
+define i32 @"2xi16_i32"(i16* %x) {
+ %1 = load i16* %x, align 2
+ %2 = getelementptr inbounds i16* %x, i64 1
+ %3 = load i16* %2, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw i32 %4, 16
+ %6 = zext i16 %1 to i32
+ %7 = or i32 %5, %6
+ ret i32 %7
+; CHECK-LABEL: @"2xi16_i32"(
+; CHECK: load i32* %{{.*}}, align 2
+; CHECK-NOT: load
+}
+
+; Don't combine loads across stores.
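+; (The intervening store may alias the second load, so a combined load issued
+; at the first load's position could read a stale value.)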
+define i32 @"2xi16_i32_store"(i16* %x, i16* %y) {
+ %1 = load i16* %x, align 2
+ store i16 0, i16* %y, align 2
+ %2 = getelementptr inbounds i16* %x, i64 1
+ %3 = load i16* %2, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw i32 %4, 16
+ %6 = zext i16 %1 to i32
+ %7 = or i32 %5, %6
+ ret i32 %7
+; CHECK-LABEL: @"2xi16_i32_store"(
+; CHECK: load i16* %{{.*}}, align 2
+; CHECK: store
+; CHECK: load i16* %{{.*}}, align 2
+}
+
+; Don't combine loads with a gap.
+define i32 @"2xi16_i32_gap"(i16* %x) {
+ %1 = load i16* %x, align 2
+ %2 = getelementptr inbounds i16* %x, i64 2
+ %3 = load i16* %2, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw i32 %4, 16
+ %6 = zext i16 %1 to i32
+ %7 = or i32 %5, %6
+ ret i32 %7
+; CHECK-LABEL: @"2xi16_i32_gap"(
+; CHECK: load i16* %{{.*}}, align 2
+; CHECK: load i16* %{{.*}}, align 2
+}
+
+; Combine out of order loads.
+define i32 @"2xi16_i32_order"(i16* %x) {
+ %1 = getelementptr inbounds i16* %x, i64 1
+ %2 = load i16* %1, align 2
+ %3 = zext i16 %2 to i32
+ %4 = load i16* %x, align 2
+ %5 = shl nuw i32 %3, 16
+ %6 = zext i16 %4 to i32
+ %7 = or i32 %5, %6
+ ret i32 %7
+; CHECK-LABEL: @"2xi16_i32_order"(
+; CHECK: load i32* %{{.*}}, align 2
+; CHECK-NOT: load
+}
+
+; Don't combine overlapping loads.
+define i32 @"2xi16_i32_overlap"(i8* %x) {
+ %1 = bitcast i8* %x to i16*
+ %2 = load i16* %1, align 2
+ %3 = getelementptr inbounds i8* %x, i64 1
+ %4 = bitcast i8* %3 to i16*
+ %5 = load i16* %4, align 2
+ %6 = zext i16 %5 to i32
+ %7 = shl nuw i32 %6, 16
+ %8 = zext i16 %2 to i32
+ %9 = or i32 %7, %8
+ ret i32 %9
+; CHECK-LABEL: @"2xi16_i32_overlap"(
+; CHECK: load i16* %{{.*}}, align 2
+; CHECK: load i16* %{{.*}}, align 2
+}
+
+; Combine loads whose alignments are compatible; the combined load keeps the
+; first load's alignment (align 4).
+define i64 @"2xi16_i64_align"(i8* %x) {
+ %1 = bitcast i8* %x to i32*
+ %2 = load i32* %1, align 4
+ %3 = getelementptr inbounds i8* %x, i64 4
+ %4 = bitcast i8* %3 to i16*
+ %5 = load i16* %4, align 2
+ %6 = getelementptr inbounds i8* %x, i64 6
+ %7 = bitcast i8* %6 to i16*
+ %8 = load i16* %7, align 2
+ %9 = zext i16 %8 to i64
+ %10 = shl nuw i64 %9, 48
+ %11 = zext i16 %5 to i64
+ %12 = shl nuw nsw i64 %11, 32
+ %13 = zext i32 %2 to i64
+ %14 = or i64 %12, %13
+ %15 = or i64 %14, %10
+ ret i64 %15
+; CHECK-LABEL: @"2xi16_i64_align"(
+; CHECK: load i64* %{{.*}}, align 4
+}
+
+; A non-power-of-two total size (7 bytes): only a power-of-two-sized prefix
+; is combined into a wider (i32) load.
+define i64 @"2xi16_i64_npo2"(i8* %x) {
+ %1 = load i8* %x, align 1
+ %2 = zext i8 %1 to i64
+ %3 = getelementptr inbounds i8* %x, i64 1
+ %4 = load i8* %3, align 1
+ %5 = zext i8 %4 to i64
+ %6 = shl nuw nsw i64 %5, 8
+ %7 = or i64 %6, %2
+ %8 = getelementptr inbounds i8* %x, i64 2
+ %9 = load i8* %8, align 1
+ %10 = zext i8 %9 to i64
+ %11 = shl nuw nsw i64 %10, 16
+ %12 = or i64 %11, %7
+ %13 = getelementptr inbounds i8* %x, i64 3
+ %14 = load i8* %13, align 1
+ %15 = zext i8 %14 to i64
+ %16 = shl nuw nsw i64 %15, 24
+ %17 = or i64 %16, %12
+ %18 = getelementptr inbounds i8* %x, i64 4
+ %19 = load i8* %18, align 1
+ %20 = zext i8 %19 to i64
+ %21 = shl nuw nsw i64 %20, 32
+ %22 = or i64 %21, %17
+ %23 = getelementptr inbounds i8* %x, i64 5
+ %24 = load i8* %23, align 1
+ %25 = zext i8 %24 to i64
+ %26 = shl nuw nsw i64 %25, 40
+ %27 = or i64 %26, %22
+ %28 = getelementptr inbounds i8* %x, i64 6
+ %29 = load i8* %28, align 1
+ %30 = zext i8 %29 to i64
+ %31 = shl nuw nsw i64 %30, 48
+ %32 = or i64 %31, %27
+ ret i64 %32
+; CHECK-LABEL: @"2xi16_i64_npo2"(
+; CHECK: load i32* %{{.*}}, align 1
+}
diff --git a/test/Transforms/LoopIdiom/X86/lit.local.cfg b/test/Transforms/LoopIdiom/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/LoopIdiom/X86/lit.local.cfg
+++ b/test/Transforms/LoopIdiom/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg b/test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg
index 6642d28..675f48e 100644
--- a/test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg
+++ b/test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg
@@ -1,5 +1,4 @@
config.suffixes = ['.ll']
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll b/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll
index 756ea82..1d56dde 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll
+++ b/test/Transforms/LoopStrengthReduce/ARM/2012-06-15-lsr-noaddrmode.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - -arm-atomic-cfg-tidy=0 | FileCheck %s
;
; LSR should only check for valid address modes when the IV user is a
; memory address.
diff --git a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
+++ b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
+++ b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg b/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg
+++ b/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopUnroll/X86/lit.local.cfg b/test/Transforms/LoopUnroll/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/LoopUnroll/X86/lit.local.cfg
+++ b/test/Transforms/LoopUnroll/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopUnroll/pr18861.ll b/test/Transforms/LoopUnroll/pr18861.ll
new file mode 100644
index 0000000..62f2610
--- /dev/null
+++ b/test/Transforms/LoopUnroll/pr18861.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -loop-unroll -indvars -disable-output
+
+@b = external global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define void @fn1() #0 {
+entry:
+ br label %for.cond1thread-pre-split
+
+for.cond1thread-pre-split: ; preds = %for.inc8, %entry
+ %storemerge1 = phi i32 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br i1 undef, label %for.inc8, label %for.cond2.preheader.lr.ph
+
+for.cond2.preheader.lr.ph: ; preds = %for.cond1thread-pre-split
+ br label %for.cond2.preheader
+
+for.cond2.preheader: ; preds = %for.inc5, %for.cond2.preheader.lr.ph
+ br label %for.cond2
+
+for.cond2: ; preds = %for.body3, %for.cond2.preheader
+ %storemerge = phi i32 [ %add, %for.body3 ], [ 0, %for.cond2.preheader ]
+ %cmp = icmp slt i32 %storemerge, 1
+ br i1 %cmp, label %for.body3, label %for.inc5
+
+for.body3: ; preds = %for.cond2
+ %tobool4 = icmp eq i32 %storemerge, 0
+ %add = add nsw i32 %storemerge, 1
+ br i1 %tobool4, label %for.cond2, label %if.then
+
+if.then: ; preds = %for.body3
+ store i32 %storemerge1, i32* @b, align 4
+ ret void
+
+for.inc5: ; preds = %for.cond2
+ br i1 undef, label %for.cond1.for.inc8_crit_edge, label %for.cond2.preheader
+
+for.cond1.for.inc8_crit_edge: ; preds = %for.inc5
+ br label %for.inc8
+
+for.inc8: ; preds = %for.cond1.for.inc8_crit_edge, %for.cond1thread-pre-split
+ %inc9 = add nsw i32 %storemerge1, 1
+ br label %for.cond1thread-pre-split
+}
diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index d8bbea9..a14087d 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -2,6 +2,12 @@
; Tests for unrolling loops with run-time trip counts
+; CHECK: %xtraiter = and i32 %n
+; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
+; CHECK: %lcmp.overflow = icmp eq i32 %n, 0
+; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
+; CHECK: br i1 %lcmp.or, label %unr.cmp
+
; CHECK: unr.cmp{{.*}}:
; CHECK: for.body.unr{{.*}}:
; CHECK: for.body:
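
The CHECK lines added above pin down the prologue the run-time unroller emits for an unknown trip count. A rough C analogue of that shape (a sketch only: an unroll factor of 4 is assumed, and the names mirror %xtraiter, %lcmp.overflow, and %lcmp.mod from the checks):

/* Run-time unrolling by an assumed factor of 4: a prologue runs the
 * n % 4 leftover iterations, and the guard also sends n == 0 (where
 * the trip-count arithmetic could overflow) down the checked path. */
void unrolled_by_4(int *a, int n) {
  int xtraiter = n & 3;                  /* %xtraiter = and i32 %n, 3 */
  if (n == 0 || xtraiter != 0) {         /* %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod */
    for (int i = 0; i < xtraiter; i++)   /* prologue: leftover iterations */
      a[i] += 1;
  }
  for (int i = xtraiter; i + 3 < n; i += 4) {  /* main body, unrolled x4 */
    a[i] += 1; a[i + 1] += 1; a[i + 2] += 1; a[i + 3] += 1;
  }
}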
diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll
new file mode 100644
index 0000000..5e45a2d
--- /dev/null
+++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll
@@ -0,0 +1,285 @@
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; loop4 contains a small loop which should be completely unrolled by
+; the default unrolling heuristics. It serves as a control for the
+; unroll(disable) pragma test loop4_with_disable.
+;
+; CHECK-LABEL: @loop4(
+; CHECK-NOT: br i1
+define void @loop4(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; #pragma clang loop unroll(disable)
+;
+; CHECK-LABEL: @loop4_with_disable(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @loop4_with_disable(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end: ; preds = %for.body
+ ret void
+}
+!1 = metadata !{metadata !1, metadata !2}
+!2 = metadata !{metadata !"llvm.loop.unroll.enable", i1 false}
+
+; loop64 has a high enough count that it should *not* be unrolled by
+; the default unrolling heuristics. It serves as the control for the
+; loop64_with_.* pragma tests below.
+;
+; CHECK-LABEL: @loop64(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @loop64(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; #pragma clang loop unroll(enable)
+; Loop should be fully unrolled.
+;
+; CHECK-LABEL: @loop64_with_enable(
+; CHECK-NOT: br i1
+define void @loop64_with_enable(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end: ; preds = %for.body
+ ret void
+}
+!3 = metadata !{metadata !3, metadata !4}
+!4 = metadata !{metadata !"llvm.loop.unroll.enable", i1 true}
+
+; #pragma clang loop unroll_count(4)
+; Loop should be unrolled 4 times.
+;
+; CHECK-LABEL: @loop64_with_count4(
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @loop64_with_count4(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end: ; preds = %for.body
+ ret void
+}
+!5 = metadata !{metadata !5, metadata !6}
+!6 = metadata !{metadata !"llvm.loop.unroll.count", i32 4}
+
+
+; #pragma clang loop unroll(enable) unroll_count(4)
+; Loop should be unrolled 4 times.
+;
+; CHECK-LABEL: @loop64_with_enable_and_count4(
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @loop64_with_enable_and_count4(i32* nocapture %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end: ; preds = %for.body
+ ret void
+}
+!7 = metadata !{metadata !7, metadata !6, metadata !4}
+
+; #pragma clang loop unroll(enable)
+; Full unrolling is requested, but the loop has a dynamic trip count,
+; so no unrolling should occur.
+;
+; CHECK-LABEL: @dynamic_loop_with_enable(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @dynamic_loop_with_enable(i32* nocapture %a, i32 %b) {
+entry:
+ %cmp3 = icmp sgt i32 %b, 0
+ br i1 %cmp3, label %for.body, label %for.end, !llvm.loop !8
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %b
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !8
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+!8 = metadata !{metadata !8, metadata !4}
+
+; #pragma clang loop unroll_count(4)
+; Loop has a dynamic trip count. Unrolling should occur, but no
+; conditional branches can be removed.
+;
+; CHECK-LABEL: @dynamic_loop_with_count4(
+; CHECK-NOT: store
+; CHECK: br i1
+; CHECK: store
+; CHECK: br i1
+; CHECK: store
+; CHECK: br i1
+; CHECK: store
+; CHECK: br i1
+; CHECK: store
+; CHECK: br i1
+; CHECK-NOT: br i1
+define void @dynamic_loop_with_count4(i32* nocapture %a, i32 %b) {
+entry:
+ %cmp3 = icmp sgt i32 %b, 0
+ br i1 %cmp3, label %for.body, label %for.end, !llvm.loop !9
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %b
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !9
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+!9 = metadata !{metadata !9, metadata !6}
+
+; #pragma clang loop unroll_count(1)
+; Loop should not be unrolled.
+;
+; CHECK-LABEL: @unroll_1(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: br i1
+define void @unroll_1(i32* nocapture %a, i32 %b) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !10
+
+for.end: ; preds = %for.body
+ ret void
+}
+!10 = metadata !{metadata !10, metadata !11}
+!11 = metadata !{metadata !"llvm.loop.unroll.count", i32 1}
+
+; #pragma clang loop unroll(enable)
+; Loop has a very high trip count (1 million) and full unrolling was requested.
+; The loop should be unrolled up to the pragma threshold, but not completely.
+;
+; CHECK-LABEL: @unroll_1M(
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: br i1
+define void @unroll_1M(i32* nocapture %a, i32 %b) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !12
+
+for.end: ; preds = %for.body
+ ret void
+}
+!12 = metadata !{metadata !12, metadata !4}
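
For orientation, the pragmas quoted in the comments above are what lower to the self-referential !llvm.loop metadata these tests write out by hand. A minimal C sketch (function names are illustrative; the pragma spellings are the ones the test comments cite):

/* clang lowers each pragma to an !llvm.loop metadata node (first
 * operand self-referential) on the loop backedge; unroll-pragmas.ll
 * above exercises the same nodes directly in IR. */
void unroll_by_4(int *a, int n) {
#pragma clang loop unroll_count(4)
  for (int i = 0; i < n; i++)
    a[i] += 1;
}

void keep_rolled(int *a, int n) {
#pragma clang loop unroll(disable)
  for (int i = 0; i < n; i++)
    a[i] += 1;
}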
diff --git a/test/Transforms/LoopVectorize/AArch64/lit.local.cfg b/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
index f1d1f88..937cffb 100644
--- a/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
@@ -1,6 +1,5 @@
config.suffixes = ['.ll']
-targets = set(config.root.targets_to_build.split())
-if not 'ARM64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/Transforms/LoopVectorize/ARM/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg b/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
index 2e46300..5d33887 100644
--- a/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'PowerPC' in targets:
+if not 'PowerPC' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
index faed77d..fce3b70 100644
--- a/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -40,7 +40,7 @@ for.end: ; preds = %for.body
; Now, we check for the Hint metadata
; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
-; CHECK: [[width]] = metadata !{metadata !"llvm.vectorizer.width", i32 1}
-; CHECK: [[unroll]] = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: [[width]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
+; CHECK: [[unroll]] = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll
new file mode 100644
index 0000000..a220866
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -0,0 +1,35 @@
+; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Verify that we generate 512-bit wide vectors for a basic integer memset
+; loop.
+
+; CHECK-LABEL: f:
+; CHECK: vmovdqu32 %zmm{{.}}, (
+; CHECK-NOT: %ymm
+
+define void @f(i32* %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ store i32 %n, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
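
The new test is the IR for the memset-style loop below; with +avx512f the vectorizer is expected to choose a 512-bit (16 x i32) width, which is why the checks demand a zmm store and forbid ymm. A C analogue (a sketch; the function name mirrors the test):

/* Memset-style loop from the avx512 test: every element is set to the
 * loop bound. With -mattr=+avx512f this should become 512-bit (zmm)
 * stores rather than 256-bit (ymm) ones. */
void f(int *a, int n) {
  for (int i = 0; i < n; i++)
    a[i] = n;
}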
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/LoopVectorize/X86/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 9e4e989..8e0ca41 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -9,8 +9,9 @@
; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
-; This file tests the llvm.vectorizer.pragma forcing vectorization even when
-; optimization levels are too low, or when vectorization is disabled.
+; This file tests the llvm.loop.vectorize.enable metadata forcing
+; vectorization even when optimization levels are too low, or when
+; vectorization is disabled.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -170,6 +171,6 @@ for.end: ; preds = %for.body
}
!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.vectorizer.enable", i1 1}
+!1 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 1}
!2 = metadata !{metadata !2, metadata !3}
-!3 = metadata !{metadata !"llvm.vectorizer.enable", i1 0}
+!3 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 0}
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
index 84ffb27..074313b 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
@@ -53,7 +53,7 @@ for.end:
}
!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
;
; This method will not be vectorized, as scalar cost is lower than any of vector costs.
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 1b979e5..97c31a1 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -44,7 +44,7 @@ for.end:
}
!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
;
; This loop will not be vectorized as the trip count is below the threshold.
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
new file mode 100644
index 0000000..6cdd29b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -0,0 +1,160 @@
+; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; C/C++ code for tests
+; void test(int *A, int Length) {
+; #pragma clang loop vectorize(enable) interleave(enable)
+; for (int i = 0; i < Length; i++) {
+; A[i] = i;
+; if (A[i] > Length)
+; break;
+; }
+; }
+
+; void test_disabled(int *A, int Length) {
+; #pragma clang loop vectorize(disable) interleave(disable)
+; for (int i = 0; i < Length; i++)
+; A[i] = i;
+; }
+
+; void test_array_bounds(int *A, int *B, int Length) {
+; #pragma clang loop vectorize(enable)
+; for (int i = 0; i < Length; i++)
+; A[i] = A[B[i]];
+; }
+
+; File, line, and column should match those specified in the metadata
+; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization was not specified
+; CHECK: remark: source.cpp:13:5: loop not vectorized: vector width and interleave count are explicitly set to 1
+; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
+; CHECK: remark: source.cpp:19:5: loop not vectorized: vectorization is explicitly enabled
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; CHECK: _Z13test_disabledPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; CHECK: _Z17test_array_boundsPiS_i
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 {
+entry:
+ %cmp10 = icmp sgt i32 %Length, 0, !dbg !12
+ br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !16
+ %0 = trunc i64 %indvars.iv to i32, !dbg !16
+ store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18
+ %cmp3 = icmp sle i32 %0, %Length, !dbg !22
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !12
+ %1 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %1, %Length, !dbg !12
+ %or.cond = and i1 %cmp3, %cmp, !dbg !22
+ br i1 %or.cond, label %for.body, label %for.end, !dbg !22
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !24
+}
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %Length, 0, !dbg !25
+ br i1 %cmp4, label %for.body, label %for.end, !dbg !25, !llvm.loop !27
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !30
+ %0 = trunc i64 %indvars.iv to i32, !dbg !30
+ store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !25
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !25
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !25, !llvm.loop !27
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !31
+}
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 {
+entry:
+ %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+ br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !35
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv, !dbg !35
+ %0 = load i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+ %idxprom1 = sext i32 %0 to i64, !dbg !35
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1, !dbg !35
+ %1 = load i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !35
+ store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !36
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!1 = metadata !{metadata !"source.cpp", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !7, metadata !8}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 1}
+!5 = metadata !{i32 786473, metadata !1}
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
+!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_disabled", metadata !"test_disabled", metadata !"", i32 10, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z13test_disabledPii, null, null, metadata !2, i32 10}
+!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_array_bounds", metadata !"test_array_bounds", metadata !"", i32 16, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, metadata !2, i32 16}
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{metadata !"clang version 3.5.0"}
+!12 = metadata !{i32 3, i32 8, metadata !13, null}
+!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!14 = metadata !{metadata !14, metadata !15, metadata !15}
+!15 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!16 = metadata !{i32 4, i32 5, metadata !17, null}
+!17 = metadata !{i32 786443, metadata !1, metadata !13, i32 3, i32 36, i32 0, i32 1}
+!18 = metadata !{metadata !19, metadata !19, i64 0}
+!19 = metadata !{metadata !"int", metadata !20, i64 0}
+!20 = metadata !{metadata !"omnipotent char", metadata !21, i64 0}
+!21 = metadata !{metadata !"Simple C/C++ TBAA"}
+!22 = metadata !{i32 5, i32 9, metadata !23, null}
+!23 = metadata !{i32 786443, metadata !1, metadata !17, i32 5, i32 9, i32 0, i32 2}
+!24 = metadata !{i32 8, i32 1, metadata !4, null}
+!25 = metadata !{i32 12, i32 8, metadata !26, null}
+!26 = metadata !{i32 786443, metadata !1, metadata !7, i32 12, i32 3, i32 0, i32 3}
+!27 = metadata !{metadata !27, metadata !28, metadata !29}
+!28 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
+!29 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
+!30 = metadata !{i32 13, i32 5, metadata !26, null}
+!31 = metadata !{i32 14, i32 1, metadata !7, null}
+!32 = metadata !{i32 18, i32 8, metadata !33, null}
+!33 = metadata !{i32 786443, metadata !1, metadata !8, i32 18, i32 3, i32 0, i32 4}
+!34 = metadata !{metadata !34, metadata !15}
+!35 = metadata !{i32 19, i32 5, metadata !33, null}
+!36 = metadata !{i32 20, i32 1, metadata !8, null}
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 685d034..f683447 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -1,8 +1,17 @@
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='.*vectorize.*' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
-; VECTORIZED: remark: {{.*}}.c:17:8: vectorized loop (vectorization factor: 4, unrolling interleave factor: 1)
-; UNROLLED: remark: {{.*}}.c:17:8: unrolled with interleaving factor 4 (vectorization not beneficial)
+; This code has all the !dbg annotations needed to track source line information,
+; but is missing the llvm.dbg.cu annotation. This prevents code generation from
+; emitting debug info in the final output.
+; RUN: llc -mtriple x86_64-pc-linux-gnu %s -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization factor: 4, unrolling interleave factor: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: unrolled with interleaving factor 4 (vectorization not beneficial)
+; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vector width and interleave count are explicitly set to 1
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -37,11 +46,9 @@ for.end: ; preds = %for.body
declare void @ibar(i32*) #1
-!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [./vectorization-remarks.c] [DW_LANG_C99]
!1 = metadata !{metadata !"vectorization-remarks.c", metadata !"."}
!2 = metadata !{}
!3 = metadata !{metadata !4}
diff --git a/test/Transforms/LoopVectorize/XCore/lit.local.cfg b/test/Transforms/LoopVectorize/XCore/lit.local.cfg
index 4d17d46..bb48713 100644
--- a/test/Transforms/LoopVectorize/XCore/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/XCore/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'XCore' in targets:
+if not 'XCore' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/control-flow.ll b/test/Transforms/LoopVectorize/control-flow.ll
new file mode 100644
index 0000000..e4ba77f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/control-flow.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; C/C++ code for control flow test
+; int test(int *A, int Length) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] > 10.0) goto end;
+; A[i] = 0;
+; }
+; end:
+; return 0;
+; }
+
+; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer
+; CHECK: remark: source.cpp:5:9: loop not vectorized: vectorization was not specified
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0, !dbg !10
+ br i1 %cmp8, label %for.body.preheader, label %end, !dbg !10
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !12
+
+for.body: ; preds = %for.body.preheader, %if.else
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !12
+ %0 = load i32* %arrayidx, align 4, !dbg !12, !tbaa !15
+ %cmp1 = icmp sgt i32 %0, 10, !dbg !12
+ br i1 %cmp1, label %end.loopexit, label %if.else, !dbg !12
+
+if.else: ; preds = %for.body
+ store i32 0, i32* %arrayidx, align 4, !dbg !19, !tbaa !15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %1 = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %cmp = icmp slt i32 %1, %Length, !dbg !10
+ br i1 %cmp, label %for.body, label %end.loopexit, !dbg !10
+
+end.loopexit: ; preds = %if.else, %for.body
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ ret i32 0, !dbg !20
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!1 = metadata !{metadata !"source.cpp", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 2}
+!5 = metadata !{i32 786473, metadata !1}
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
+!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{metadata !"clang version 3.5.0"}
+!10 = metadata !{i32 3, i32 8, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!12 = metadata !{i32 5, i32 9, metadata !13, null}
+!13 = metadata !{i32 786443, metadata !1, metadata !14, i32 5, i32 9, i32 0, i32 2}
+!14 = metadata !{i32 786443, metadata !1, metadata !11, i32 4, i32 3, i32 0, i32 1}
+!15 = metadata !{metadata !16, metadata !16, i64 0}
+!16 = metadata !{metadata !"int", metadata !17, i64 0}
+!17 = metadata !{metadata !"omnipotent char", metadata !18, i64 0}
+!18 = metadata !{metadata !"Simple C/C++ TBAA"}
+!19 = metadata !{i32 8, i32 7, metadata !13, null}
+!20 = metadata !{i32 12, i32 3, metadata !4, null}
diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index dbe0243..6e3e8ed 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll
@@ -156,7 +156,7 @@ for.body:
br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
cond.false:
- %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32))
+ %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32))
br label %cond.end
cond.end:
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index ad2c663..7dabcb2 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -108,3 +108,64 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; <label>:5 ; preds = %1
ret i32 %2
}
+
+; This loop has a backedge-taken count of i32_max. We need to check for this
+; condition and branch directly to the scalar loop.
+
+; CHECK-LABEL: max_i32_backedgetaken
+; CHECK: %backedge.overflow = icmp eq i32 -1, -1
+; CHECK: br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked
+
+; CHECK: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
+; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ]
+
+define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
+
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+ %b.0 = phi i32 [ 0, %0 ], [ %3, %1 ]
+ %2 = and i32 %a.0, 4
+ %3 = add i32 %b.0, -1
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 %2
+}
+
+; When generating the overflow check we must make sure that the induction start value
+; is defined before the branch to the scalar preheader.
+
+; CHECK-LABEL: testoverflowcheck
+; CHECK: entry
+; CHECK: %[[LOAD:.*]] = load i8
+; CHECK: %[[VAL:.*]] = zext i8 %[[LOAD]] to i32
+; CHECK: br
+
+; CHECK: scalar.ph
+; CHECK: phi i32 [ %{{.*}}, %middle.block ], [ %[[VAL]], %entry ]
+
+@e = global i8 1, align 1
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+define i32 @testoverflowcheck() {
+entry:
+ %.pr.i = load i8* @e, align 1
+ %0 = load i32* @d, align 4
+ %c.promoted.i = load i32* @c, align 4
+ br label %cond.end.i
+
+cond.end.i:
+ %inc4.i = phi i8 [ %.pr.i, %entry ], [ %inc.i, %cond.end.i ]
+ %and3.i = phi i32 [ %c.promoted.i, %entry ], [ %and.i, %cond.end.i ]
+ %and.i = and i32 %0, %and3.i
+ %inc.i = add i8 %inc4.i, 1
+ %tobool.i = icmp eq i8 %inc.i, 0
+ br i1 %tobool.i, label %loopexit, label %cond.end.i
+
+loopexit:
+ ret i32 %and.i
+}
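
The max_i32_backedgetaken test above corresponds to a loop that walks a 32-bit counter all the way around: the backedge is taken 2^32 - 1 times, so the trip count (backedge-taken count + 1) no longer fits in 32 bits, which is exactly what the %backedge.overflow guard catches before branching to the scalar loop. A C analogue (a sketch):

/* The counter starts at 0 and decrements until it returns to 0, so
 * the loop body runs 2^32 times and 32-bit trip-count arithmetic
 * would wrap; the vectorizer must detect this and take scalar.ph. */
unsigned max_backedge_taken(void) {
  unsigned a = 1, b = 0;
  do {
    a &= 4;
    b -= 1;            /* wraps modulo 2^32 */
  } while (b != 0);
  return a;
}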
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index c3d570c..7dfaf03 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1090,3 +1090,105 @@ for.end: ; preds = %for.body
ret void
}
+declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone
+
+;CHECK-LABEL: @powi_f64(
+;CHECK: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+ %0 = load double* %arrayidx, align 8
+ %call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone
+ %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @powi_f64_neg(
+;CHECK-NOT: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+ %0 = load double* %arrayidx, align 8
+ %1 = trunc i64 %indvars.iv to i32
+ %call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.cttz.i64 (i64, i1) nounwind readnone
+
+;CHECK-LABEL: @cttz_f64(
+;CHECK: llvm.cttz.v4i64
+;CHECK: ret void
+define void @cttz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
+ %0 = load i64* %arrayidx, align 8
+ %call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone
+ %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
+ store i64 %call, i64* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.ctlz.i64 (i64, i1) nounwind readnone
+
+;CHECK-LABEL: @ctlz_f64(
+;CHECK: llvm.ctlz.v4i64
+;CHECK: ret void
+define void @ctlz_f64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64* %y, i64 %indvars.iv
+ %0 = load i64* %arrayidx, align 8
+ %call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone
+ %arrayidx4 = getelementptr inbounds i64* %x, i64 %indvars.iv
+ store i64 %call, i64* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
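
The powi pair above turns on whether the i32 power operand is loop-invariant: the vector form llvm.powi.v4f64 still takes a single scalar power, so a power that changes per iteration cannot be widened. A C analogue of the two cases (a sketch; __builtin_powi is assumed to lower to llvm.powi.f64):

/* Loop-invariant power: the call can be widened to llvm.powi.v4f64
 * with the power left scalar. */
void powi_invariant(double *x, const double *y, int n, int p) {
  for (int i = 0; i < n; i++)
    x[i] = __builtin_powi(y[i], p);
}

/* Power derived from the induction variable: no vector powi takes a
 * per-lane power, so this loop stays scalar. */
void powi_variant(double *x, const double *y, int n) {
  for (int i = 0; i < n; i++)
    x[i] = __builtin_powi(y[i], i);
}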
diff --git a/test/Transforms/LoopVectorize/metadata-unroll.ll b/test/Transforms/LoopVectorize/metadata-unroll.ll
index 7f10372..2fcc53a 100644
--- a/test/Transforms/LoopVectorize/metadata-unroll.ll
+++ b/test/Transforms/LoopVectorize/metadata-unroll.ll
@@ -38,4 +38,4 @@ define void @inc(i32 %n) nounwind uwtable noinline ssp {
}
!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.vectorizer.unroll", i32 2}
+!1 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 2}
diff --git a/test/Transforms/LoopVectorize/metadata-width.ll b/test/Transforms/LoopVectorize/metadata-width.ll
index 1960c0b..87de655 100644
--- a/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/test/Transforms/LoopVectorize/metadata-width.ll
@@ -28,4 +28,4 @@ for.end: ; preds = %for.body, %entry
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.vectorizer.width", i32 8}
+!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll
new file mode 100644
index 0000000..52b4285
--- /dev/null
+++ b/test/Transforms/LoopVectorize/no_switch.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization is explicitly enabled with width 4
+
+; CHECK: _Z11test_switchPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 {
+entry:
+ %cmp18 = icmp sgt i32 %Length, 0, !dbg !10
+ br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !14
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !14
+ %0 = load i32* %arrayidx, align 4, !dbg !14, !tbaa !16
+ switch i32 %0, label %for.inc [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb3
+ ], !dbg !14
+
+sw.bb: ; preds = %for.body
+ %1 = trunc i64 %indvars.iv to i32, !dbg !20
+ %mul = shl nsw i32 %1, 1, !dbg !20
+ br label %for.inc, !dbg !22
+
+sw.bb3: ; preds = %for.body
+ %2 = trunc i64 %indvars.iv to i32, !dbg !23
+ store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16
+ br label %for.inc, !dbg !23
+
+for.inc: ; preds = %sw.bb3, %for.body, %sw.bb
+ %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ]
+ store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12
+
+for.end.loopexit: ; preds = %for.inc
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !24
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!1 = metadata !{metadata !"source.cpp", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_switch", metadata !"test_switch", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z11test_switchPii, null, null, metadata !2, i32 1}
+!5 = metadata !{i32 786473, metadata !1}
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
+!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{metadata !"clang version 3.5.0"}
+!10 = metadata !{i32 3, i32 8, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!12 = metadata !{metadata !12, metadata !13, metadata !13}
+!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!14 = metadata !{i32 4, i32 5, metadata !15, null}
+!15 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 36, i32 0, i32 1}
+!16 = metadata !{metadata !17, metadata !17, i64 0}
+!17 = metadata !{metadata !"int", metadata !18, i64 0}
+!18 = metadata !{metadata !"omnipotent char", metadata !19, i64 0}
+!19 = metadata !{metadata !"Simple C/C++ TBAA"}
+!20 = metadata !{i32 6, i32 7, metadata !21, null}
+!21 = metadata !{i32 786443, metadata !1, metadata !15, i32 4, i32 18, i32 0, i32 2}
+!22 = metadata !{i32 7, i32 5, metadata !21, null}
+!23 = metadata !{i32 9, i32 7, metadata !21, null}
+!24 = metadata !{i32 14, i32 1, metadata !4, null}
diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index e7b1e2a..01e28bc 100644
--- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -5,6 +5,7 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK-LABEL: @add_ints(
;CHECK: br
+;CHECK: br
;CHECK: getelementptr
;CHECK-NEXT: getelementptr
;CHECK-DAG: icmp uge
diff --git a/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/test/Transforms/LoopVectorize/vect.omp.persistence.ll
index dc3df7a..f646567 100644
--- a/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ b/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
;
; Test #1
;
-; Ensure that "llvm.vectorizer.enable" metadata was not lost prior to LoopVectorize pass.
+; Ensure that "llvm.loop.vectorize.enable" metadata was not lost prior to LoopVectorize pass.
; In past LoopRotate was clearing that metadata.
;
; The source C code is:
@@ -62,12 +62,12 @@ for.end:
}
!1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
;
; Test #2
;
-; Ensure that "llvm.vectorizer.enable" metadata was not lost even
+; Ensure that "llvm.loop.vectorize.enable" metadata was not lost even
; if loop was not rotated (see http://reviews.llvm.org/D3348#comment-4).
;
define i32 @nonrotated(i32 %a) {
@@ -85,4 +85,4 @@ return:
}
!3 = metadata !{metadata !3, metadata !4}
-!4 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+!4 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll
index 7800469..47de13d 100644
--- a/test/Transforms/LoopVectorize/vectorize-once.ll
+++ b/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -69,9 +69,9 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %for.body.i, %entry
attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
; CHECK: !0 = metadata !{metadata !0, metadata !1, metadata !2}
-; CHECK: !1 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
-; CHECK: !2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
+; CHECK: !2 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
; CHECK: !3 = metadata !{metadata !3, metadata !1, metadata !2}
!0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
+!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
diff --git a/test/Transforms/LowerAtomic/atomic-swap.ll b/test/Transforms/LowerAtomic/atomic-swap.ll
index c319834..cb11241 100644
--- a/test/Transforms/LowerAtomic/atomic-swap.ll
+++ b/test/Transforms/LowerAtomic/atomic-swap.ll
@@ -3,15 +3,20 @@
define i8 @cmpswap() {
; CHECK-LABEL: @cmpswap(
%i = alloca i8
- %j = cmpxchg i8* %i, i8 0, i8 42 monotonic monotonic
-; CHECK: [[INST:%[a-z0-9]+]] = load
-; CHECK-NEXT: icmp
-; CHECK-NEXT: select
-; CHECK-NEXT: store
+ %pair = cmpxchg i8* %i, i8 0, i8 42 monotonic monotonic
+ %j = extractvalue { i8, i1 } %pair, 0
+; CHECK: [[OLDVAL:%[a-z0-9]+]] = load i8* [[ADDR:%[a-z0-9]+]]
+; CHECK-NEXT: [[SAME:%[a-z0-9]+]] = icmp eq i8 [[OLDVAL]], 0
+; CHECK-NEXT: [[TO_STORE:%[a-z0-9]+]] = select i1 [[SAME]], i8 42, i8 [[OLDVAL]]
+; CHECK-NEXT: store i8 [[TO_STORE]], i8* [[ADDR]]
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = insertvalue { i8, i1 } undef, i8 [[OLDVAL]], 0
+; CHECK-NEXT: [[RES:%[a-z0-9]+]] = insertvalue { i8, i1 } [[TMP]], i1 [[SAME]], 1
+; CHECK-NEXT: [[VAL:%[a-z0-9]+]] = extractvalue { i8, i1 } [[RES]], 0
ret i8 %j
-; CHECK: ret i8 [[INST]]
+; CHECK: ret i8 [[VAL]]
}
+
define i8 @swap() {
; CHECK-LABEL: @swap(
%i = alloca i8
diff --git a/test/Transforms/LowerSwitch/2014-06-10-SwitchContiguousOpt.ll b/test/Transforms/LowerSwitch/2014-06-10-SwitchContiguousOpt.ll
new file mode 100644
index 0000000..3673c04
--- /dev/null
+++ b/test/Transforms/LowerSwitch/2014-06-10-SwitchContiguousOpt.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -lowerswitch -S | FileCheck %s
+; CHECK-NOT: icmp eq i32 %0, 1
+
+define i32 @foo(i32 %a) #0 {
+entry:
+ %retval = alloca i32, align 4
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %0 = load i32* %a.addr, align 4
+ switch i32 %0, label %sw.default [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+
+sw.bb:
+ ret i32 12
+
+sw.bb1:
+ ret i32 4
+
+sw.bb2:
+ ret i32 2
+
+sw.default:
+ ret i32 9
+}
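
The CHECK-NOT above relies on -lowerswitch recognizing that cases 0, 1, and 2 are contiguous and testing them with one unsigned range compare instead of an equality test per case. A rough C picture of the lowered shape (a sketch only; the result table just stands in for the per-case successor blocks):

/* Three contiguous cases collapse into a single range check, so no
 * individual "icmp eq i32 %0, 1" should survive in the output. */
int foo_lowered(int a) {
  static const int results[3] = {12, 4, 2};
  if ((unsigned)a <= 2u)   /* one range compare covers cases 0..2 */
    return results[a];
  return 9;                /* default */
}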
diff --git a/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll b/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll
new file mode 100644
index 0000000..0f73721
--- /dev/null
+++ b/test/Transforms/LowerSwitch/2014-06-11-SwitchDefaultUnreachableOpt.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -lowerswitch -S | FileCheck %s
+; CHECK-NOT: {{.*}}icmp eq{{.*}}
+;
+;int foo(int a) {
+;
+; switch (a) {
+; case 0:
+; return 10;
+; case 1:
+; return 3;
+; default:
+; __builtin_unreachable();
+; }
+;
+;}
+
+define i32 @foo(i32 %a) nounwind ssp uwtable {
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ store i32 %a, i32* %2, align 4
+ %3 = load i32* %2, align 4
+ switch i32 %3, label %6 [
+ i32 0, label %4
+ i32 1, label %5
+ ]
+
+; <label>:4
+ store i32 10, i32* %1
+ br label %7
+
+; <label>:5
+ store i32 3, i32* %1
+ br label %7
+
+; <label>:6
+ unreachable
+
+; <label>:7
+ %8 = load i32* %1
+ ret i32 %8
+}
diff --git a/test/Transforms/LowerSwitch/feature.ll b/test/Transforms/LowerSwitch/feature.ll
index e85f03e..09d25f0 100644
--- a/test/Transforms/LowerSwitch/feature.ll
+++ b/test/Transforms/LowerSwitch/feature.ll
@@ -3,93 +3,57 @@
; We have switch on input.
; On output we should got binary comparison tree. Check that all is fine.
-;CHECK: entry:
-;CHECK-NEXT: br label %NodeBlock37
+;CHECK: entry:
+;CHECK-NEXT: br label %NodeBlock19
-;CHECK: NodeBlock37: ; preds = %entry
-;CHECK-NEXT: %Pivot38 = icmp slt i32 %tmp158, 10
-;CHECK-NEXT: br i1 %Pivot38, label %NodeBlock13, label %NodeBlock35
+;CHECK: NodeBlock19: ; preds = %entry
+;CHECK-NEXT: %Pivot20 = icmp slt i32 %tmp158, 10
+;CHECK-NEXT: br i1 %Pivot20, label %NodeBlock5, label %NodeBlock17
-;CHECK: NodeBlock35: ; preds = %NodeBlock37
-;CHECK-NEXT: %Pivot36 = icmp slt i32 %tmp158, 13
-;CHECK-NEXT: br i1 %Pivot36, label %NodeBlock23, label %NodeBlock33
+;CHECK: NodeBlock17: ; preds = %NodeBlock19
+;CHECK-NEXT: %Pivot18 = icmp slt i32 %tmp158, 13
+;CHECK-NEXT: br i1 %Pivot18, label %NodeBlock9, label %NodeBlock15
-;CHECK: NodeBlock33: ; preds = %NodeBlock35
-;CHECK-NEXT: %Pivot34 = icmp slt i32 %tmp158, 14
-;CHECK-NEXT: br i1 %Pivot34, label %LeafBlock25, label %NodeBlock31
+;CHECK: NodeBlock15: ; preds = %NodeBlock17
+;CHECK-NEXT: %Pivot16 = icmp slt i32 %tmp158, 14
+;CHECK-NEXT: br i1 %Pivot16, label %bb330, label %NodeBlock13
-;CHECK: NodeBlock31: ; preds = %NodeBlock33
-;CHECK-NEXT: %Pivot32 = icmp slt i32 %tmp158, 15
-;CHECK-NEXT: br i1 %Pivot32, label %LeafBlock27, label %LeafBlock29
+;CHECK: NodeBlock13: ; preds = %NodeBlock15
+;CHECK-NEXT: %Pivot14 = icmp slt i32 %tmp158, 15
+;CHECK-NEXT: br i1 %Pivot14, label %bb332, label %LeafBlock11
-;CHECK: LeafBlock29: ; preds = %NodeBlock31
-;CHECK-NEXT: %SwitchLeaf30 = icmp eq i32 %tmp158, 15
-;CHECK-NEXT: br i1 %SwitchLeaf30, label %bb334, label %NewDefault
+;CHECK: LeafBlock11: ; preds = %NodeBlock13
+;CHECK-NEXT: %SwitchLeaf12 = icmp eq i32 %tmp158, 15
+;CHECK-NEXT: br i1 %SwitchLeaf12, label %bb334, label %NewDefault
-;CHECK: LeafBlock27: ; preds = %NodeBlock31
-;CHECK-NEXT: %SwitchLeaf28 = icmp eq i32 %tmp158, 14
-;CHECK-NEXT: br i1 %SwitchLeaf28, label %bb332, label %NewDefault
+;CHECK: NodeBlock9: ; preds = %NodeBlock17
+;CHECK-NEXT: %Pivot10 = icmp slt i32 %tmp158, 11
+;CHECK-NEXT: br i1 %Pivot10, label %bb324, label %NodeBlock7
-;CHECK: LeafBlock25: ; preds = %NodeBlock33
-;CHECK-NEXT: %SwitchLeaf26 = icmp eq i32 %tmp158, 13
-;CHECK-NEXT: br i1 %SwitchLeaf26, label %bb330, label %NewDefault
+;CHECK: NodeBlock7: ; preds = %NodeBlock9
+;CHECK-NEXT: %Pivot8 = icmp slt i32 %tmp158, 12
+;CHECK-NEXT: br i1 %Pivot8, label %bb326, label %bb328
-;CHECK: NodeBlock23: ; preds = %NodeBlock35
-;CHECK-NEXT: %Pivot24 = icmp slt i32 %tmp158, 11
-;CHECK-NEXT: br i1 %Pivot24, label %LeafBlock15, label %NodeBlock21
+;CHECK: NodeBlock5: ; preds = %NodeBlock19
+;CHECK-NEXT: %Pivot6 = icmp slt i32 %tmp158, 7
+;CHECK-NEXT: br i1 %Pivot6, label %NodeBlock, label %NodeBlock3
-;CHECK: NodeBlock21: ; preds = %NodeBlock23
-;CHECK-NEXT: %Pivot22 = icmp slt i32 %tmp158, 12
-;CHECK-NEXT: br i1 %Pivot22, label %LeafBlock17, label %LeafBlock19
+;CHECK: NodeBlock3: ; preds = %NodeBlock5
+;CHECK-NEXT: %Pivot4 = icmp slt i32 %tmp158, 8
+;CHECK-NEXT: br i1 %Pivot4, label %bb, label %NodeBlock1
-;CHECK: LeafBlock19: ; preds = %NodeBlock21
-;CHECK-NEXT: %SwitchLeaf20 = icmp eq i32 %tmp158, 12
-;CHECK-NEXT: br i1 %SwitchLeaf20, label %bb328, label %NewDefault
+;CHECK: NodeBlock1: ; preds = %NodeBlock3
+;CHECK-NEXT: %Pivot2 = icmp slt i32 %tmp158, 9
+;CHECK-NEXT: br i1 %Pivot2, label %bb338, label %bb322
-;CHECK: LeafBlock17: ; preds = %NodeBlock21
-;CHECK-NEXT: %SwitchLeaf18 = icmp eq i32 %tmp158, 11
-;CHECK-NEXT: br i1 %SwitchLeaf18, label %bb326, label %NewDefault
+;CHECK: NodeBlock: ; preds = %NodeBlock5
+;CHECK-NEXT: %Pivot = icmp slt i32 %tmp158, 0
+;CHECK-NEXT: br i1 %Pivot, label %LeafBlock, label %bb338
-;CHECK: LeafBlock15: ; preds = %NodeBlock23
-;CHECK-NEXT: %SwitchLeaf16 = icmp eq i32 %tmp158, 10
-;CHECK-NEXT: br i1 %SwitchLeaf16, label %bb324, label %NewDefault
-
-;CHECK: NodeBlock13: ; preds = %NodeBlock37
-;CHECK-NEXT: %Pivot14 = icmp slt i32 %tmp158, 7
-;CHECK-NEXT: br i1 %Pivot14, label %NodeBlock, label %NodeBlock11
-
-;CHECK: NodeBlock11: ; preds = %NodeBlock13
-;CHECK-NEXT: %Pivot12 = icmp slt i32 %tmp158, 8
-;CHECK-NEXT: br i1 %Pivot12, label %LeafBlock3, label %NodeBlock9
-
-;CHECK: NodeBlock9: ; preds = %NodeBlock11
-;CHECK-NEXT: %Pivot10 = icmp slt i32 %tmp158, 9
-;CHECK-NEXT: br i1 %Pivot10, label %LeafBlock5, label %LeafBlock7
-
-;CHECK: LeafBlock7: ; preds = %NodeBlock9
-;CHECK-NEXT: %SwitchLeaf8 = icmp eq i32 %tmp158, 9
-;CHECK-NEXT: br i1 %SwitchLeaf8, label %bb322, label %NewDefault
-
-;CHECK: LeafBlock5: ; preds = %NodeBlock9
-;CHECK-NEXT: %SwitchLeaf6 = icmp eq i32 %tmp158, 8
-;CHECK-NEXT: br i1 %SwitchLeaf6, label %bb338, label %NewDefault
-
-;CHECK: LeafBlock3: ; preds = %NodeBlock11
-;CHECK-NEXT: %SwitchLeaf4 = icmp eq i32 %tmp158, 7
-;CHECK-NEXT: br i1 %SwitchLeaf4, label %bb, label %NewDefault
-
-;CHECK: NodeBlock: ; preds = %NodeBlock13
-;CHECK-NEXT: %Pivot = icmp slt i32 %tmp158, 0
-;CHECK-NEXT: br i1 %Pivot, label %LeafBlock, label %LeafBlock1
-
-;CHECK: LeafBlock1: ; preds = %NodeBlock
-;CHECK-NEXT: %SwitchLeaf2 = icmp ule i32 %tmp158, 6
-;CHECK-NEXT: br i1 %SwitchLeaf2, label %bb338, label %NewDefault
-
-;CHECK: LeafBlock: ; preds = %NodeBlock
-;CHECK-NEXT: %tmp158.off = add i32 %tmp158, 6
-;CHECK-NEXT: %SwitchLeaf = icmp ule i32 %tmp158.off, 4
-;CHECK-NEXT: br i1 %SwitchLeaf, label %bb338, label %NewDefault
+;CHECK: LeafBlock: ; preds = %NodeBlock
+;CHECK-NEXT: %tmp158.off = add i32 %tmp158, 6
+;CHECK-NEXT: %SwitchLeaf = icmp ule i32 %tmp158.off, 4
+;CHECK-NEXT: br i1 %SwitchLeaf, label %bb338, label %NewDefault
define i32 @main(i32 %tmp158) {
entry:
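
The regenerated CHECK lines above trace the balanced tree -lowerswitch builds: interior NodeBlocks pivot with a signed less-than on a median case value, and leaves finish with an equality or range test, falling through to NewDefault otherwise. In C-shaped pseudocode, the subtree for values >= 10 looks roughly like this (a sketch; returned values just name the target blocks from the checks):

/* Follows NodeBlock19 -> NodeBlock17 -> NodeBlock15 -> NodeBlock13 ->
 * LeafBlock11 from the CHECK lines above. */
int lowered_high_subtree(int v) {
  if (v < 13) {                     /* NodeBlock9 / NodeBlock7 pivots */
    if (v < 11) return 324;         /* case 10 -> bb324 */
    return v < 12 ? 326 : 328;      /* cases 11, 12 -> bb326, bb328 */
  } else if (v < 14) {
    return 330;                     /* case 13 -> bb330 */
  } else if (v < 15) {
    return 332;                     /* case 14 -> bb332 */
  } else if (v == 15) {
    return 334;                     /* case 15 -> bb334 */
  }
  return 0;                         /* NewDefault */
}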
diff --git a/test/Transforms/MergeFunc/functions.ll b/test/Transforms/MergeFunc/functions.ll
new file mode 100644
index 0000000..006fdf5
--- /dev/null
+++ b/test/Transforms/MergeFunc/functions.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; Be sure we don't merge cross-referenced functions of the same type.
+
+; CHECK-LABEL: @left
+; CHECK-LABEL: entry-block
+; CHECK-LABEL: call void @right(i64 %p)
+define void @left(i64 %p) {
+entry-block:
+ call void @right(i64 %p)
+ call void @right(i64 %p)
+ call void @right(i64 %p)
+ call void @right(i64 %p)
+ ret void
+}
+
+; CHECK-LABEL: @right
+; CHECK-LABEL: entry-block
+; CHECK-LABEL: call void @left(i64 %p)
+define void @right(i64 %p) {
+entry-block:
+ call void @left(i64 %p)
+ call void @left(i64 %p)
+ call void @left(i64 %p)
+ call void @left(i64 %p)
+ ret void
+}
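
left and right above are structurally identical except that each calls the other, and the test makes sure mergefunc treats that cross-reference as a real difference: folding the pair into one function would redirect the mutual calls into self-calls. A C analogue (a sketch; names are illustrative):

/* Two bodies identical up to the callee name. A merger that ignored
 * the cross-reference would collapse them into one self-recursive
 * function, so the pass must keep them distinct. */
void right_fn(long p);
void left_fn(long p)  { right_fn(p); right_fn(p); right_fn(p); right_fn(p); }
void right_fn(long p) { left_fn(p);  left_fn(p);  left_fn(p);  left_fn(p);  }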
diff --git a/test/Transforms/MergeFunc/ranges.ll b/test/Transforms/MergeFunc/ranges.ll
new file mode 100644
index 0000000..e25ff1d
--- /dev/null
+++ b/test/Transforms/MergeFunc/ranges.ll
@@ -0,0 +1,43 @@
+; RUN: opt -mergefunc -S < %s | FileCheck %s
+define i1 @cmp_with_range(i8*, i8*) {
+ %v1 = load i8* %0, !range !0
+ %v2 = load i8* %1, !range !0
+ %out = icmp eq i8 %v1, %v2
+ ret i1 %out
+}
+
+define i1 @cmp_no_range(i8*, i8*) {
+; CHECK-LABEL: @cmp_no_range
+; CHECK-NEXT: %v1 = load i8* %0
+; CHECK-NEXT: %v2 = load i8* %1
+; CHECK-NEXT: %out = icmp eq i8 %v1, %v2
+; CHECK-NEXT: ret i1 %out
+ %v1 = load i8* %0
+ %v2 = load i8* %1
+ %out = icmp eq i8 %v1, %v2
+ ret i1 %out
+}
+
+define i1 @cmp_different_range(i8*, i8*) {
+; CHECK-LABEL: @cmp_different_range
+; CHECK-NEXT: %v1 = load i8* %0, !range !1
+; CHECK-NEXT: %v2 = load i8* %1, !range !1
+; CHECK-NEXT: %out = icmp eq i8 %v1, %v2
+; CHECK-NEXT: ret i1 %out
+ %v1 = load i8* %0, !range !1
+ %v2 = load i8* %1, !range !1
+ %out = icmp eq i8 %v1, %v2
+ ret i1 %out
+}
+
+define i1 @cmp_with_same_range(i8*, i8*) {
+; CHECK-LABEL: @cmp_with_same_range
+; CHECK: tail call i1 @cmp_with_range
+ %v1 = load i8* %0, !range !0
+ %v2 = load i8* %1, !range !0
+ %out = icmp eq i8 %v1, %v2
+ ret i1 %out
+}
+
+!0 = metadata !{i8 0, i8 2}
+!1 = metadata !{i8 5, i8 7}
diff --git a/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll b/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll
index 5780990..2430035 100644
--- a/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll
+++ b/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll
@@ -1,9 +1,10 @@
-; RUN: opt < %s -reassociate -instcombine -constprop -dce -S | not grep add
+; RUN: opt < %s -reassociate -S | FileCheck %s
-define i32 @test(i32 %A) {
- %X = add i32 %A, 1 ; <i32> [#uses=1]
- %Y = add i32 %A, 1 ; <i32> [#uses=1]
- %r = sub i32 %X, %Y ; <i32> [#uses=1]
- ret i32 %r
+define i32 @test1(i32 %A) {
+; CHECK-LABEL: test1
+; CHECK: ret i32 0
+ %X = add i32 %A, 1
+ %Y = add i32 %A, 1
+ %r = sub i32 %X, %Y
+ ret i32 %r
}
-
diff --git a/test/Transforms/Reassociate/2002-05-15-MissedTree.ll b/test/Transforms/Reassociate/2002-05-15-MissedTree.ll
index e8bccbd..5f3c920 100644
--- a/test/Transforms/Reassociate/2002-05-15-MissedTree.ll
+++ b/test/Transforms/Reassociate/2002-05-15-MissedTree.ll
@@ -1,9 +1,11 @@
-; RUN: opt < %s -reassociate -instcombine -constprop -die -S | not grep 5
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
-define i32 @test(i32 %A, i32 %B) {
- %W = add i32 %B, -5 ; <i32> [#uses=1]
- %Y = add i32 %A, 5 ; <i32> [#uses=1]
- %Z = add i32 %W, %Y ; <i32> [#uses=1]
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: test1
+; CHECK: %Z = add i32 %B, %A
+; CHECK: ret i32 %Z
+ %W = add i32 %B, -5
+ %Y = add i32 %A, 5
+ %Z = add i32 %W, %Y
ret i32 %Z
}
-
diff --git a/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll b/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll
index c18af5e..29c178f 100644
--- a/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll
+++ b/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll
@@ -1,12 +1,30 @@
+; RUN: opt < %s -reassociate -constprop -instcombine -dce -S | FileCheck %s
+
; With sub reassociation, constant folding can eliminate all of the constants.
-;
-; RUN: opt < %s -reassociate -constprop -instcombine -dce -S | not grep add
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %Z = sub i32 %A, %B
+; CHECK-NEXT: ret i32 %Z
-define i32 @test(i32 %A, i32 %B) {
- %W = add i32 5, %B ; <i32> [#uses=1]
- %X = add i32 -7, %A ; <i32> [#uses=1]
- %Y = sub i32 %X, %W ; <i32> [#uses=1]
- %Z = add i32 %Y, 12 ; <i32> [#uses=1]
- ret i32 %Z
+ %W = add i32 5, %B
+ %X = add i32 -7, %A
+ %Y = sub i32 %X, %W
+ %Z = add i32 %Y, 12
+ ret i32 %Z
}
+
+; With sub reassociation, constant folding can eliminate the two 12 constants.
+define i32 @test2(i32 %A, i32 %B, i32 %C, i32 %D) {
+; CHECK-LABEL: test2
+; CHECK-NEXT: %sum = add i32 %B, %A
+; CHECK-NEXT: %sum1 = add i32 %sum, %C
+; CHECK-NEXT: %Q = sub i32 %D, %sum1
+; CHECK-NEXT: ret i32 %Q
+ %M = add i32 %A, 12
+ %N = add i32 %M, %B
+ %O = add i32 %N, %C
+ %P = sub i32 %D, %O
+ %Q = add i32 %P, 12
+ ret i32 %Q
+}
diff --git a/test/Transforms/Reassociate/2002-05-15-SubReassociate2.ll b/test/Transforms/Reassociate/2002-05-15-SubReassociate2.ll
deleted file mode 100644
index 5848821..0000000
--- a/test/Transforms/Reassociate/2002-05-15-SubReassociate2.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; With sub reassociation, constant folding can eliminate the two 12 constants.
-;
-; RUN: opt < %s -reassociate -constprop -dce -S | not grep 12
-
-define i32 @test(i32 %A, i32 %B, i32 %C, i32 %D) {
- %M = add i32 %A, 12 ; <i32> [#uses=1]
- %N = add i32 %M, %B ; <i32> [#uses=1]
- %O = add i32 %N, %C ; <i32> [#uses=1]
- %P = sub i32 %D, %O ; <i32> [#uses=1]
- %Q = add i32 %P, 12 ; <i32> [#uses=1]
- ret i32 %Q
-}
-
diff --git a/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll b/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
index f66148b..f6cef35 100644
--- a/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
+++ b/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
@@ -1,23 +1,24 @@
-; RUN: opt < %s -reassociate -instcombine -S |\
-; RUN: grep "ret i32 0"
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
-define i32 @f(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
- %tmp.2 = add i32 %a4, %a3 ; <i32> [#uses=1]
- %tmp.4 = add i32 %tmp.2, %a2 ; <i32> [#uses=1]
- %tmp.6 = add i32 %tmp.4, %a1 ; <i32> [#uses=1]
- %tmp.8 = add i32 %tmp.6, %a0 ; <i32> [#uses=1]
- %tmp.11 = add i32 %a3, %a2 ; <i32> [#uses=1]
- %tmp.13 = add i32 %tmp.11, %a1 ; <i32> [#uses=1]
- %tmp.15 = add i32 %tmp.13, %a0 ; <i32> [#uses=1]
- %tmp.18 = add i32 %a2, %a1 ; <i32> [#uses=1]
- %tmp.20 = add i32 %tmp.18, %a0 ; <i32> [#uses=1]
- %tmp.23 = add i32 %a1, %a0 ; <i32> [#uses=1]
- %tmp.26 = sub i32 %tmp.8, %tmp.15 ; <i32> [#uses=1]
- %tmp.28 = add i32 %tmp.26, %tmp.20 ; <i32> [#uses=1]
- %tmp.30 = sub i32 %tmp.28, %tmp.23 ; <i32> [#uses=1]
- %tmp.32 = sub i32 %tmp.30, %a4 ; <i32> [#uses=1]
- %tmp.34 = sub i32 %tmp.32, %a2 ; <i32> [#uses=2]
- %T = mul i32 %tmp.34, %tmp.34 ; <i32> [#uses=1]
- ret i32 %T
-}
+define i32 @f1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+; CHECK-LABEL: f1
+; CHECK-NEXT: ret i32 0
+ %tmp.2 = add i32 %a4, %a3
+ %tmp.4 = add i32 %tmp.2, %a2
+ %tmp.6 = add i32 %tmp.4, %a1
+ %tmp.8 = add i32 %tmp.6, %a0
+ %tmp.11 = add i32 %a3, %a2
+ %tmp.13 = add i32 %tmp.11, %a1
+ %tmp.15 = add i32 %tmp.13, %a0
+ %tmp.18 = add i32 %a2, %a1
+ %tmp.20 = add i32 %tmp.18, %a0
+ %tmp.23 = add i32 %a1, %a0
+ %tmp.26 = sub i32 %tmp.8, %tmp.15
+ %tmp.28 = add i32 %tmp.26, %tmp.20
+ %tmp.30 = sub i32 %tmp.28, %tmp.23
+ %tmp.32 = sub i32 %tmp.30, %a4
+ %tmp.34 = sub i32 %tmp.32, %a2
+ %T = mul i32 %tmp.34, %tmp.34
+ ret i32 %T
+}
diff --git a/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll b/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
index 384cbc9..f783955 100644
--- a/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
+++ b/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
@@ -1,8 +1,12 @@
-; RUN: opt < %s -reassociate -disable-output
+; RUN: opt < %s -reassociate -S | FileCheck %s
-define void @foo() {
- %tmp162 = fsub <4 x float> zeroinitializer, zeroinitializer ; <<4 x float>> [#uses=1]
- %tmp164 = fmul <4 x float> zeroinitializer, %tmp162 ; <<4 x float>> [#uses=0]
- ret void
-}
+define <4 x float> @test1() {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT: %tmp2 = fmul <4 x float> zeroinitializer, %tmp1
+; CHECK-NEXT: ret <4 x float> %tmp2
+ %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer
+ %tmp2 = fmul <4 x float> zeroinitializer, %tmp1
+ ret <4 x float> %tmp2
+}
diff --git a/test/Transforms/Reassociate/basictest.ll b/test/Transforms/Reassociate/basictest.ll
index fda0ca6..d70bfcb 100644
--- a/test/Transforms/Reassociate/basictest.ll
+++ b/test/Transforms/Reassociate/basictest.ll
@@ -1,46 +1,47 @@
-; With reassociation, constant folding can eliminate the 12 and -12 constants.
-;
-; RUN: opt < %s -reassociate -gvn -instcombine -S | FileCheck %s
+; RUN: opt < %s -reassociate -gvn -instcombine -S | FileCheck %s
define i32 @test1(i32 %arg) {
- %tmp1 = sub i32 -12, %arg
- %tmp2 = add i32 %tmp1, 12
- ret i32 %tmp2
-; CHECK-LABEL: @test1(
+ %tmp1 = sub i32 -12, %arg
+ %tmp2 = add i32 %tmp1, 12
+ ret i32 %tmp2
+
+; CHECK-LABEL: @test1
; CHECK-NEXT: sub i32 0, %arg
; CHECK-NEXT: ret i32
}
define i32 @test2(i32 %reg109, i32 %reg1111) {
- %reg115 = add i32 %reg109, -30 ; <i32> [#uses=1]
- %reg116 = add i32 %reg115, %reg1111 ; <i32> [#uses=1]
- %reg117 = add i32 %reg116, 30 ; <i32> [#uses=1]
- ret i32 %reg117
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: add i32 %reg1111, %reg109
-; CHECK-NEXT: ret i32
+ %reg115 = add i32 %reg109, -30
+ %reg116 = add i32 %reg115, %reg1111
+ %reg117 = add i32 %reg116, 30
+ ret i32 %reg117
+
+; CHECK-LABEL: @test2
+; CHECK-NEXT: %reg117 = add i32 %reg1111, %reg109
+; CHECK-NEXT: ret i32 %reg117
}
-@e = external global i32 ; <i32*> [#uses=3]
-@a = external global i32 ; <i32*> [#uses=3]
-@b = external global i32 ; <i32*> [#uses=3]
-@c = external global i32 ; <i32*> [#uses=3]
-@f = external global i32 ; <i32*> [#uses=3]
+@e = external global i32
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@f = external global i32
define void @test3() {
- %A = load i32* @a ; <i32> [#uses=2]
- %B = load i32* @b ; <i32> [#uses=2]
- %C = load i32* @c ; <i32> [#uses=2]
- %t1 = add i32 %A, %B ; <i32> [#uses=1]
- %t2 = add i32 %t1, %C ; <i32> [#uses=1]
- %t3 = add i32 %C, %A ; <i32> [#uses=1]
- %t4 = add i32 %t3, %B ; <i32> [#uses=1]
- ; e = (a+b)+c;
- store i32 %t2, i32* @e
- ; f = (a+c)+b
- store i32 %t4, i32* @f
- ret void
-; CHECK-LABEL: @test3(
+ %A = load i32* @a
+ %B = load i32* @b
+ %C = load i32* @c
+ %t1 = add i32 %A, %B
+ %t2 = add i32 %t1, %C
+ %t3 = add i32 %C, %A
+ %t4 = add i32 %t3, %B
+ ; e = (a+b)+c;
+ store i32 %t2, i32* @e
+ ; f = (a+c)+b
+ store i32 %t4, i32* @f
+ ret void
+
+; CHECK-LABEL: @test3
; CHECK: add i32
; CHECK: add i32
; CHECK-NOT: add i32
@@ -48,19 +49,20 @@ define void @test3() {
}
define void @test4() {
- %A = load i32* @a ; <i32> [#uses=2]
- %B = load i32* @b ; <i32> [#uses=2]
- %C = load i32* @c ; <i32> [#uses=2]
- %t1 = add i32 %A, %B ; <i32> [#uses=1]
- %t2 = add i32 %t1, %C ; <i32> [#uses=1]
- %t3 = add i32 %C, %A ; <i32> [#uses=1]
- %t4 = add i32 %t3, %B ; <i32> [#uses=1]
- ; e = c+(a+b)
- store i32 %t2, i32* @e
- ; f = (c+a)+b
- store i32 %t4, i32* @f
- ret void
-; CHECK-LABEL: @test4(
+ %A = load i32* @a
+ %B = load i32* @b
+ %C = load i32* @c
+ %t1 = add i32 %A, %B
+ %t2 = add i32 %t1, %C
+ %t3 = add i32 %C, %A
+ %t4 = add i32 %t3, %B
+ ; e = c+(a+b)
+ store i32 %t2, i32* @e
+ ; f = (c+a)+b
+ store i32 %t4, i32* @f
+ ret void
+
+; CHECK-LABEL: @test4
; CHECK: add i32
; CHECK: add i32
; CHECK-NOT: add i32
@@ -68,19 +70,20 @@ define void @test4() {
}
define void @test5() {
- %A = load i32* @a ; <i32> [#uses=2]
- %B = load i32* @b ; <i32> [#uses=2]
- %C = load i32* @c ; <i32> [#uses=2]
- %t1 = add i32 %B, %A ; <i32> [#uses=1]
- %t2 = add i32 %t1, %C ; <i32> [#uses=1]
- %t3 = add i32 %C, %A ; <i32> [#uses=1]
- %t4 = add i32 %t3, %B ; <i32> [#uses=1]
- ; e = c+(b+a)
- store i32 %t2, i32* @e
- ; f = (c+a)+b
- store i32 %t4, i32* @f
- ret void
-; CHECK-LABEL: @test5(
+ %A = load i32* @a
+ %B = load i32* @b
+ %C = load i32* @c
+ %t1 = add i32 %B, %A
+ %t2 = add i32 %t1, %C
+ %t3 = add i32 %C, %A
+ %t4 = add i32 %t3, %B
+ ; e = c+(b+a)
+ store i32 %t2, i32* @e
+ ; f = (c+a)+b
+ store i32 %t4, i32* @f
+ ret void
+
+; CHECK-LABEL: @test5
; CHECK: add i32
; CHECK: add i32
; CHECK-NOT: add i32
@@ -88,60 +91,61 @@ define void @test5() {
}
define i32 @test6() {
- %tmp.0 = load i32* @a
- %tmp.1 = load i32* @b
- ; (a+b)
- %tmp.2 = add i32 %tmp.0, %tmp.1
- %tmp.4 = load i32* @c
- ; (a+b)+c
- %tmp.5 = add i32 %tmp.2, %tmp.4
- ; (a+c)
- %tmp.8 = add i32 %tmp.0, %tmp.4
- ; (a+c)+b
- %tmp.11 = add i32 %tmp.8, %tmp.1
- ; X ^ X = 0
- %RV = xor i32 %tmp.5, %tmp.11
- ret i32 %RV
-; CHECK-LABEL: @test6(
+ %tmp.0 = load i32* @a
+ %tmp.1 = load i32* @b
+ ; (a+b)
+ %tmp.2 = add i32 %tmp.0, %tmp.1
+ %tmp.4 = load i32* @c
+ ; (a+b)+c
+ %tmp.5 = add i32 %tmp.2, %tmp.4
+ ; (a+c)
+ %tmp.8 = add i32 %tmp.0, %tmp.4
+ ; (a+c)+b
+ %tmp.11 = add i32 %tmp.8, %tmp.1
+ ; X ^ X = 0
+ %RV = xor i32 %tmp.5, %tmp.11
+ ret i32 %RV
+
+; CHECK-LABEL: @test6
; CHECK: ret i32 0
}
; This should be one add and two multiplies.
define i32 @test7(i32 %A, i32 %B, i32 %C) {
- ; A*A*B + A*C*A
- %aa = mul i32 %A, %A
- %aab = mul i32 %aa, %B
- %ac = mul i32 %A, %C
- %aac = mul i32 %ac, %A
- %r = add i32 %aab, %aac
- ret i32 %r
-; CHECK-LABEL: @test7(
+ ; A*A*B + A*C*A
+ %aa = mul i32 %A, %A
+ %aab = mul i32 %aa, %B
+ %ac = mul i32 %A, %C
+ %aac = mul i32 %ac, %A
+ %r = add i32 %aab, %aac
+ ret i32 %r
+
+; CHECK-LABEL: @test7
; CHECK-NEXT: add i32 %C, %B
; CHECK-NEXT: mul i32
; CHECK-NEXT: mul i32
; CHECK-NEXT: ret i32
}
-
define i32 @test8(i32 %X, i32 %Y, i32 %Z) {
- %A = sub i32 0, %X
- %B = mul i32 %A, %Y
- ; (-X)*Y + Z -> Z-X*Y
- %C = add i32 %B, %Z
- ret i32 %C
-; CHECK-LABEL: @test8(
+ %A = sub i32 0, %X
+ %B = mul i32 %A, %Y
+ ; (-X)*Y + Z -> Z-X*Y
+ %C = add i32 %B, %Z
+ ret i32 %C
+
+; CHECK-LABEL: @test8
; CHECK-NEXT: %A = mul i32 %Y, %X
; CHECK-NEXT: %C = sub i32 %Z, %A
; CHECK-NEXT: ret i32 %C
}
-
; PR5458
define i32 @test9(i32 %X) {
%Y = mul i32 %X, 47
%Z = add i32 %Y, %Y
ret i32 %Z
-; CHECK-LABEL: @test9(
+; CHECK-LABEL: @test9
; CHECK-NEXT: mul i32 %X, 94
; CHECK-NEXT: ret i32
}
@@ -150,7 +154,7 @@ define i32 @test10(i32 %X) {
%Y = add i32 %X ,%X
%Z = add i32 %Y, %X
ret i32 %Z
-; CHECK-LABEL: @test10(
+; CHECK-LABEL: @test10
; CHECK-NEXT: mul i32 %X, 3
; CHECK-NEXT: ret i32
}
@@ -160,7 +164,7 @@ define i32 @test11(i32 %W) {
%Y = add i32 %X ,%X
%Z = add i32 %Y, %X
ret i32 %Z
-; CHECK-LABEL: @test11(
+; CHECK-LABEL: @test11
; CHECK-NEXT: mul i32 %W, 381
; CHECK-NEXT: ret i32
}
@@ -169,11 +173,10 @@ define i32 @test12(i32 %X) {
%A = sub i32 1, %X
%B = sub i32 2, %X
%C = sub i32 3, %X
-
%Y = add i32 %A ,%B
%Z = add i32 %Y, %C
ret i32 %Z
-; CHECK-LABEL: @test12(
+; CHECK-LABEL: @test12
; CHECK-NEXT: mul i32 %X, -3
; CHECK-NEXT: add i32{{.*}}, 6
; CHECK-NEXT: ret i32
@@ -185,7 +188,7 @@ define i32 @test13(i32 %X1, i32 %X2, i32 %X3) {
%C = mul i32 %X1, %X3 ; X1*X3
%D = add i32 %B, %C ; -X1*X2 + X1*X3 -> X1*(X3-X2)
ret i32 %D
-; CHECK-LABEL: @test13(
+; CHECK-LABEL: @test13
; CHECK-NEXT: sub i32 %X3, %X2
; CHECK-NEXT: mul i32 {{.*}}, %X1
; CHECK-NEXT: ret i32
@@ -197,9 +200,10 @@ define i32 @test14(i32 %X1, i32 %X2) {
%C = mul i32 %X2, -47 ; X2*-47
%D = add i32 %B, %C ; X1*47 + X2*-47 -> 47*(X1-X2)
ret i32 %D
-; CHECK-LABEL: @test14(
+
+; CHECK-LABEL: @test14
; CHECK-NEXT: sub i32 %X1, %X2
-; CHECK-NEXT: mul i32 {{.*}}, 47
+; CHECK-NEXT: mul i32 %tmp, 47
; CHECK-NEXT: ret i32
}
@@ -210,7 +214,6 @@ define i32 @test15(i32 %X1, i32 %X2, i32 %X3) {
%C = and i1 %A, %B
%D = select i1 %C, i32 %X1, i32 0
ret i32 %D
-; CHECK-LABEL: @test15(
+; CHECK-LABEL: @test15
; CHECK: and i1 %A, %B
}
-
diff --git a/test/Transforms/Reassociate/fp-commute.ll b/test/Transforms/Reassociate/fp-commute.ll
index 025689b..eac5b59 100644
--- a/test/Transforms/Reassociate/fp-commute.ll
+++ b/test/Transforms/Reassociate/fp-commute.ll
@@ -1,18 +1,19 @@
; RUN: opt -reassociate -S < %s | FileCheck %s
-target triple = "armv7-apple-ios"
-
declare void @use(float)
-; CHECK: test
-define void @test(float %x, float %y) {
-entry:
+define void @test1(float %x, float %y) {
+; CHECK-LABEL: test1
; CHECK: fmul float %x, %y
; CHECK: fmul float %x, %y
- %0 = fmul float %x, %y
- %1 = fmul float %y, %x
- %2 = fsub float %0, %1
- call void @use(float %0)
- call void @use(float %2)
+; CHECK: fsub float %1, %2
+; CHECK: call void @use(float %{{.*}})
+; CHECK: call void @use(float %{{.*}})
+
+ %1 = fmul float %x, %y
+ %2 = fmul float %y, %x
+ %3 = fsub float %1, %2
+ call void @use(float %1)
+ call void @use(float %3)
ret void
}
diff --git a/test/Transforms/Reassociate/inverses.ll b/test/Transforms/Reassociate/inverses.ll
index afe076c..8500cd8 100644
--- a/test/Transforms/Reassociate/inverses.ll
+++ b/test/Transforms/Reassociate/inverses.ll
@@ -32,3 +32,15 @@ define i32 @test3(i32 %b, i32 %a) {
; CHECK: %tmp.5 = add i32 %b, 1234
; CHECK: ret i32 %tmp.5
}
+
+define i32 @test4(i32 %b, i32 %a) {
+ %tmp.1 = add i32 %a, 1234
+ %tmp.2 = add i32 %b, %tmp.1
+ %tmp.4 = xor i32 %a, -1
+ ; (b+(a+1234))+~a -> b+1233
+ %tmp.5 = add i32 %tmp.2, %tmp.4
+ ret i32 %tmp.5
+; CHECK-LABEL: @test4(
+; CHECK: %tmp.5 = add i32 %b, 1233
+; CHECK: ret i32 %tmp.5
+}
diff --git a/test/Transforms/Reassociate/looptest.ll b/test/Transforms/Reassociate/looptest.ll
index 91723bc..aad3b20 100644
--- a/test/Transforms/Reassociate/looptest.ll
+++ b/test/Transforms/Reassociate/looptest.ll
@@ -18,6 +18,7 @@
declare i32 @printf(i8*, ...)
+; FIXME: No longer works.
define void @test(i32 %Num, i32* %Array) {
bb0:
%cond221 = icmp eq i32 0, %Num ; <i1> [#uses=3]
diff --git a/test/Transforms/Reassociate/mightymul.ll b/test/Transforms/Reassociate/mightymul.ll
index cfbc485..ae915da 100644
--- a/test/Transforms/Reassociate/mightymul.ll
+++ b/test/Transforms/Reassociate/mightymul.ll
@@ -1,7 +1,7 @@
-; RUN: opt < %s -reassociate
+; RUN: opt < %s -reassociate -disable-output
; PR13021
-define i32 @foo(i32 %x) {
+define i32 @test1(i32 %x) {
%t0 = mul i32 %x, %x
%t1 = mul i32 %t0, %t0
%t2 = mul i32 %t1, %t1
diff --git a/test/Transforms/Reassociate/multistep.ll b/test/Transforms/Reassociate/multistep.ll
index d794647..12eaeee 100644
--- a/test/Transforms/Reassociate/multistep.ll
+++ b/test/Transforms/Reassociate/multistep.ll
@@ -28,4 +28,3 @@ define i64 @multistep2(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-NEXT: ret
ret i64 %t3
}
-
diff --git a/test/Transforms/Reassociate/negation.ll b/test/Transforms/Reassociate/negation.ll
index 6a3dfd3..12d2c86 100644
--- a/test/Transforms/Reassociate/negation.ll
+++ b/test/Transforms/Reassociate/negation.ll
@@ -1,21 +1,31 @@
-; RUN: opt < %s -reassociate -instcombine -S | not grep sub
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
define i32 @test1(i32 %a, i32 %b, i32 %z) {
- %c = sub i32 0, %z ; <i32> [#uses=1]
- %d = mul i32 %a, %b ; <i32> [#uses=1]
- %e = mul i32 %c, %d ; <i32> [#uses=1]
- %f = mul i32 %e, 12345 ; <i32> [#uses=1]
- %g = sub i32 0, %f ; <i32> [#uses=1]
- ret i32 %g
+; CHECK-LABEL: test1
+; CHECK-NEXT: %e = mul i32 %a, 12345
+; CHECK-NEXT: %f = mul i32 %e, %b
+; CHECK-NEXT: %g = mul i32 %f, %z
+; CHECK-NEXT: ret i32 %g
+
+ %c = sub i32 0, %z
+ %d = mul i32 %a, %b
+ %e = mul i32 %c, %d
+ %f = mul i32 %e, 12345
+ %g = sub i32 0, %f
+ ret i32 %g
}
define i32 @test2(i32 %a, i32 %b, i32 %z) {
- %d = mul i32 %z, 40 ; <i32> [#uses=1]
- %c = sub i32 0, %d ; <i32> [#uses=1]
- %e = mul i32 %a, %c ; <i32> [#uses=1]
- %f = sub i32 0, %e ; <i32> [#uses=1]
- ret i32 %f
-}
+; CHECK-LABEL: test2
+; CHECK-NEXT: %e = mul i32 %a, 40
+; CHECK-NEXT: %f = mul i32 %e, %z
+; CHECK-NEXT: ret i32 %f
+ %d = mul i32 %z, 40
+ %c = sub i32 0, %d
+ %e = mul i32 %a, %c
+ %f = sub i32 0, %e
+ ret i32 %f
+}
diff --git a/test/Transforms/Reassociate/otherops.ll b/test/Transforms/Reassociate/otherops.ll
index d68d008..7718881 100644
--- a/test/Transforms/Reassociate/otherops.ll
+++ b/test/Transforms/Reassociate/otherops.ll
@@ -1,28 +1,42 @@
; Reassociation should apply to Add, Mul, And, Or, & Xor
;
-; RUN: opt < %s -reassociate -constprop -instcombine -die -S | not grep 12
+; RUN: opt < %s -reassociate -constprop -instcombine -die -S | FileCheck %s
define i32 @test_mul(i32 %arg) {
- %tmp1 = mul i32 12, %arg ; <i32> [#uses=1]
- %tmp2 = mul i32 %tmp1, 12 ; <i32> [#uses=1]
- ret i32 %tmp2
+; CHECK-LABEL: test_mul
+; CHECK-NEXT: %tmp2 = mul i32 %arg, 144
+; CHECK-NEXT: ret i32 %tmp2
+
+ %tmp1 = mul i32 12, %arg
+ %tmp2 = mul i32 %tmp1, 12
+ ret i32 %tmp2
}
define i32 @test_and(i32 %arg) {
- %tmp1 = and i32 14, %arg ; <i32> [#uses=1]
- %tmp2 = and i32 %tmp1, 14 ; <i32> [#uses=1]
- ret i32 %tmp2
+; CHECK-LABEL: test_and
+; CHECK-NEXT: %tmp2 = and i32 %arg, 14
+; CHECK-NEXT: ret i32 %tmp2
+
+ %tmp1 = and i32 14, %arg
+ %tmp2 = and i32 %tmp1, 14
+ ret i32 %tmp2
}
define i32 @test_or(i32 %arg) {
- %tmp1 = or i32 14, %arg ; <i32> [#uses=1]
- %tmp2 = or i32 %tmp1, 14 ; <i32> [#uses=1]
- ret i32 %tmp2
+; CHECK-LABEL: test_or
+; CHECK-NEXT: %tmp2 = or i32 %arg, 14
+; CHECK-NEXT: ret i32 %tmp2
+
+ %tmp1 = or i32 14, %arg
+ %tmp2 = or i32 %tmp1, 14
+ ret i32 %tmp2
}
define i32 @test_xor(i32 %arg) {
- %tmp1 = xor i32 12, %arg ; <i32> [#uses=1]
- %tmp2 = xor i32 %tmp1, 12 ; <i32> [#uses=1]
- ret i32 %tmp2
-}
+; CHECK-LABEL: test_xor
+; CHECK-NEXT: ret i32 %arg
+ %tmp1 = xor i32 12, %arg
+ %tmp2 = xor i32 %tmp1, 12
+ ret i32 %tmp2
+}
diff --git a/test/Transforms/Reassociate/shift-factor.ll b/test/Transforms/Reassociate/shift-factor.ll
index 73af5e5..8fbf1b9 100644
--- a/test/Transforms/Reassociate/shift-factor.ll
+++ b/test/Transforms/Reassociate/shift-factor.ll
@@ -1,12 +1,14 @@
; There should be exactly one shift and one add left.
-; RUN: opt < %s -reassociate -instcombine -S > %t
-; RUN: grep shl %t | count 1
-; RUN: grep add %t | count 1
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
-define i32 @test(i32 %X, i32 %Y) {
- %tmp.2 = shl i32 %X, 1 ; <i32> [#uses=1]
- %tmp.6 = shl i32 %Y, 1 ; <i32> [#uses=1]
- %tmp.4 = add i32 %tmp.6, %tmp.2 ; <i32> [#uses=1]
- ret i32 %tmp.4
-}
+define i32 @test1(i32 %X, i32 %Y) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp = add i32 %Y, %X
+; CHECK-NEXT: %tmp1 = shl i32 %tmp, 1
+; CHECK-NEXT: ret i32 %tmp1
+ %tmp.2 = shl i32 %X, 1
+ %tmp.6 = shl i32 %Y, 1
+ %tmp.4 = add i32 %tmp.6, %tmp.2
+ ret i32 %tmp.4
+}
diff --git a/test/Transforms/Reassociate/subtest.ll b/test/Transforms/Reassociate/subtest.ll
index 4c63d12..e6263d8 100644
--- a/test/Transforms/Reassociate/subtest.ll
+++ b/test/Transforms/Reassociate/subtest.ll
@@ -1,11 +1,26 @@
-; With sub reassociation, constant folding can eliminate the 12 and -12 constants.
-;
-; RUN: opt < %s -reassociate -instcombine -S | not grep 12
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
-define i32 @test(i32 %A, i32 %B) {
- %X = add i32 -12, %A ; <i32> [#uses=1]
- %Y = sub i32 %X, %B ; <i32> [#uses=1]
- %Z = add i32 %Y, 12 ; <i32> [#uses=1]
- ret i32 %Z
+; With sub reassociation, constant folding can eliminate the 12 and -12 constants.
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT: %Z = sub i32 %A, %B
+; CHECK-NEXT: ret i32 %Z
+ %X = add i32 -12, %A
+ %Y = sub i32 %X, %B
+ %Z = add i32 %Y, 12
+ ret i32 %Z
}
+; PR2047
+; With sub reassociation, constant folding can eliminate the uses of %a.
+define i32 @test2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: %sum = add i32 %c, %b
+; CHECK-NEXT: %tmp7 = sub i32 0, %sum
+; CHECK-NEXT: ret i32 %tmp7
+
+ %tmp3 = sub i32 %a, %b
+ %tmp5 = sub i32 %tmp3, %c
+ %tmp7 = sub i32 %tmp5, %a
+ ret i32 %tmp7
+}
diff --git a/test/Transforms/Reassociate/subtest2.ll b/test/Transforms/Reassociate/subtest2.ll
deleted file mode 100644
index 0513c5f..0000000
--- a/test/Transforms/Reassociate/subtest2.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; With sub reassociation, constant folding can eliminate the uses of %a.
-;
-; RUN: opt < %s -reassociate -instcombine -S | grep %a | count 1
-; PR2047
-
-define i32 @test(i32 %a, i32 %b, i32 %c) nounwind {
-entry:
- %tmp3 = sub i32 %a, %b ; <i32> [#uses=1]
- %tmp5 = sub i32 %tmp3, %c ; <i32> [#uses=1]
- %tmp7 = sub i32 %tmp5, %a ; <i32> [#uses=1]
- ret i32 %tmp7
-}
-
diff --git a/test/Transforms/SCCP/atomic.ll b/test/Transforms/SCCP/atomic.ll
new file mode 100644
index 0000000..60d4896
--- /dev/null
+++ b/test/Transforms/SCCP/atomic.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
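+; Make sure SCCP keeps the cmpxchg and the extraction of its success flag
+; intact rather than trying to fold them to constants.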
+define i1 @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg
+; CHECK: cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %val = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+ %res = extractvalue { i32, i1 } %val, 1
+ ret i1 %res
+}
diff --git a/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg b/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
index c420349..7184443 100644
--- a/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'AArch64' in targets:
+if not 'AArch64' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/ARM/lit.local.cfg b/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
index 5fc35d8..236e1d3 100644
--- a/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/R600/lit.local.cfg b/test/Transforms/SLPVectorizer/R600/lit.local.cfg
index 9e0ab99..4086e8d 100644
--- a/test/Transforms/SLPVectorizer/R600/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/R600/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'R600' in targets:
+if not 'R600' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/X86/addsub.ll b/test/Transforms/SLPVectorizer/X86/addsub.ll
new file mode 100644
index 0000000..8303bc8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,181 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+
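+; The scalar code below computes a[i] = (b[i] + c[i]) + (d[i] + e[i]) in the
+; even lanes and a[i] = (b[i] + c[i]) - (d[i] + e[i]) in the odd lanes, so
+; SLP should emit one vector add, one vector sub, and a shuffle blending them.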
+; CHECK-LABEL: @addsub
+; CHECK: %5 = add <4 x i32> %3, %4
+; CHECK: %6 = add <4 x i32> %2, %5
+; CHECK: %7 = sub <4 x i32> %2, %5
+; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+entry:
+ %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
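+; Same pattern as @addsub, but with the sub in the even lanes and the add in
+; the odd lanes, so the shuffle blends the two results the other way around.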
+; CHECK-LABEL: @subadd
+; CHECK: %5 = add <4 x i32> %3, %4
+; CHECK: %6 = sub <4 x i32> %2, %5
+; CHECK: %7 = add <4 x i32> %2, %5
+; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+entry:
+ %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %sub = sub nsw i32 %add, %add1
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+ %add2 = add nsw i32 %4, %5
+ %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %6, %7
+ %add4 = add nsw i32 %add2, %add3
+ store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %sub7 = sub nsw i32 %add5, %add6
+ store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %add10 = add nsw i32 %add8, %add9
+ store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
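+; Alternating fadd/fsub over the four float lanes should likewise become one
+; vector fadd, one vector fsub, and a blending shuffle.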
+; CHECK-LABEL: @faddfsub
+; CHECK: %2 = fadd <4 x float> %0, %1
+; CHECK: %3 = fsub <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %sub = fsub float %2, %3
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %add1 = fadd float %4, %5
+ store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %sub2 = fsub float %6, %7
+ store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
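+; As @faddfsub, with the fsub in the even lanes and the fadd in the odd ones.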
+; CHECK-LABEL: @fsubfadd
+; CHECK: %2 = fsub <4 x float> %0, %1
+; CHECK: %3 = fadd <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %sub = fsub float %0, %1
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %add = fadd float %2, %3
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %sub1 = fsub float %4, %5
+ store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %add2 = fadd float %6, %7
+ store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
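+; Here the operations are fadd, fadd, fadd, fsub: they do not alternate, so
+; no addsub pattern exists and nothing should be vectorized.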
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind uwtable
+define void @No_faddfsub() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %add1 = fadd float %2, %3
+ store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %add2 = fadd float %4, %5
+ store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %sub = fsub float %6, %7
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/Transforms/SLPVectorizer/X86/gep.ll b/test/Transforms/SLPVectorizer/X86/gep.ll
new file mode 100644
index 0000000..9e105ec
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test if SLP can handle GEP expressions.
+; The test performs the following actions:
+; x->first = y->first + 16
+; x->second = y->second + 16
+
+; CHECK-LABEL: foo1
+; CHECK: <2 x i32*>
+define void @foo1 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y) {
+ %1 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 0
+ %2 = load i32** %1, align 8
+ %3 = getelementptr inbounds i32* %2, i64 16
+ %4 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 0
+ store i32* %3, i32** %4, align 8
+ %5 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 1
+ %6 = load i32** %5, align 8
+ %7 = getelementptr inbounds i32* %6, i64 16
+ %8 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 1
+ store i32* %7, i32** %8, align 8
+ ret void
+}
+
+; Test that we don't vectorize GEP expressions if the indices are not constants.
+; We can't produce efficient code in that case.
+; CHECK-LABEL: foo2
+; CHECK-NOT: <2 x i32*>
+define void @foo2 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y, i32 %i) {
+ %1 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 0
+ %2 = load i32** %1, align 8
+ %3 = getelementptr inbounds i32* %2, i32 %i
+ %4 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 0
+ store i32* %3, i32** %4, align 8
+ %5 = getelementptr inbounds { i32*, i32* }* %y, i64 0, i32 1
+ %6 = load i32** %5, align 8
+ %7 = getelementptr inbounds i32* %6, i32 %i
+ %8 = getelementptr inbounds { i32*, i32* }* %x, i64 0, i32 1
+ store i32* %7, i32** %8, align 8
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
index 30c5093..937252f 100644
--- a/test/Transforms/SLPVectorizer/X86/intrinsic.ll
+++ b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -117,3 +117,270 @@ entry:
; CHECK: store <4 x i32>
; CHECK: ret
}
+
+declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone
+
+define void @vec_ctlz_i32(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 true) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 true) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_ctlz_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.ctlz.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
+
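+; Negative test: the i1 is_zero_undef flag differs between the lanes (true
+; for lanes 0 and 2, false for lanes 1 and 3), so the ctlz calls must not be
+; combined into a single vector call.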
+define void @vec_ctlz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 false) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 false) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_ctlz_i32_neg(
+; CHECK-NOT: call <4 x i32> @llvm.ctlz.v4i32
+
+}
+
+
+declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone
+
+define void @vec_cttz_i32(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 true) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 true) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_cttz_i32(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: call <4 x i32> @llvm.cttz.v4i32
+; CHECK: store <4 x i32>
+; CHECK: ret
+}
+
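+; Negative test: as with ctlz above, the i1 flag differs between the lanes,
+; so the cttz calls must stay scalar.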
+define void @vec_cttz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+entry:
+ %i0 = load i32* %a, align 4
+ %i1 = load i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+ %i2 = load i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+ %i3 = load i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 false) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+ %i4 = load i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+ %i5 = load i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+ %i6 = load i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+ %i7 = load i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 false) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_cttz_i32_neg(
+; CHECK-NOT: call <4 x i32> @llvm.cttz.v4i32
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+define void @vec_powi_f32(float* %a, float* %b, float* %c, i32 %P) {
+entry:
+ %i0 = load float* %a, align 4
+ %i1 = load float* %b, align 4
+ %add1 = fadd float %i0, %i1
+ %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds float* %a, i32 1
+ %i2 = load float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float* %b, i32 1
+ %i3 = load float* %arrayidx3, align 4
+ %add2 = fadd float %i2, %i3
+ %call2 = tail call float @llvm.powi.f32(float %add2,i32 %P) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds float* %a, i32 2
+ %i4 = load float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float* %b, i32 2
+ %i5 = load float* %arrayidx5, align 4
+ %add3 = fadd float %i4, %i5
+ %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds float* %a, i32 3
+ %i6 = load float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float* %b, i32 3
+ %i7 = load float* %arrayidx7, align 4
+ %add4 = fadd float %i6, %i7
+ %call4 = tail call float @llvm.powi.f32(float %add4,i32 %P) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_powi_f32(
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: call <4 x float> @llvm.powi.v4f32
+; CHECK: store <4 x float>
+; CHECK: ret
+}
+
+
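+; Negative test: the exponent operand differs between the lanes (%P vs. %Q),
+; so the powi calls must not be vectorized.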
+define void @vec_powi_f32_neg(float* %a, float* %b, float* %c, i32 %P, i32 %Q) {
+entry:
+ %i0 = load float* %a, align 4
+ %i1 = load float* %b, align 4
+ %add1 = fadd float %i0, %i1
+ %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds float* %a, i32 1
+ %i2 = load float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float* %b, i32 1
+ %i3 = load float* %arrayidx3, align 4
+ %add2 = fadd float %i2, %i3
+ %call2 = tail call float @llvm.powi.f32(float %add2,i32 %Q) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds float* %a, i32 2
+ %i4 = load float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float* %b, i32 2
+ %i5 = load float* %arrayidx5, align 4
+ %add3 = fadd float %i4, %i5
+ %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds float* %a, i32 3
+ %i6 = load float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float* %b, i32 3
+ %i7 = load float* %arrayidx7, align 4
+ %add4 = fadd float %i6, %i7
+ %call4 = tail call float @llvm.powi.f32(float %add4,i32 %Q) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+; CHECK-LABEL: @vec_powi_f32_neg(
+; CHECK-NOT: call <4 x float> @llvm.powi.v4f32
+}
diff --git a/test/Transforms/SLPVectorizer/X86/lit.local.cfg b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/SLPVectorizer/X86/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/X86/pr19657.ll b/test/Transforms/SLPVectorizer/X86/pr19657.ll
new file mode 100644
index 0000000..9352308
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -O1 -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK: load <2 x double>*
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+
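+; Each block of scalar code below loads x[i] three times, adds the three
+; values, and stores the sum back to x[i]; adjacent blocks should be combined
+; into <2 x double> loads, fadds, and stores.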
+; Function Attrs: nounwind uwtable
+define void @foo(double* %x) #0 {
+ %1 = alloca double*, align 8
+ store double* %x, double** %1, align 8
+ %2 = load double** %1, align 8
+ %3 = getelementptr inbounds double* %2, i64 0
+ %4 = load double* %3, align 8
+ %5 = load double** %1, align 8
+ %6 = getelementptr inbounds double* %5, i64 0
+ %7 = load double* %6, align 8
+ %8 = fadd double %4, %7
+ %9 = load double** %1, align 8
+ %10 = getelementptr inbounds double* %9, i64 0
+ %11 = load double* %10, align 8
+ %12 = fadd double %8, %11
+ %13 = load double** %1, align 8
+ %14 = getelementptr inbounds double* %13, i64 0
+ store double %12, double* %14, align 8
+ %15 = load double** %1, align 8
+ %16 = getelementptr inbounds double* %15, i64 1
+ %17 = load double* %16, align 8
+ %18 = load double** %1, align 8
+ %19 = getelementptr inbounds double* %18, i64 1
+ %20 = load double* %19, align 8
+ %21 = fadd double %17, %20
+ %22 = load double** %1, align 8
+ %23 = getelementptr inbounds double* %22, i64 1
+ %24 = load double* %23, align 8
+ %25 = fadd double %21, %24
+ %26 = load double** %1, align 8
+ %27 = getelementptr inbounds double* %26, i64 1
+ store double %25, double* %27, align 8
+ %28 = load double** %1, align 8
+ %29 = getelementptr inbounds double* %28, i64 2
+ %30 = load double* %29, align 8
+ %31 = load double** %1, align 8
+ %32 = getelementptr inbounds double* %31, i64 2
+ %33 = load double* %32, align 8
+ %34 = fadd double %30, %33
+ %35 = load double** %1, align 8
+ %36 = getelementptr inbounds double* %35, i64 2
+ %37 = load double* %36, align 8
+ %38 = fadd double %34, %37
+ %39 = load double** %1, align 8
+ %40 = getelementptr inbounds double* %39, i64 2
+ store double %38, double* %40, align 8
+ %41 = load double** %1, align 8
+ %42 = getelementptr inbounds double* %41, i64 3
+ %43 = load double* %42, align 8
+ %44 = load double** %1, align 8
+ %45 = getelementptr inbounds double* %44, i64 3
+ %46 = load double* %45, align 8
+ %47 = fadd double %43, %46
+ %48 = load double** %1, align 8
+ %49 = getelementptr inbounds double* %48, i64 3
+ %50 = load double* %49, align 8
+ %51 = fadd double %47, %50
+ %52 = load double** %1, align 8
+ %53 = getelementptr inbounds double* %52, i64 3
+ store double %51, double* %53, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/SLPVectorizer/XCore/lit.local.cfg b/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
index 4d17d46..bb48713 100644
--- a/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'XCore' in targets:
+if not 'XCore' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SROA/slice-order-independence.ll b/test/Transforms/SROA/slice-order-independence.ll
new file mode 100644
index 0000000..364ef85
--- /dev/null
+++ b/test/Transforms/SROA/slice-order-independence.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+; Check that the chosen type for a split is independent of the order of the
+; slices, even in the case of types that are skipped because their width is
+; not a byte-width multiple.
+define void @skipped_inttype_first({ i16*, i32 }*) {
+; CHECK-LABEL: @skipped_inttype_first
+; CHECK: alloca i8*
+ %arg = alloca { i16*, i32 }, align 8
+ %2 = bitcast { i16*, i32 }* %0 to i8*
+ %3 = bitcast { i16*, i32 }* %arg to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %3, i8* %2, i32 16, i32 8, i1 false)
+ %b = getelementptr inbounds { i16*, i32 }* %arg, i64 0, i32 0
+ %pb0 = bitcast i16** %b to i63*
+ %b0 = load i63* %pb0
+ %pb1 = bitcast i16** %b to i8**
+ %b1 = load i8** %pb1
+ ret void
+}
+
+define void @skipped_inttype_last({ i16*, i32 }*) {
+; CHECK-LABEL: @skipped_inttype_last
+; CHECK: alloca i8*
+ %arg = alloca { i16*, i32 }, align 8
+ %2 = bitcast { i16*, i32 }* %0 to i8*
+ %3 = bitcast { i16*, i32 }* %arg to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %3, i8* %2, i32 16, i32 8, i1 false)
+ %b = getelementptr inbounds { i16*, i32 }* %arg, i64 0, i32 0
+ %pb1 = bitcast i16** %b to i8**
+ %b1 = load i8** %pb1
+ %pb0 = bitcast i16** %b to i63*
+ %b0 = load i63* %pb0
+ ret void
+}
diff --git a/test/Transforms/SROA/slice-width.ll b/test/Transforms/SROA/slice-width.ll
new file mode 100644
index 0000000..179780b
--- /dev/null
+++ b/test/Transforms/SROA/slice-width.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @no_split_on_non_byte_width(i32) {
+; This tests that allocas are not split into slices whose width is not a byte-width multiple.
+ %arg = alloca i32 , align 8
+ store i32 %0, i32* %arg
+ br label %load_i32
+
+load_i32:
+; CHECK-LABEL: load_i32:
+; CHECK-NOT: bitcast {{.*}} to i1
+; CHECK-NOT: zext i1
+ %r0 = load i32* %arg
+ br label %load_i1
+
+load_i1:
+; CHECK-LABEL: load_i1:
+; CHECK: bitcast {{.*}} to i1
+ %p1 = bitcast i32* %arg to i1*
+ %t1 = load i1* %p1
+ ret void
+}
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg
index 40532cd..a5e90f8 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'NVPTX' in targets:
+if not 'NVPTX' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index 850fc4c..c07440c 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -1,4 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX
; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR
@@ -20,6 +19,90 @@ target triple = "nvptx64-unknown-unknown"
define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) {
.preheader:
+ %0 = sext i32 %y to i64
+ %1 = sext i32 %x to i64
+ %2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0
+ %3 = addrspacecast float addrspace(3)* %2 to float*
+ %4 = load float* %3, align 4
+ %5 = fadd float %4, 0.000000e+00
+ %6 = add i32 %y, 1
+ %7 = sext i32 %6 to i64
+ %8 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7
+ %9 = addrspacecast float addrspace(3)* %8 to float*
+ %10 = load float* %9, align 4
+ %11 = fadd float %5, %10
+ %12 = add i32 %x, 1
+ %13 = sext i32 %12 to i64
+ %14 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0
+ %15 = addrspacecast float addrspace(3)* %14 to float*
+ %16 = load float* %15, align 4
+ %17 = fadd float %11, %16
+ %18 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7
+ %19 = addrspacecast float addrspace(3)* %18 to float*
+ %20 = load float* %19, align 4
+ %21 = fadd float %17, %20
+ store float %21, float* %output, align 4
+ ret void
+}
+; PTX-LABEL: sum_of_array(
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+
+; IR-LABEL: @sum_of_array(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
+
+; @sum_of_array2 is very similar to @sum_of_array. The only difference is in
+; the order of "sext" and "add" when computing the array indices. @sum_of_array
+; computes add before sext, e.g., array[sext(x + 1)][sext(y + 1)], while
+; @sum_of_array2 computes sext before add,
+; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be
+; able to extract constant offsets from both forms.
+define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) {
+.preheader:
+ %0 = sext i32 %y to i64
+ %1 = sext i32 %x to i64
+ %2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0
+ %3 = addrspacecast float addrspace(3)* %2 to float*
+ %4 = load float* %3, align 4
+ %5 = fadd float %4, 0.000000e+00
+ %6 = add i64 %0, 1
+ %7 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6
+ %8 = addrspacecast float addrspace(3)* %7 to float*
+ %9 = load float* %8, align 4
+ %10 = fadd float %5, %9
+ %11 = add i64 %1, 1
+ %12 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0
+ %13 = addrspacecast float addrspace(3)* %12 to float*
+ %14 = load float* %13, align 4
+ %15 = fadd float %10, %14
+ %16 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6
+ %17 = addrspacecast float addrspace(3)* %16 to float*
+ %18 = load float* %17, align 4
+ %19 = fadd float %15, %18
+ store float %19, float* %output, align 4
+ ret void
+}
+; PTX-LABEL: sum_of_array2(
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+
+; IR-LABEL: @sum_of_array2(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
+
+; Similar to @sum_of_array, but extends array indices using zext instead of
+; sext, e.g., array[zext(x + 1)][zext(y + 1)].
+define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) {
+.preheader:
%0 = zext i32 %y to i64
%1 = zext i32 %x to i64
%2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0
@@ -45,15 +128,14 @@ define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) {
store float %21, float* %output, align 4
ret void
}
-
-; PTX-LABEL: sum_of_array(
+; PTX-LABEL: sum_of_array3(
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
-; IR-LABEL: @sum_of_array(
-; IR: [[BASE_PTR:%[0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i32 %x, i32 %y
+; IR-LABEL: @sum_of_array3(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index 2e50f5f..ed40c7e 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -23,71 +23,94 @@ entry:
%p = getelementptr inbounds [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
ret double* %p
}
-; CHECK-LABEL: @struct
-; CHECK: getelementptr [1024 x %struct.S]* @struct_array, i64 0, i32 %i, i32 1
+; CHECK-LABEL: @struct(
+; CHECK: getelementptr [1024 x %struct.S]* @struct_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1
-; We should be able to trace into sext/zext if it's directly used as a GEP
-; index.
-define float* @sext_zext(i32 %i, i32 %j) {
+; We should be able to trace into sext(a + b) if a + b is non-negative
+; (e.g., used as an index of an inbounds GEP) and one of a and b is
+; non-negative.
+define float* @sext_add(i32 %i, i32 %j) {
entry:
- %i1 = add i32 %i, 1
- %j2 = add i32 %j, 2
- %i1.ext = sext i32 %i1 to i64
- %j2.ext = zext i32 %j2 to i64
- %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i1.ext, i64 %j2.ext
+ %0 = add i32 %i, 1
+ %1 = sext i32 %0 to i64 ; for an inbounds GEP, sext(i + 1) = sext(i) + 1
+ %2 = add i32 %j, -2
+ ; However, even for an inbounds GEP, sext(j + -2) != sext(j) + -2, e.g., j = INT_MIN
+ %3 = sext i32 %2 to i64
+ %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3
ret float* %p
}
-; CHECK-LABEL: @sext_zext
-; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i32 %i, i32 %j
-; CHECK: getelementptr float* %{{[0-9]+}}, i64 34
+; CHECK-LABEL: @sext_add(
+; CHECK-NOT: = add
+; CHECK: add i32 %j, -2
+; CHECK: sext
+; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* %{{[a-zA-Z0-9]+}}, i64 32
; We should be able to trace into sext/zext if it can be distributed to both
; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b)
+;
+; This test verifies we can transform
+; gep base, a + sext(b +nsw 1), c + zext(d +nuw 1)
+; to
+; gep base, a + sext(b), c + zext(d); gep ..., 1 * 32 + 1
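+;
+; (Illustrative counterexample for why nsw/nuw is required: with i32 b =
+; INT_MAX, b + 1 wraps to INT_MIN, so sext(b + 1) = -2^31 while
+; sext(b) + 1 = 2^31; sext only distributes over an add that cannot overflow.)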
define float* @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) {
%b1 = add nsw i32 %b, 1
%b2 = sext i32 %b1 to i64
- %i = add i64 %a, %b2
+ %i = add i64 %a, %b2 ; i = a + sext(b +nsw 1)
%d1 = add nuw i32 %d, 1
%d2 = zext i32 %d1 to i64
- %j = add i64 %c, %d2
+ %j = add i64 %c, %d2 ; j = c + zext(d +nuw 1)
%p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j
ret float* %p
}
-; CHECK-LABEL: @ext_add_no_overflow
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}
+; CHECK-LABEL: @ext_add_no_overflow(
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; CHECK: getelementptr float* [[BASE_PTR]], i64 33
-; Similar to @ext_add_no_overflow, we should be able to trace into sext/zext if
-; its operand is an "or" instruction.
-define float* @ext_or(i64 %a, i32 %b) {
+; Verifies we handle nested sext/zext correctly.
+define void @sext_zext(i32 %a, i32 %b, float** %out1, float** %out2) {
+entry:
+ %0 = add nsw nuw i32 %a, 1
+ %1 = sext i32 %0 to i48
+ %2 = zext i48 %1 to i64 ; zext(sext(a +nsw nuw 1)) = zext(sext(a)) + 1
+ %3 = add nsw i32 %b, 2
+ %4 = sext i32 %3 to i48
+ %5 = zext i48 %4 to i64 ; zext(sext(b +nsw 2)) != zext(sext(b)) + 2
+ %p1 = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %2, i64 %5
+ store float* %p1, float** %out1
+ %6 = add nuw i32 %a, 3
+ %7 = zext i32 %6 to i48
+ %8 = sext i48 %7 to i64 ; sext(zext(a +nuw 3)) = zext(a +nuw 3) = zext(a) + 3
+ %9 = add nsw i32 %b, 4
+ %10 = zext i32 %9 to i48
+ %11 = sext i48 %10 to i64 ; sext(zext(b +nsw 4)) != zext(b) + 4
+ %p2 = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %8, i64 %11
+ store float* %p2, float** %out2
+ ret void
+}
+; CHECK-LABEL: @sext_zext(
+; CHECK: [[BASE_PTR_1:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* [[BASE_PTR_1]], i64 32
+; CHECK: [[BASE_PTR_2:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* [[BASE_PTR_2]], i64 96
+
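+; (Illustrative arithmetic for the checks above: one unit of the row index is
+; worth 32 floats, so the extractable +1 and +3 fold to 1 * 32 = 32 and
+; 3 * 32 = 96; the column constants +2 and +4 are not extractable, per the
+; inline notes.)
+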
+; Similar to @ext_add_no_overflow, we should be able to trace into sext/zext
+; if its operand is an OR and the two operands of the OR have no common bits.
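+; (Illustrative arithmetic: b << 2 has its low two bits clear, so
+; (b << 2) | 1 == (b << 2) + 1 and the 1 is extractable, while bit 2 of
+; b << 2 may be set, so (b << 2) | 4 is not an add in disguise.)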
+define float* @sext_or(i64 %a, i32 %b) {
entry:
%b1 = shl i32 %b, 2
- %b2 = or i32 %b1, 1
- %b3 = or i32 %b1, 2
- %b2.ext = sext i32 %b2 to i64
+ %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits
+ %b3 = or i32 %b1, 4 ; (b << 2) and 4 may have common bits
+ %b2.ext = zext i32 %b2 to i64
%b3.ext = sext i32 %b3 to i64
%i = add i64 %a, %b2.ext
%j = add i64 %a, %b3.ext
%p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j
ret float* %p
}
-; CHECK-LABEL: @ext_or
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}}
-; CHECK: getelementptr float* [[BASE_PTR]], i64 34
-
-; We should treat "or" with no common bits (%k) as "add", and leave "or" with
-; potentially common bits (%l) as is.
-define float* @or(i64 %i) {
-entry:
- %j = shl i64 %i, 2
- %k = or i64 %j, 3 ; no common bits
- %l = or i64 %j, 4 ; potentially common bits
- %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %k, i64 %l
- ret float* %p
-}
-; CHECK-LABEL: @or
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %j, i64 %l
-; CHECK: getelementptr float* [[BASE_PTR]], i64 96
+; CHECK-LABEL: @sext_or(
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* [[BASE_PTR]], i64 32
; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b +
; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't
@@ -100,11 +123,28 @@ entry:
store i64 %b5, i64* %out
ret float* %p
}
-; CHECK-LABEL: @expr
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %0, i64 0
+; CHECK-LABEL: @expr(
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 0
; CHECK: getelementptr float* [[BASE_PTR]], i64 160
; CHECK: store i64 %b5, i64* %out
+; d + sext(a +nsw (b +nsw (c +nsw 8))) => (d + sext(a) + sext(b) + sext(c)) + 8
+define float* @sext_expr(i32 %a, i32 %b, i32 %c, i64 %d) {
+entry:
+ %0 = add nsw i32 %c, 8
+ %1 = add nsw i32 %b, %0
+ %2 = add nsw i32 %a, %1
+ %3 = sext i32 %2 to i64
+ %i = add i64 %d, %3
+ %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i
+ ret float* %p
+}
+; CHECK-LABEL: @sext_expr(
+; CHECK: sext i32
+; CHECK: sext i32
+; CHECK: sext i32
+; CHECK: getelementptr float* %{{[a-zA-Z0-9]+}}, i64 8
+
; Verifies we handle "sub" correctly.
define float* @sub(i64 %i, i64 %j) {
%i2 = sub i64 %i, 5 ; i - 5
@@ -112,9 +152,9 @@ define float* @sub(i64 %i, i64 %j) {
%p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i2, i64 %j2
ret float* %p
}
-; CHECK-LABEL: @sub
-; CHECK: %[[j2:[0-9]+]] = sub i64 0, %j
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]]
+; CHECK-LABEL: @sub(
+; CHECK: %[[j2:[a-zA-Z0-9]+]] = sub i64 0, %j
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]]
; CHECK: getelementptr float* [[BASE_PTR]], i64 -155
%struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed
@@ -130,8 +170,92 @@ entry:
%arrayidx3 = getelementptr inbounds [1024 x %struct.Packed]* %s, i64 0, i64 %idxprom2, i32 1, i64 %idxprom
ret i64* %arrayidx3
}
-; CHECK-LABEL: @packed_struct
-; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [1024 x %struct.Packed]* %s, i64 0, i32 %i, i32 1, i32 %j
-; CHECK: [[CASTED_PTR:%[0-9]+]] = bitcast i64* [[BASE_PTR]] to i8*
+; CHECK-LABEL: @packed_struct(
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [1024 x %struct.Packed]* %s, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: [[CASTED_PTR:%[a-zA-Z0-9]+]] = bitcast i64* [[BASE_PTR]] to i8*
; CHECK: %uglygep = getelementptr i8* [[CASTED_PTR]], i64 100
; CHECK: bitcast i8* %uglygep to i64*
+
+; We shouldn't be able to extract the 8 from "zext(a +nuw (b + 8))",
+; because "zext(b + 8) != zext(b) + 8"
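+; (Illustrative counterexample: with i32 b = 0xFFFFFFF8, b + 8 wraps to 0, so
+; zext(b + 8) = 0 while zext(b) + 8 = 0x100000000.)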
+define float* @zext_expr(i32 %a, i32 %b) {
+entry:
+ %0 = add i32 %b, 8
+ %1 = add nuw i32 %a, %0
+ %i = zext i32 %1 to i64
+ %p = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i
+ ret float* %p
+}
+; CHECK-LABEL: @zext_expr(
+; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i
+
+; Per http://llvm.org/docs/LangRef.html#id181, the indices of an off-bounds gep
+; are considered sign-extended to the pointer size. Therefore,
+; gep base, (add i32 a, b) != gep (gep base, i32 a), i32 b
+; because
+; sext(a + b) != sext(a) + sext(b)
+;
+; This test verifies we do not illegitimately extract the 8 from
+; gep base, (i32 a + 8)
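+; (Illustrative counterexample: with i32 a = INT_MAX, a + 8 wraps to
+; INT_MIN + 7, so sext(a + 8) = -2^31 + 7 while sext(a) + 8 = 2^31 + 7.)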
+define float* @i32_add(i32 %a) {
+entry:
+ %i = add i32 %a, 8
+ %p = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i32 %i
+ ret float* %p
+}
+; CHECK-LABEL: @i32_add(
+; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}}
+; CHECK-NOT: getelementptr
+
+; Verifies that we compute the correct constant offset when the index is
+; sign-extended and then zero-extended. The old version of our code failed to
+; handle this case because it simply computed the constant offset as the
+; sign-extended value of the constant part of the GEP index.
+define float* @apint(i1 %a) {
+entry:
+ %0 = add nsw nuw i1 %a, 1
+ %1 = sext i1 %0 to i4
+ %2 = zext i4 %1 to i64 ; zext (sext i1 1 to i4) to i64 = 15
+ %p = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %2
+ ret float* %p
+}
+; CHECK-LABEL: @apint(
+; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* [[BASE_PTR]], i64 15
+
+; Do not trace into binary operators other than ADD, SUB, and OR.
+define float* @and(i64 %a) {
+entry:
+ %0 = shl i64 %a, 2
+ %1 = and i64 %0, 1
+ %p = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %1
+ ret float* %p
+}
+; CHECK-LABEL: @and(
+; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array
+; CHECK-NOT: getelementptr
+
+; If zext(a + b) <= the max signed value of typeof(a + b), then we can prove
+; a + b >= 0 and zext(a + b) == sext(a + b). If we can further prove that a or
+; b is non-negative, we have zext(a + b) == sext(a) + sext(b).
+define float* @inbounds_zext_add(i32 %i, i4 %j) {
+entry:
+ %0 = add i32 %i, 1
+ %1 = zext i32 %0 to i64
+ ; Because zext(i + 1) is an index of an inbounds GEP based on
+ ; float_2d_array, zext(i + 1) <= sizeof(float_2d_array) = 4096.
+ ; Furthermore, since typeof(i + 1) is i32 and 4096 < 2^31, we are sure the
+ ; sign bit of i + 1 is 0. This implies zext(i + 1) = sext(i + 1).
+ %2 = add i4 %j, 2
+ %3 = zext i4 %2 to i64
+ ; In this case, typeof(j + 2) is i4, so zext(j + 2) <= 4096 does not imply
+ ; the sign bit of j + 2 is 0.
+ %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3
+ ret float* %p
+}
+; CHECK-LABEL: @inbounds_zext_add(
+; CHECK-NOT: add
+; CHECK: add i4 %j, 2
+; CHECK: sext
+; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; CHECK: getelementptr float* %{{[a-zA-Z0-9]+}}, i64 32
diff --git a/test/Transforms/SimplifyCFG/PR17073.ll b/test/Transforms/SimplifyCFG/PR17073.ll
new file mode 100644
index 0000000..8dc9fb2
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PR17073.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+; In PR17073 ( http://llvm.org/pr17073 ), we illegally hoisted an operation that can trap.
+; The first test confirms that we don't do that when the trapping op is reached by the current BB (block1).
+; The second test confirms that we don't do that when the trapping op is reached by the previous BB (entry).
+; The third test confirms that we can still do this optimization for an operation (add) that doesn't trap.
+; The tests must be complicated enough to prevent previous SimplifyCFG actions from optimizing away
+; the instructions that we're checking for.
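+; (The trapping operation in question is the constant expression
+; urem (i64 2, zext (i1 icmp ...)): when the icmp folds to false the divisor
+; is zero, so hoisting the select that contains it would introduce a trap.)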
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+@a = common global i32 0, align 4
+@b = common global i8 0, align 1
+
+; CHECK-LABEL: can_trap1
+; CHECK-NOT: or i1 %tobool, icmp eq (i32* bitcast (i8* @b to i32*), i32* @a)
+; CHECK-NOT: select i1 %tobool, i32* null, i32* select (i1 icmp eq (i64 urem (i64 2, i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64)), i64 0), i32* null, i32* @a)
+define i32* @can_trap1() {
+entry:
+ %0 = load i32* @a, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %exit, label %block1
+
+block1:
+ br i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a), label %exit, label %block2
+
+block2:
+ br label %exit
+
+exit:
+ %storemerge = phi i32* [ null, %entry ],[ null, %block2 ], [ select (i1 icmp eq (i64 urem (i64 2, i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64)), i64 0), i32* null, i32* @a), %block1 ]
+ ret i32* %storemerge
+}
+
+; CHECK-LABEL: can_trap2
+; CHECK-NOT: or i1 %tobool, icmp eq (i32* bitcast (i8* @b to i32*), i32* @a)
+; CHECK-NOT: select i1 %tobool, i32* select (i1 icmp eq (i64 urem (i64 2, i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64)), i64 0), i32* null, i32* @a), i32* null
+define i32* @can_trap2() {
+entry:
+ %0 = load i32* @a, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %exit, label %block1
+
+block1:
+ br i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a), label %exit, label %block2
+
+block2:
+ br label %exit
+
+exit:
+ %storemerge = phi i32* [ select (i1 icmp eq (i64 urem (i64 2, i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64)), i64 0), i32* null, i32* @a), %entry ],[ null, %block2 ], [ null, %block1 ]
+ ret i32* %storemerge
+}
+
+; CHECK-LABEL: cannot_trap
+; CHECK: select i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a), i32* select (i1 icmp eq (i64 add (i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64), i64 2), i64 0), i32* null, i32* @a), i32* null
+define i32* @cannot_trap() {
+entry:
+ %0 = load i32* @a, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %exit, label %block1
+
+block1:
+ br i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a), label %exit, label %block2
+
+block2:
+ br label %exit
+
+exit:
+ %storemerge = phi i32* [ null, %entry ],[ null, %block2 ], [ select (i1 icmp eq (i64 add (i64 2, i64 zext (i1 icmp eq (i32* bitcast (i8* @b to i32*), i32* @a) to i64)), i64 0), i32* null, i32* @a), %block1 ]
+ ret i32* %storemerge
+}
diff --git a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
index 4d344fa..fa6a54e 100644
--- a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
+++ b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'Sparc' in targets:
+if not 'Sparc' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/SimplifyCFG/X86/lit.local.cfg
+++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 81079b1..51ced40 100644
--- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -918,3 +918,58 @@ return:
; CHECK: switch i32
; CHECK-NOT: @switch.table
}
+
+; Don't build tables for switches with TLS variables.
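+; (A lookup table would be a constant global array of these addresses, and a
+; TLS variable's address differs per thread, so it cannot appear in a constant
+; initializer.)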
+@tls_a = thread_local global i32 0
+@tls_b = thread_local global i32 0
+@tls_c = thread_local global i32 0
+@tls_d = thread_local global i32 0
+define i32* @tls(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %return
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+sw.bb1:
+ br label %return
+sw.bb2:
+ br label %return
+sw.default:
+ br label %return
+return:
+ %retval.0 = phi i32* [ @tls_d, %sw.default ], [ @tls_c, %sw.bb2 ], [ @tls_b, %sw.bb1 ], [ @tls_a, %entry ]
+ ret i32* %retval.0
+; CHECK-LABEL: @tls(
+; CHECK: switch i32
+; CHECK-NOT: @switch.table
+}
+
+; Don't build tables for switches with dllimport variables.
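+; (Likewise, a dllimport variable's address is only resolved through the
+; import table at load time, so it cannot seed a constant lookup table.)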
+@dllimport_a = external dllimport global [3x i32]
+@dllimport_b = external dllimport global [3x i32]
+@dllimport_c = external dllimport global [3x i32]
+@dllimport_d = external dllimport global [3x i32]
+define i32* @dllimport(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %return
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+sw.bb1:
+ br label %return
+sw.bb2:
+ br label %return
+sw.default:
+ br label %return
+return:
+ %retval.0 = phi i32* [ getelementptr inbounds ([3 x i32]* @dllimport_d, i32 0, i32 0), %sw.default ],
+ [ getelementptr inbounds ([3 x i32]* @dllimport_c, i32 0, i32 0), %sw.bb2 ],
+ [ getelementptr inbounds ([3 x i32]* @dllimport_b, i32 0, i32 0), %sw.bb1 ],
+ [ getelementptr inbounds ([3 x i32]* @dllimport_a, i32 0, i32 0), %entry ]
+ ret i32* %retval.0
+; CHECK-LABEL: @dllimport(
+; CHECK: switch i32
+; CHECK-NOT: @switch.table
+}
diff --git a/test/Transforms/SimplifyCFG/speculate-vector-ops.ll b/test/Transforms/SimplifyCFG/speculate-vector-ops.ll
new file mode 100644
index 0000000..91972eb
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/speculate-vector-ops.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+
+define i32 @speculate_vector_extract(i32 %d, <4 x i32> %v) #0 {
+; CHECK-LABEL: @speculate_vector_extract(
+; CHECK-NOT: br
+entry:
+ %conv = insertelement <4 x i32> undef, i32 %d, i32 0
+ %conv2 = insertelement <4 x i32> %conv, i32 %d, i32 1
+ %conv3 = insertelement <4 x i32> %conv2, i32 %d, i32 2
+ %conv4 = insertelement <4 x i32> %conv3, i32 %d, i32 3
+ %tmp6 = add nsw <4 x i32> %conv4, <i32 0, i32 -1, i32 -2, i32 -3>
+ %cmp = icmp eq <4 x i32> %tmp6, zeroinitializer
+ %cmp.ext = sext <4 x i1> %cmp to <4 x i32>
+ %tmp8 = extractelement <4 x i32> %cmp.ext, i32 0
+ %tobool = icmp eq i32 %tmp8, 0
+ br i1 %tobool, label %cond.else, label %cond.then
+
+return: ; preds = %cond.end28
+ ret i32 %cond32
+
+cond.then: ; preds = %entry
+ %tmp10 = extractelement <4 x i32> %v, i32 0
+ br label %cond.end
+
+cond.else: ; preds = %entry
+ %tmp12 = extractelement <4 x i32> %v, i32 3
+ br label %cond.end
+
+cond.end: ; preds = %cond.else, %cond.then
+ %cond = phi i32 [ %tmp10, %cond.then ], [ %tmp12, %cond.else ]
+ %tmp14 = extractelement <4 x i32> %cmp.ext, i32 1
+ %tobool15 = icmp eq i32 %tmp14, 0
+ br i1 %tobool15, label %cond.else17, label %cond.then16
+
+cond.then16: ; preds = %cond.end
+ %tmp20 = extractelement <4 x i32> %v, i32 1
+ br label %cond.end18
+
+cond.else17: ; preds = %cond.end
+ br label %cond.end18
+
+cond.end18: ; preds = %cond.else17, %cond.then16
+ %cond22 = phi i32 [ %tmp20, %cond.then16 ], [ %cond, %cond.else17 ]
+ %tmp24 = extractelement <4 x i32> %cmp.ext, i32 2
+ %tobool25 = icmp eq i32 %tmp24, 0
+ br i1 %tobool25, label %cond.else27, label %cond.then26
+
+cond.then26: ; preds = %cond.end18
+ %tmp30 = extractelement <4 x i32> %v, i32 2
+ br label %cond.end28
+
+cond.else27: ; preds = %cond.end18
+ br label %cond.end28
+
+cond.end28: ; preds = %cond.else27, %cond.then26
+ %cond32 = phi i32 [ %tmp30, %cond.then26 ], [ %cond22, %cond.else27 ]
+ br label %return
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/TailDup/X86/lit.local.cfg b/test/Transforms/TailDup/X86/lit.local.cfg
index ba763cf..e71f3cc 100644
--- a/test/Transforms/TailDup/X86/lit.local.cfg
+++ b/test/Transforms/TailDup/X86/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Transforms/TailDup/lit.local.cfg b/test/Transforms/TailDup/lit.local.cfg
index 19840aa..c8625f4 100644
--- a/test/Transforms/TailDup/lit.local.cfg
+++ b/test/Transforms/TailDup/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/Unit/lit.cfg b/test/Unit/lit.cfg
index 04e8830..e481dcc 100644
--- a/test/Unit/lit.cfg
+++ b/test/Unit/lit.cfg
@@ -35,6 +35,11 @@ for symbolizer in ['ASAN_SYMBOLIZER_PATH', 'MSAN_SYMBOLIZER_PATH']:
if symbolizer in os.environ:
config.environment[symbolizer] = os.environ[symbolizer]
+# Win32 seeks DLLs along %PATH%.
+if sys.platform in ['win32', 'cygwin'] and os.path.isdir(config.shlibdir):
+ config.environment['PATH'] = os.path.pathsep.join((
+ config.shlibdir, config.environment['PATH']))
+
###
# Check that the object root is known.
diff --git a/test/Verifier/alias.ll b/test/Verifier/alias.ll
index e3636bc..ff02a37 100644
--- a/test/Verifier/alias.ll
+++ b/test/Verifier/alias.ll
@@ -10,3 +10,18 @@ declare void @f()
@ga = alias i32* @g
; CHECK: Alias must point to a definition
; CHECK-NEXT: @ga
+
+
+@test2_a = alias i32* @test2_b
+@test2_b = alias i32* @test2_a
+; CHECK: Aliases cannot form a cycle
+; CHECK-NEXT: i32* @test2_a
+; CHECK-NEXT: Aliases cannot form a cycle
+; CHECK-NEXT: i32* @test2_b
+
+
+@test3_a = global i32 42
+@test3_b = alias weak i32* @test3_a
+@test3_c = alias i32* @test3_b
+; CHECK: Alias cannot point to a weak alias
+; CHECK-NEXT: i32* @test3_c
diff --git a/test/Verifier/bitcast-alias-address-space.ll b/test/Verifier/bitcast-alias-address-space.ll
new file mode 100644
index 0000000..d9794d9
--- /dev/null
+++ b/test/Verifier/bitcast-alias-address-space.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: invalid cast opcode for cast from 'i32 addrspace(2)*' to 'i32 addrspace(1)*'
+
+target datalayout = "e-p:32:32:32-p1:16:16:16-p2:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n8:16:32"
+
+
+@data = addrspace(2) global i32 27
+
+@illegal_alias_data = alias bitcast (i32 addrspace(2)* @data to i32 addrspace(1)*)
diff --git a/test/Verifier/comdat.ll b/test/Verifier/comdat.ll
new file mode 100644
index 0000000..ca47429
--- /dev/null
+++ b/test/Verifier/comdat.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+$v = comdat any
+@v = common global i32 0, comdat $v
+; CHECK: 'common' global may not be in a Comdat!
diff --git a/test/Verifier/comdat2.ll b/test/Verifier/comdat2.ll
new file mode 100644
index 0000000..23b6cee
--- /dev/null
+++ b/test/Verifier/comdat2.ll
@@ -0,0 +1,5 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+$v = comdat any
+@v = private global i32 0, comdat $v
+; CHECK: comdat global value has local linkage
diff --git a/test/Verifier/jumptable.ll b/test/Verifier/jumptable.ll
new file mode 100644
index 0000000..5f4cd3f
--- /dev/null
+++ b/test/Verifier/jumptable.ll
@@ -0,0 +1,9 @@
+; RUN: not llc <%s 2>&1 | FileCheck %s
+
+define i32 @f() jumptable {
+ ret i32 0
+}
+
+; CHECK: Attribute 'jumptable' requires 'unnamed_addr'
+; CHECK: i32 ()* @f
+; CHECK: LLVM ERROR: Broken function found, compilation aborted!
diff --git a/test/Verifier/range-1.ll b/test/Verifier/range-1.ll
index b6a75d1..f15ca3f 100644
--- a/test/Verifier/range-1.ll
+++ b/test/Verifier/range-1.ll
@@ -6,7 +6,7 @@ entry:
ret void
}
!0 = metadata !{i8 0, i8 1}
-; CHECK: Ranges are only for loads!
+; CHECK: Ranges are only for loads, calls and invokes!
; CHECK-NEXT: store i8 0, i8* %x, align 1, !range !0
define i8 @f2(i8* %x) {
diff --git a/test/Verifier/range-2.ll b/test/Verifier/range-2.ll
index 8d85d19..1d2e057 100644
--- a/test/Verifier/range-2.ll
+++ b/test/Verifier/range-2.ll
@@ -34,3 +34,33 @@ entry:
ret i8 %y
}
!4 = metadata !{i8 -1, i8 0, i8 1, i8 -2}
+
+; We can annotate the range of the return value of a CALL.
+define void @call_all(i8* %x) {
+entry:
+ %v1 = call i8 @f1(i8* %x), !range !0
+ %v2 = call i8 @f2(i8* %x), !range !1
+ %v3 = call i8 @f3(i8* %x), !range !2
+ %v4 = call i8 @f4(i8* %x), !range !3
+ %v5 = call i8 @f5(i8* %x), !range !4
+ ret void
+}
+
+; We can annotate the range of the return value of an INVOKE.
+define void @invoke_all(i8* %x) {
+entry:
+ %v1 = invoke i8 @f1(i8* %x) to label %cont unwind label %lpad, !range !0
+ %v2 = invoke i8 @f2(i8* %x) to label %cont unwind label %lpad, !range !1
+ %v3 = invoke i8 @f3(i8* %x) to label %cont unwind label %lpad, !range !2
+ %v4 = invoke i8 @f4(i8* %x) to label %cont unwind label %lpad, !range !3
+ %v5 = invoke i8 @f5(i8* %x) to label %cont unwind label %lpad, !range !4
+
+cont:
+ ret void
+
+lpad:
+ %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ filter [0 x i8*] zeroinitializer
+ ret void
+}
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/lit.cfg b/test/lit.cfg
index 2815a61..664d55f 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -243,7 +243,6 @@ for pattern in [r"\bbugpoint\b(?!-)",
r"\bmacho-dump\b",
NOJUNK + r"\bopt\b",
r"\bFileCheck\b",
- r"\bFileUpdate\b",
r"\bobj2yaml\b",
r"\byaml2obj\b",
# Handle these specially as they are strings searched
@@ -265,6 +264,10 @@ for pattern in [r"\bbugpoint\b(?!-)",
tool_path = llvm_tools_dir + '/' + tool_name
config.substitutions.append((pattern, tool_pipe + tool_path))
+### Targets
+
+config.targets = frozenset(config.targets_to_build.split())
+
### Features
# Shell execution
@@ -332,6 +335,10 @@ if 'darwin' == sys.platform:
config.available_features.add('fma3')
sysctl_cmd.wait()
+# .debug_frame is not emitted for targeting Windows x64.
+if not re.match(r'^x86_64.*-(mingw32|win32)', config.target_triple):
+ config.available_features.add('debug_frame')
+
# Check if we should use gmalloc.
use_gmalloc_str = lit_config.params.get('use_gmalloc', None)
if use_gmalloc_str is not None:
diff --git a/test/tools/llvm-cov/copy_block_helper.m b/test/tools/llvm-cov/copy_block_helper.m
index 1859b88..64973f1 100644
--- a/test/tools/llvm-cov/copy_block_helper.m
+++ b/test/tools/llvm-cov/copy_block_helper.m
@@ -29,4 +29,4 @@ void test(id x) { // GCOV: -: [[@LINE]]:void test
int main(int argc, const char *argv[]) { test(0); }
// llvm-cov doesn't work on big endian yet
-// XFAIL: powerpc64, s390x, mips-, mips64-, sparc
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc
diff --git a/test/tools/llvm-cov/llvm-cov.test b/test/tools/llvm-cov/llvm-cov.test
index 2345f8d..0d3eb6b 100644
--- a/test/tools/llvm-cov/llvm-cov.test
+++ b/test/tools/llvm-cov/llvm-cov.test
@@ -102,12 +102,12 @@ RUN: diff -aub test_no_gcda.cpp.gcov test.cpp.gcov
RUN: diff -aub test_no_gcda.h.gcov test.h.gcov
# Invalid gcno file.
-RUN: not llvm-cov test.c -gcno=test_read_fail.gcno
+RUN: llvm-cov test.c -gcno=test_read_fail.gcno
# Bad file checksum on gcda.
-RUN: not llvm-cov test.c -gcda=test_file_checksum_fail.gcda
+RUN: llvm-cov test.c -gcda=test_file_checksum_fail.gcda
# Bad function checksum on gcda
-RUN: not llvm-cov test.c -gcda=test_func_checksum_fail.gcda
+RUN: llvm-cov test.c -gcda=test_func_checksum_fail.gcda
-XFAIL: powerpc64, s390x, mips-, mips64-, sparc
+XFAIL: powerpc64-, s390x, mips-, mips64-, sparc
diff --git a/test/tools/llvm-cov/range_based_for.cpp b/test/tools/llvm-cov/range_based_for.cpp
index 61f60f6..3fdb244 100644
--- a/test/tools/llvm-cov/range_based_for.cpp
+++ b/test/tools/llvm-cov/range_based_for.cpp
@@ -26,4 +26,4 @@ int main(int argc, const char *argv[]) { // GCOV: 1: [[@LINE]]:int main(
} // GCOV: -: [[@LINE]]:}
// llvm-cov doesn't work on big endian yet
-// XFAIL: powerpc64, s390x, mips-, mips64-, sparc
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc
diff --git a/test/tools/llvm-objdump/lit.local.cfg b/test/tools/llvm-objdump/lit.local.cfg
index 19840aa..c8625f4 100644
--- a/test/tools/llvm-objdump/lit.local.cfg
+++ b/test/tools/llvm-objdump/lit.local.cfg
@@ -1,3 +1,2 @@
-targets = set(config.root.targets_to_build.split())
-if not 'X86' in targets:
+if not 'X86' in config.root.targets:
config.unsupported = True
diff --git a/test/tools/llvm-readobj/ARM/lit.local.cfg b/test/tools/llvm-readobj/ARM/lit.local.cfg
index 8a3ba96..98c6700 100644
--- a/test/tools/llvm-readobj/ARM/lit.local.cfg
+++ b/test/tools/llvm-readobj/ARM/lit.local.cfg
@@ -1,4 +1,3 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM' in targets:
+if not 'ARM' in config.root.targets:
config.unsupported = True
diff --git a/test/tools/llvm-readobj/Inputs/got-empty.exe.mipsel b/test/tools/llvm-readobj/Inputs/got-empty.exe.mipsel
new file mode 100755
index 0000000..b578745
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/got-empty.exe.mipsel
Binary files differ
diff --git a/test/tools/llvm-readobj/Inputs/got-tls.so.elf-mips64el b/test/tools/llvm-readobj/Inputs/got-tls.so.elf-mips64el
new file mode 100755
index 0000000..3afc567
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/got-tls.so.elf-mips64el
Binary files differ
diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test
new file mode 100644
index 0000000..2021587
--- /dev/null
+++ b/test/tools/llvm-readobj/mips-got.test
@@ -0,0 +1,331 @@
+RUN: llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips | \
+RUN: FileCheck %s -check-prefix GOT-OBJ
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips | \
+RUN: FileCheck %s -check-prefix GOT-EXE
+RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-so.mips | \
+RUN: FileCheck %s -check-prefix GOT-SO
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \
+RUN: FileCheck %s -check-prefix GOT-TLS
+RUN: llvm-readobj -mips-plt-got %p/Inputs/got-empty.exe.mipsel | \
+RUN: FileCheck %s -check-prefix GOT-EMPTY
+
+GOT-OBJ: Cannot find PLTGOT dynamic table tag.
+
+GOT-EXE: Primary GOT {
+GOT-EXE-NEXT: Canonical gp value: 0x418880
+GOT-EXE-NEXT: Reserved entries [
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x410890
+GOT-EXE-NEXT: Access: -32752
+GOT-EXE-NEXT: Initial: 0x0
+GOT-EXE-NEXT: Purpose: Lazy resolver
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x410894
+GOT-EXE-NEXT: Access: -32748
+GOT-EXE-NEXT: Initial: 0x80000000
+GOT-EXE-NEXT: Purpose: Module pointer (GNU extension)
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: ]
+GOT-EXE-NEXT: Local entries [
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x410898
+GOT-EXE-NEXT: Access: -32744
+GOT-EXE-NEXT: Initial: 0x400418
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x41089C
+GOT-EXE-NEXT: Access: -32740
+GOT-EXE-NEXT: Initial: 0x410840
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x4108A0
+GOT-EXE-NEXT: Access: -32736
+GOT-EXE-NEXT: Initial: 0x0
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: ]
+GOT-EXE-NEXT: Global entries [
+GOT-EXE-NEXT: Entry {
+GOT-EXE-NEXT: Address: 0x4108A4
+GOT-EXE-NEXT: Access: -32732
+GOT-EXE-NEXT: Initial: 0x0
+GOT-EXE-NEXT: Value: 0x0
+GOT-EXE-NEXT: Type: Function (0x2)
+GOT-EXE-NEXT: Section: Undefined (0x0)
+GOT-EXE-NEXT: Name: __gmon_start__@ (1)
+GOT-EXE-NEXT: }
+GOT-EXE-NEXT: ]
+GOT-EXE-NEXT: Number of TLS and multi-GOT entries: 0
+GOT-EXE-NEXT: }
+
+GOT-SO: Primary GOT {
+GOT-SO-NEXT: Canonical gp value: 0x188D0
+GOT-SO-NEXT: Reserved entries [
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108E0
+GOT-SO-NEXT: Access: -32752
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Purpose: Lazy resolver
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108E4
+GOT-SO-NEXT: Access: -32748
+GOT-SO-NEXT: Initial: 0x80000000
+GOT-SO-NEXT: Purpose: Module pointer (GNU extension)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: ]
+GOT-SO-NEXT: Local entries [
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108E8
+GOT-SO-NEXT: Access: -32744
+GOT-SO-NEXT: Initial: 0x108E0
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108EC
+GOT-SO-NEXT: Access: -32740
+GOT-SO-NEXT: Initial: 0x10000
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108F0
+GOT-SO-NEXT: Access: -32736
+GOT-SO-NEXT: Initial: 0x10920
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108F4
+GOT-SO-NEXT: Access: -32732
+GOT-SO-NEXT: Initial: 0x108CC
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108F8
+GOT-SO-NEXT: Access: -32728
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x108FC
+GOT-SO-NEXT: Access: -32724
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10900
+GOT-SO-NEXT: Access: -32720
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10904
+GOT-SO-NEXT: Access: -32716
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: }
+GOT-SO-NEXT: ]
+GOT-SO-NEXT: Global entries [
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10908
+GOT-SO-NEXT: Access: -32712
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Value: 0x0
+GOT-SO-NEXT: Type: None (0x0)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: _ITM_registerTMCloneTable@ (87)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x1090C
+GOT-SO-NEXT: Access: -32708
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Value: 0x0
+GOT-SO-NEXT: Type: None (0x0)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: _Jv_RegisterClasses@ (128)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10910
+GOT-SO-NEXT: Access: -32704
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Value: 0x0
+GOT-SO-NEXT: Type: Function (0x2)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: __gmon_start__@ (23)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10914
+GOT-SO-NEXT: Access: -32700
+GOT-SO-NEXT: Initial: 0x840
+GOT-SO-NEXT: Value: 0x840
+GOT-SO-NEXT: Type: Function (0x2)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: puts@GLIBC_2.0 (162)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x10918
+GOT-SO-NEXT: Access: -32696
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Value: 0x0
+GOT-SO-NEXT: Type: None (0x0)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: _ITM_deregisterTMCloneTable@ (59)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: Entry {
+GOT-SO-NEXT: Address: 0x1091C
+GOT-SO-NEXT: Access: -32692
+GOT-SO-NEXT: Initial: 0x0
+GOT-SO-NEXT: Value: 0x0
+GOT-SO-NEXT: Type: Function (0x2)
+GOT-SO-NEXT: Section: Undefined (0x0)
+GOT-SO-NEXT: Name: __cxa_finalize@GLIBC_2.2 (113)
+GOT-SO-NEXT: }
+GOT-SO-NEXT: ]
+GOT-SO-NEXT: Number of TLS and multi-GOT entries: 0
+GOT-SO-NEXT: }
+
+GOT-TLS: Primary GOT {
+GOT-TLS-NEXT: Canonical gp value: 0x18BF0
+GOT-TLS-NEXT: Reserved entries [
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C00
+GOT-TLS-NEXT: Access: -32752
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Purpose: Lazy resolver
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C08
+GOT-TLS-NEXT: Access: -32744
+GOT-TLS-NEXT: Initial: 0x8000000000000000
+GOT-TLS-NEXT: Purpose: Module pointer (GNU extension)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: ]
+GOT-TLS-NEXT: Local entries [
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C10
+GOT-TLS-NEXT: Access: -32736
+GOT-TLS-NEXT: Initial: 0x10000
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C18
+GOT-TLS-NEXT: Access: -32728
+GOT-TLS-NEXT: Initial: 0x10C00
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C20
+GOT-TLS-NEXT: Access: -32720
+GOT-TLS-NEXT: Initial: 0x10CB8
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C28
+GOT-TLS-NEXT: Access: -32712
+GOT-TLS-NEXT: Initial: 0x10BF0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C30
+GOT-TLS-NEXT: Access: -32704
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C38
+GOT-TLS-NEXT: Access: -32696
+GOT-TLS-NEXT: Initial: 0x948
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C40
+GOT-TLS-NEXT: Access: -32688
+GOT-TLS-NEXT: Initial: 0xA20
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C48
+GOT-TLS-NEXT: Access: -32680
+GOT-TLS-NEXT: Initial: 0xAF0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C50
+GOT-TLS-NEXT: Access: -32672
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C58
+GOT-TLS-NEXT: Access: -32664
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C60
+GOT-TLS-NEXT: Access: -32656
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: ]
+GOT-TLS-NEXT: Global entries [
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C68
+GOT-TLS-NEXT: Access: -32648
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Value: 0x0
+GOT-TLS-NEXT: Type: None (0x0)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: _ITM_registerTMCloneTable@ (78)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C70
+GOT-TLS-NEXT: Access: -32640
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Value: 0x0
+GOT-TLS-NEXT: Type: None (0x0)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: _Jv_RegisterClasses@ (119)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C78
+GOT-TLS-NEXT: Access: -32632
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Value: 0x0
+GOT-TLS-NEXT: Type: Function (0x2)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: __gmon_start__@ (23)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C80
+GOT-TLS-NEXT: Access: -32624
+GOT-TLS-NEXT: Initial: 0xB60
+GOT-TLS-NEXT: Value: 0xB60
+GOT-TLS-NEXT: Type: Function (0x2)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: __tls_get_addr@GLIBC_2.3 (150)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C88
+GOT-TLS-NEXT: Access: -32616
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Value: 0x0
+GOT-TLS-NEXT: Type: None (0x0)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: _ITM_deregisterTMCloneTable@ (50)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: Entry {
+GOT-TLS-NEXT: Address: 0x10C90
+GOT-TLS-NEXT: Access: -32608
+GOT-TLS-NEXT: Initial: 0x0
+GOT-TLS-NEXT: Value: 0x0
+GOT-TLS-NEXT: Type: Function (0x2)
+GOT-TLS-NEXT: Section: Undefined (0x0)
+GOT-TLS-NEXT: Name: __cxa_finalize@GLIBC_2.2 (104)
+GOT-TLS-NEXT: }
+GOT-TLS-NEXT: ]
+GOT-TLS-NEXT: Number of TLS and multi-GOT entries: 4
+GOT-TLS-NEXT: }
+
+GOT-EMPTY: Primary GOT {
+GOT-EMPTY-NEXT: Canonical gp value: 0x409FF0
+GOT-EMPTY-NEXT: Reserved entries [
+GOT-EMPTY-NEXT: Entry {
+GOT-EMPTY-NEXT: Address: 0x402000
+GOT-EMPTY-NEXT: Access: -32752
+GOT-EMPTY-NEXT: Initial: 0x0
+GOT-EMPTY-NEXT: Purpose: Lazy resolver
+GOT-EMPTY-NEXT: }
+GOT-EMPTY-NEXT: Entry {
+GOT-EMPTY-NEXT: Address: 0x402004
+GOT-EMPTY-NEXT: Access: -32748
+GOT-EMPTY-NEXT: Initial: 0x80000000
+GOT-EMPTY-NEXT: Purpose: Module pointer (GNU extension)
+GOT-EMPTY-NEXT: }
+GOT-EMPTY-NEXT: ]
+GOT-EMPTY-NEXT: Local entries [
+GOT-EMPTY-NEXT: ]
+GOT-EMPTY-NEXT: Global entries [
+GOT-EMPTY-NEXT: ]
+GOT-EMPTY-NEXT: Number of TLS and multi-GOT entries: 2
+GOT-EMPTY-NEXT: }
diff --git a/test/tools/llvm-readobj/program-headers.test b/test/tools/llvm-readobj/program-headers.test
index 7c22f2b..f014c03 100644
--- a/test/tools/llvm-readobj/program-headers.test
+++ b/test/tools/llvm-readobj/program-headers.test
@@ -4,6 +4,8 @@ RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.elf-x8
RUN: | FileCheck %s -check-prefix ELF-X86-64
RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.mips \
RUN: | FileCheck %s -check-prefix ELF-MIPS
+RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.mips64 \
+RUN: | FileCheck %s -check-prefix ELF-MIPS64
ELF-I386: ProgramHeaders [
ELF-I386-NEXT: ProgramHeader {
@@ -75,7 +77,11 @@ ELF-X86-64-NEXT: Alignment: 8
ELF-X86-64-NEXT: }
ELF-X86-64-NEXT: ]
-ELF-MIPS: ProgramHeaders [
+ELF-MIPS: Format: ELF32-mips
+ELF-MIPS-NEXT: Arch: mips
+ELF-MIPS-NEXT: AddressSize: 32bit
+ELF-MIPS-NEXT: LoadName:
+ELF-MIPS-NEXT: ProgramHeaders [
ELF-MIPS-NEXT: ProgramHeader {
ELF-MIPS-NEXT: Type: PT_MIPS_REGINFO (0x70000000)
ELF-MIPS-NEXT: Offset: 0x74
@@ -102,3 +108,23 @@ ELF-MIPS-NEXT: ]
ELF-MIPS-NEXT: Alignment: 65536
ELF-MIPS-NEXT: }
ELF-MIPS-NEXT: ]
+
+ELF-MIPS64: Format: ELF64-mips
+ELF-MIPS64-NEXT: Arch: mips64
+ELF-MIPS64-NEXT: AddressSize: 64bit
+ELF-MIPS64-NEXT: LoadName:
+ELF-MIPS64-NEXT: ProgramHeaders [
+ELF-MIPS64-NEXT: ProgramHeader {
+ELF-MIPS64-NEXT: Type: PT_LOAD (0x1)
+ELF-MIPS64-NEXT: Offset: 0x0
+ELF-MIPS64-NEXT: VirtualAddress: 0x120000000
+ELF-MIPS64-NEXT: PhysicalAddress: 0x120000000
+ELF-MIPS64-NEXT: FileSize: 136
+ELF-MIPS64-NEXT: MemSize: 136
+ELF-MIPS64-NEXT: Flags [ (0x5)
+ELF-MIPS64-NEXT: PF_R (0x4)
+ELF-MIPS64-NEXT: PF_X (0x1)
+ELF-MIPS64-NEXT: ]
+ELF-MIPS64-NEXT: Alignment: 65536
+ELF-MIPS64-NEXT: }
+ELF-MIPS64-NEXT: ]
diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index 3a87ff5..864ded3 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test
@@ -34,8 +34,8 @@ MACHO-I386: Relocations [
MACHO-I386-NEXT: Section __text {
MACHO-I386-NEXT: 0x18 1 2 1 GENERIC_RELOC_VANILLA 0 _SomeOtherFunction
MACHO-I386-NEXT: 0x13 1 2 1 GENERIC_RELOC_VANILLA 0 _puts
-MACHO-I386-NEXT: 0xB 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 -
-MACHO-I386-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 -
+MACHO-I386-NEXT: 0xB 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x22
+MACHO-I386-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x8
MACHO-I386-NEXT: }
MACHO-I386-NEXT: ]
@@ -49,17 +49,17 @@ MACHO-X86-64-NEXT:]
MACHO-PPC: Relocations [
MACHO-PPC-NEXT: Section __text {
-MACHO-PPC-NEXT: 0x24 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0x1C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x58 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0x18 1 2 0 PPC_RELOC_BR24 0 -
+MACHO-PPC-NEXT: 0x24 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x64
+MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0xC
+MACHO-PPC-NEXT: 0x1C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x64
+MACHO-PPC-NEXT: 0x58 0 2 n/a PPC_RELOC_PAIR 1 0xC
+MACHO-PPC-NEXT: 0x18 1 2 0 PPC_RELOC_BR24 0 0x2
MACHO-PPC-NEXT: }
MACHO-PPC-NEXT: Section __picsymbolstub1 {
-MACHO-PPC-NEXT: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x20 0 2 n/a PPC_RELOC_PAIR 1 -
+MACHO-PPC-NEXT: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x68
+MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x48
+MACHO-PPC-NEXT: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x68
+MACHO-PPC-NEXT: 0x20 0 2 n/a PPC_RELOC_PAIR 1 0x48
MACHO-PPC-NEXT: }
MACHO-PPC-NEXT: Section __la_symbol_ptr {
MACHO-PPC-NEXT: 0x0 0 2 1 PPC_RELOC_VANILLA 0 dyld_stub_binding_helper
@@ -68,17 +68,17 @@ MACHO-PPC-NEXT: ]
MACHO-PPC64: Relocations [
MACHO-PPC64-NEXT: Section __text {
-MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x1C 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x58 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x18 1 2 0 0 -
+MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 0x64
+MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 0xC
+MACHO-PPC64-NEXT: 0x1C 0 2 n/a 1 0x64
+MACHO-PPC64-NEXT: 0x58 0 2 n/a 1 0xC
+MACHO-PPC64-NEXT: 0x18 1 2 0 0 0x2
MACHO-PPC64-NEXT: }
MACHO-PPC64-NEXT: Section __picsymbolstub1 {
-MACHO-PPC64-NEXT: 0x14 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0xC 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 -
+MACHO-PPC64-NEXT: 0x14 0 2 n/a 1 0x6C
+MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 0x48
+MACHO-PPC64-NEXT: 0xC 0 2 n/a 1 0x6C
+MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 0x48
MACHO-PPC64-NEXT: }
MACHO-PPC64-NEXT: Section __la_symbol_ptr {
MACHO-PPC64-NEXT: 0x0 0 3 1 0 dyld_stub_binding_helper
@@ -94,7 +94,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_SECTDIFF (2)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x40
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -103,7 +103,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x28
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -130,7 +130,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 1
MACHO-ARM-NEXT: Extern: 0
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0xFFFFFF
MACHO-ARM-NEXT: Scattered: 0
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -148,7 +148,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 0
MACHO-ARM-NEXT: Extern: 0
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0xFFFFFF
MACHO-ARM-NEXT: Scattered: 0
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -157,7 +157,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_SECTDIFF (2)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x44
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -166,7 +166,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x4
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: }
diff --git a/test/tools/llvm-readobj/sections-ext.test b/test/tools/llvm-readobj/sections-ext.test
index 0f7ce26..972d8e6 100644
--- a/test/tools/llvm-readobj/sections-ext.test
+++ b/test/tools/llvm-readobj/sections-ext.test
@@ -183,8 +183,8 @@ MACHO-I386-NEXT: Reserved2: 0x0
MACHO-I386-NEXT: Relocations [
MACHO-I386-NEXT: 0x18 1 2 1 GENERIC_RELOC_VANILLA 0 _SomeOtherFunction
MACHO-I386-NEXT: 0x13 1 2 1 GENERIC_RELOC_VANILLA 0 _puts
-MACHO-I386-NEXT: 0xB 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 -
-MACHO-I386-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 -
+MACHO-I386-NEXT: 0xB 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x22
+MACHO-I386-NEXT: 0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x8
MACHO-I386-NEXT: ]
MACHO-I386-NEXT: Symbols [
MACHO-I386-NEXT: Symbol {
@@ -299,11 +299,11 @@ MACHO-PPC-NEXT: ]
MACHO-PPC-NEXT: Reserved1: 0x0
MACHO-PPC-NEXT: Reserved2: 0x0
MACHO-PPC-NEXT: Relocations [
-MACHO-PPC-NEXT: 0x24 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0x1C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x58 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0x18 1 2 0 PPC_RELOC_BR24 0 -
+MACHO-PPC-NEXT: 0x24 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x64
+MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0xC
+MACHO-PPC-NEXT: 0x1C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x64
+MACHO-PPC-NEXT: 0x58 0 2 n/a PPC_RELOC_PAIR 1 0xC
+MACHO-PPC-NEXT: 0x18 1 2 0 PPC_RELOC_BR24 0 0x2
MACHO-PPC-NEXT: ]
MACHO-PPC-NEXT: Symbols [
MACHO-PPC-NEXT: Symbol {
@@ -342,10 +342,10 @@ MACHO-PPC-NEXT: ]
MACHO-PPC-NEXT: Reserved1: 0x0
MACHO-PPC-NEXT: Reserved2: 0x20
MACHO-PPC-NEXT: Relocations [
-MACHO-PPC-NEXT: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 -
-MACHO-PPC-NEXT: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 -
-MACHO-PPC-NEXT: 0x20 0 2 n/a PPC_RELOC_PAIR 1 -
+MACHO-PPC-NEXT: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 0x68
+MACHO-PPC-NEXT: 0x0 0 2 n/a PPC_RELOC_PAIR 1 0x48
+MACHO-PPC-NEXT: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 0x68
+MACHO-PPC-NEXT: 0x20 0 2 n/a PPC_RELOC_PAIR 1 0x48
MACHO-PPC-NEXT: ]
MACHO-PPC-NEXT: Symbols [
MACHO-PPC-NEXT: ]
@@ -456,11 +456,11 @@ MACHO-PPC64-NEXT: ]
MACHO-PPC64-NEXT: Reserved1: 0x0
MACHO-PPC64-NEXT: Reserved2: 0x0
MACHO-PPC64-NEXT: Relocations [
-MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x1C 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x58 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x18 1 2 0 0 -
+MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 0x64
+MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 0xC
+MACHO-PPC64-NEXT: 0x1C 0 2 n/a 1 0x64
+MACHO-PPC64-NEXT: 0x58 0 2 n/a 1 0xC
+MACHO-PPC64-NEXT: 0x18 1 2 0 0 0x2
MACHO-PPC64-NEXT: ]
MACHO-PPC64-NEXT: Symbols [
MACHO-PPC64-NEXT: Symbol {
@@ -499,10 +499,10 @@ MACHO-PPC64-NEXT: ]
MACHO-PPC64-NEXT: Reserved1: 0x0
MACHO-PPC64-NEXT: Reserved2: 0x20
MACHO-PPC64-NEXT: Relocations [
-MACHO-PPC64-NEXT: 0x14 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0xC 0 2 n/a 1 -
-MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 -
+MACHO-PPC64-NEXT: 0x14 0 2 n/a 1 0x6C
+MACHO-PPC64-NEXT: 0x0 0 2 n/a 1 0x48
+MACHO-PPC64-NEXT: 0xC 0 2 n/a 1 0x6C
+MACHO-PPC64-NEXT: 0x24 0 2 n/a 1 0x48
MACHO-PPC64-NEXT: ]
MACHO-PPC64-NEXT: Symbols [
MACHO-PPC64-NEXT: ]
@@ -618,7 +618,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_SECTDIFF (2)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x40
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -627,7 +627,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x28
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -654,7 +654,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 1
MACHO-ARM-NEXT: Extern: 0
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0xFFFFFF
MACHO-ARM-NEXT: Scattered: 0
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -672,7 +672,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 0
MACHO-ARM-NEXT: Extern: 0
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0xFFFFFF
MACHO-ARM-NEXT: Scattered: 0
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -681,7 +681,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_SECTDIFF (2)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x44
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: Relocation {
@@ -690,7 +690,7 @@ MACHO-ARM-NEXT: PCRel: 0
MACHO-ARM-NEXT: Length: 2
MACHO-ARM-NEXT: Extern: N/A
MACHO-ARM-NEXT: Type: ARM_RELOC_PAIR (1)
-MACHO-ARM-NEXT: Symbol: -
+MACHO-ARM-NEXT: Symbol: 0x4
MACHO-ARM-NEXT: Scattered: 1
MACHO-ARM-NEXT: }
MACHO-ARM-NEXT: ]
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 13b7f5a..846ad1e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -49,7 +49,7 @@ add_llvm_tool_subdirectory(llvm-c-test)
add_llvm_tool_subdirectory(obj2yaml)
add_llvm_tool_subdirectory(yaml2obj)
-if( NOT CYGWIN )
+if(NOT CYGWIN AND LLVM_ENABLE_PIC)
add_llvm_tool_subdirectory(lto)
add_llvm_tool_subdirectory(llvm-lto)
else()
diff --git a/tools/Makefile b/tools/Makefile
index 2b8c32e..97ad99a 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -19,9 +19,9 @@ else
OPTIONAL_PARALLEL_DIRS := clang
endif
-# Build LLDB if present. Note LLDB must be built last as it depends on the
-# wider LLVM infrastructure (including Clang).
-OPTIONAL_DIRS := lldb
+# Build LLD and LLDB if present. Note LLDB must be built last as it depends on
+# the wider LLVM infrastructure (including Clang).
+OPTIONAL_DIRS := lld lldb
# NOTE: The tools are organized into five groups of four consisting of one
# large and three small executables. This is done to minimize memory load
diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp
index 5ed7d2c..25813b3 100644
--- a/tools/bugpoint/ExecutionDriver.cpp
+++ b/tools/bugpoint/ExecutionDriver.cpp
@@ -267,7 +267,7 @@ void BugDriver::compileProgram(Module *M, std::string *Error) const {
// Emit the program to a bitcode file...
SmallString<128> BitcodeFile;
int BitcodeFD;
- error_code EC = sys::fs::createUniqueFile(
+ std::error_code EC = sys::fs::createUniqueFile(
OutputPrefix + "-test-program-%%%%%%%.bc", BitcodeFD, BitcodeFile);
if (EC) {
errs() << ToolName << ": Error making unique filename: " << EC.message()
@@ -305,7 +305,7 @@ std::string BugDriver::executeProgram(const Module *Program,
// Emit the program to a bitcode file...
SmallString<128> UniqueFilename;
int UniqueFD;
- error_code EC = sys::fs::createUniqueFile(
+ std::error_code EC = sys::fs::createUniqueFile(
OutputPrefix + "-test-program-%%%%%%%.bc", UniqueFD, UniqueFilename);
if (EC) {
errs() << ToolName << ": Error making unique filename: "
@@ -331,7 +331,7 @@ std::string BugDriver::executeProgram(const Module *Program,
// Check to see if this is a valid output filename...
SmallString<128> UniqueFile;
- error_code EC = sys::fs::createUniqueFile(OutputFile, UniqueFile);
+ std::error_code EC = sys::fs::createUniqueFile(OutputFile, UniqueFile);
if (EC) {
errs() << ToolName << ": Error making unique filename: "
<< EC.message() << "\n";
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 38cdf24..4fb6856 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -366,7 +366,7 @@ Module *BugDriver::ExtractMappedBlocksFromModule(const
Module *M) {
SmallString<128> Filename;
int FD;
- error_code EC = sys::fs::createUniqueFile(
+ std::error_code EC = sys::fs::createUniqueFile(
OutputPrefix + "-extractblocks%%%%%%%", FD, Filename);
if (EC) {
outs() << "*** Basic Block extraction failed!\n";
diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index f5936ac..3f1f84e 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp
@@ -964,8 +964,8 @@ static bool TestCodeGenerator(BugDriver &BD, Module *Test, Module *Safe,
SmallString<128> TestModuleBC;
int TestModuleFD;
- error_code EC = sys::fs::createTemporaryFile("bugpoint.test", "bc",
- TestModuleFD, TestModuleBC);
+ std::error_code EC = sys::fs::createTemporaryFile("bugpoint.test", "bc",
+ TestModuleFD, TestModuleBC);
if (EC) {
errs() << BD.getToolName() << "Error making unique filename: "
<< EC.message() << "\n";
@@ -1058,8 +1058,8 @@ bool BugDriver::debugCodeGenerator(std::string *Error) {
SmallString<128> TestModuleBC;
int TestModuleFD;
- error_code EC = sys::fs::createTemporaryFile("bugpoint.test", "bc",
- TestModuleFD, TestModuleBC);
+ std::error_code EC = sys::fs::createTemporaryFile("bugpoint.test", "bc",
+ TestModuleFD, TestModuleBC);
if (EC) {
errs() << getToolName() << "Error making unique filename: "
<< EC.message() << "\n";
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index b2722e6..d452fd9 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -129,7 +129,7 @@ bool BugDriver::runPasses(Module *Program,
// setup the output file name
outs().flush();
SmallString<128> UniqueFilename;
- error_code EC = sys::fs::createUniqueFile(
+ std::error_code EC = sys::fs::createUniqueFile(
OutputPrefix + "-output-%%%%%%%.bc", UniqueFilename);
if (EC) {
errs() << getToolName() << ": Error making unique filename: "
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index c481b03..4a2401b 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -142,7 +142,7 @@ static std::string ProcessFailure(StringRef ProgPath, const char** Args,
// Rerun the compiler, capturing any error messages to print them.
SmallString<128> ErrorFilename;
int ErrorFD;
- error_code EC = sys::fs::createTemporaryFile(
+ std::error_code EC = sys::fs::createTemporaryFile(
"bugpoint.program_error_messages", "", ErrorFD, ErrorFilename);
if (EC) {
errs() << "Error making unique filename: " << EC.message() << "\n";
@@ -478,7 +478,7 @@ GCC::FileType LLC::OutputCode(const std::string &Bitcode,
const char *Suffix = (UseIntegratedAssembler ? ".llc.o" : ".llc.s");
SmallString<128> UniqueFile;
- error_code EC =
+ std::error_code EC =
sys::fs::createUniqueFile(Bitcode + "-%%%%%%%" + Suffix, UniqueFile);
if (EC) {
errs() << "Error making unique filename: " << EC.message() << "\n";
@@ -715,7 +715,7 @@ int GCC::ExecuteProgram(const std::string &ProgramFile,
GCCArgs.push_back("-o");
SmallString<128> OutputBinary;
- error_code EC =
+ std::error_code EC =
sys::fs::createUniqueFile(ProgramFile + "-%%%%%%%.gcc.exe", OutputBinary);
if (EC) {
errs() << "Error making unique filename: " << EC.message() << "\n";
@@ -825,7 +825,7 @@ int GCC::MakeSharedObject(const std::string &InputFile, FileType fileType,
const std::vector<std::string> &ArgsForGCC,
std::string &Error) {
SmallString<128> UniqueFilename;
- error_code EC = sys::fs::createUniqueFile(
+ std::error_code EC = sys::fs::createUniqueFile(
InputFile + "-%%%%%%%" + LTDL_SHLIB_EXT, UniqueFilename);
if (EC) {
errs() << "Error making unique filename: " << EC.message() << "\n";
diff --git a/tools/gold/CMakeLists.txt b/tools/gold/CMakeLists.txt
index 07a1e28..3864e15 100644
--- a/tools/gold/CMakeLists.txt
+++ b/tools/gold/CMakeLists.txt
@@ -14,13 +14,14 @@ else()
# ABI compatibility.
add_definitions( -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 )
- set(LLVM_LINK_COMPONENTS support)
+ set(LLVM_LINK_COMPONENTS
+ ${LLVM_TARGETS_TO_BUILD}
+ LTO
+ )
add_llvm_loadable_module(LLVMgold
gold-plugin.cpp
)
- target_link_libraries(LLVMgold ${cmake_2_8_12_PRIVATE} LTO)
-
endif()
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 496e31c..593d8ea 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -9,7 +9,6 @@
LEVEL := ../..
LIBRARYNAME := LLVMgold
-LINK_COMPONENTS := support
LINK_LIBS_IN_SHARED := 1
SHARED_LIBRARY := 1
LOADABLE_MODULE := 1
@@ -21,6 +20,8 @@ EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/gold.exports
# early so we can set up LINK_COMPONENTS before including Makefile.rules
include $(LEVEL)/Makefile.config
+LINK_COMPONENTS := $(TARGETS_TO_BUILD) LTO
+
# Because off_t is used in the public API, the largefile parts are required for
# ABI compatibility.
CXXFLAGS += -I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
@@ -28,4 +29,3 @@ LDFLAGS += -L$(SharedLibDir)/$(SharedPrefix)
include $(LEVEL)/Makefile.common
-LIBS += -lLTO
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 4726d82..b908510 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -15,19 +15,22 @@
#include "llvm/Config/config.h" // plugin-api.h requires HAVE_STDINT_H
#include "llvm-c/lto.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/Errno.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/system_error.h"
#include <cerrno>
#include <cstdlib>
#include <cstring>
-#include <fstream>
#include <list>
#include <plugin-api.h>
+#include <system_error>
#include <vector>
// Support Windows/MinGW craziness.
@@ -46,36 +49,32 @@
using namespace llvm;
namespace {
- ld_plugin_status discard_message(int level, const char *format, ...) {
- // Die loudly. Recent versions of Gold pass ld_plugin_message as the first
- // callback in the transfer vector. This should never be called.
- abort();
- }
+struct claimed_file {
+ void *handle;
+ std::vector<ld_plugin_symbol> syms;
+};
+}
- ld_plugin_add_symbols add_symbols = NULL;
- ld_plugin_get_symbols get_symbols = NULL;
- ld_plugin_add_input_file add_input_file = NULL;
- ld_plugin_add_input_library add_input_library = NULL;
- ld_plugin_set_extra_library_path set_extra_library_path = NULL;
- ld_plugin_get_view get_view = NULL;
- ld_plugin_message message = discard_message;
-
- int api_version = 0;
- int gold_version = 0;
-
- struct claimed_file {
- void *handle;
- std::vector<ld_plugin_symbol> syms;
- };
-
- lto_codegen_model output_type = LTO_CODEGEN_PIC_MODEL_STATIC;
- std::string output_name = "";
- std::list<claimed_file> Modules;
- std::vector<std::string> Cleanup;
- lto_code_gen_t code_gen = NULL;
- StringSet<> CannotBeHidden;
+static ld_plugin_status discard_message(int level, const char *format, ...) {
+ // Die loudly. Recent versions of Gold pass ld_plugin_message as the first
+ // callback in the transfer vector. This should never be called.
+ abort();
}
+static ld_plugin_add_symbols add_symbols = NULL;
+static ld_plugin_get_symbols get_symbols = NULL;
+static ld_plugin_add_input_file add_input_file = NULL;
+static ld_plugin_set_extra_library_path set_extra_library_path = NULL;
+static ld_plugin_get_view get_view = NULL;
+static ld_plugin_message message = discard_message;
+static lto_codegen_model output_type = LTO_CODEGEN_PIC_MODEL_STATIC;
+static std::string output_name = "";
+static std::list<claimed_file> Modules;
+static std::vector<std::string> Cleanup;
+static LTOCodeGenerator *CodeGen = nullptr;
+static StringSet<> CannotBeHidden;
+static llvm::TargetOptions TargetOpts;
+
namespace options {
enum generate_bc { BC_NO, BC_ALSO, BC_ONLY };
static bool generate_api_file = false;
@@ -135,6 +134,12 @@ static ld_plugin_status cleanup_hook(void);
extern "C" ld_plugin_status onload(ld_plugin_tv *tv);
ld_plugin_status onload(ld_plugin_tv *tv) {
+ InitializeAllTargetInfos();
+ InitializeAllTargets();
+ InitializeAllTargetMCs();
+ InitializeAllAsmParsers();
+ InitializeAllAsmPrinters();
+
// We're given a pointer to the first transfer vector. We read through them
// until we find one where tv_tag == LDPT_NULL. The REGISTER_* tagged values
// contain pointers to functions that we need to call to register our own
@@ -142,15 +147,10 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
// for services.
bool registeredClaimFile = false;
+ bool RegisteredAllSymbolsRead = false;
for (; tv->tv_tag != LDPT_NULL; ++tv) {
switch (tv->tv_tag) {
- case LDPT_API_VERSION:
- api_version = tv->tv_u.tv_val;
- break;
- case LDPT_GOLD_VERSION: // major * 100 + minor
- gold_version = tv->tv_u.tv_val;
- break;
case LDPT_OUTPUT_NAME:
output_name = tv->tv_u.tv_string;
break;
@@ -169,8 +169,6 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
tv->tv_u.tv_val);
return LDPS_ERR;
}
- // TODO: add an option to disable PIC.
- //output_type = LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC;
break;
case LDPT_OPTION:
options::process_plugin_option(tv->tv_u.tv_string);
@@ -191,7 +189,7 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
if ((*callback)(all_symbols_read_hook) != LDPS_OK)
return LDPS_ERR;
- code_gen = lto_codegen_create();
+ RegisteredAllSymbolsRead = true;
} break;
case LDPT_REGISTER_CLEANUP_HOOK: {
ld_plugin_register_cleanup callback;
@@ -209,9 +207,6 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
case LDPT_ADD_INPUT_FILE:
add_input_file = tv->tv_u.tv_add_input_file;
break;
- case LDPT_ADD_INPUT_LIBRARY:
- add_input_library = tv->tv_u.tv_add_input_file;
- break;
case LDPT_SET_EXTRA_LIBRARY_PATH:
set_extra_library_path = tv->tv_u.tv_set_extra_library_path;
break;
@@ -235,15 +230,41 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
return LDPS_ERR;
}
+ if (!RegisteredAllSymbolsRead)
+ return LDPS_OK;
+
+ CodeGen = new LTOCodeGenerator();
+
+ // Pass through extra options to the code generator.
+ if (!options::extra.empty()) {
+ for (std::vector<std::string>::iterator it = options::extra.begin();
+ it != options::extra.end(); ++it) {
+ CodeGen->setCodeGenDebugOptions((*it).c_str());
+ }
+ }
+
+ CodeGen->parseCodeGenDebugOptions();
+ if (MAttrs.size()) {
+ std::string Attrs;
+ for (unsigned I = 0; I < MAttrs.size(); ++I) {
+ if (I > 0)
+ Attrs.append(",");
+ Attrs.append(MAttrs[I]);
+ }
+ CodeGen->setAttr(Attrs.c_str());
+ }
+
+ TargetOpts = InitTargetOptionsFromCodeGenFlags();
+ CodeGen->setTargetOptions(TargetOpts);
+
return LDPS_OK;
}
-/// claim_file_hook - called by gold to see whether this file is one that
-/// our plugin can handle. We'll try to open it and register all the symbols
-/// with add_symbol if possible.
+/// Called by gold to see whether this file is one that our plugin can handle.
+/// We'll try to open it and register all the symbols with add_symbol if
+/// possible.
static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
int *claimed) {
- lto_module_t M;
const void *view;
std::unique_ptr<MemoryBuffer> buffer;
if (get_view) {
@@ -258,25 +279,27 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
if (file->offset) {
offset = file->offset;
}
- if (error_code ec = MemoryBuffer::getOpenFileSlice(
- file->fd, file->name, buffer, file->filesize, offset)) {
- (*message)(LDPL_ERROR, ec.message().c_str());
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getOpenFileSlice(file->fd, file->name, file->filesize,
+ offset);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ (*message)(LDPL_ERROR, EC.message().c_str());
return LDPS_ERR;
}
+ buffer = std::move(BufferOrErr.get());
view = buffer->getBufferStart();
}
- if (!lto_module_is_object_file_in_memory(view, file->filesize))
+ if (!LTOModule::isBitcodeFile(view, file->filesize))
return LDPS_OK;
- M = lto_module_create_from_memory(view, file->filesize);
+ std::string Error;
+ LTOModule *M =
+ LTOModule::createFromBuffer(view, file->filesize, TargetOpts, Error);
if (!M) {
- if (const char* msg = lto_get_error_message()) {
- (*message)(LDPL_ERROR,
- "LLVM gold plugin has failed to create LTO module: %s",
- msg);
- return LDPS_ERR;
- }
+ (*message)(LDPL_ERROR,
+ "LLVM gold plugin has failed to create LTO module: %s",
+ Error.c_str());
return LDPS_OK;
}
@@ -285,21 +308,20 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
claimed_file &cf = Modules.back();
if (!options::triple.empty())
- lto_module_set_target_triple(M, options::triple.c_str());
+ M->setTargetTriple(options::triple.c_str());
cf.handle = file->handle;
- unsigned sym_count = lto_module_get_num_symbols(M);
+ unsigned sym_count = M->getSymbolCount();
cf.syms.reserve(sym_count);
for (unsigned i = 0; i != sym_count; ++i) {
- lto_symbol_attributes attrs = lto_module_get_symbol_attribute(M, i);
+ lto_symbol_attributes attrs = M->getSymbolAttributes(i);
if ((attrs & LTO_SYMBOL_SCOPE_MASK) == LTO_SYMBOL_SCOPE_INTERNAL)
continue;
cf.syms.push_back(ld_plugin_symbol());
ld_plugin_symbol &sym = cf.syms.back();
- sym.name = const_cast<char *>(lto_module_get_symbol_name(M, i));
- sym.name = strdup(sym.name);
+ sym.name = strdup(M->getSymbolName(i));
sym.version = NULL;
int scope = attrs & LTO_SYMBOL_SCOPE_MASK;
@@ -361,15 +383,15 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
}
}
- if (code_gen) {
- if (lto_codegen_add_module(code_gen, M)) {
- (*message)(LDPL_ERROR, "Error linking module: %s",
- lto_get_error_message());
+ if (CodeGen) {
+ std::string Error;
+ if (!CodeGen->addModule(M, Error)) {
+ (*message)(LDPL_ERROR, "Error linking module: %s", Error.c_str());
return LDPS_ERR;
}
}
- lto_module_dispose(M);
+ delete M;
return LDPS_OK;
}
@@ -387,15 +409,17 @@ static bool mustPreserve(const claimed_file &F, int i) {
/// been overridden by a native object file. Then, perform optimization and
/// codegen.
static ld_plugin_status all_symbols_read_hook(void) {
- std::ofstream api_file;
- assert(code_gen);
+ // FIXME: raw_fd_ostream should be able to represent an unopened file.
+ std::unique_ptr<raw_fd_ostream> api_file;
+
+ assert(CodeGen);
if (options::generate_api_file) {
- api_file.open("apifile.txt", std::ofstream::out | std::ofstream::trunc);
- if (!api_file.is_open()) {
- (*message)(LDPL_FATAL, "Unable to open apifile.txt for writing.");
- abort();
- }
+ std::string Error;
+ api_file.reset(new raw_fd_ostream("apifile.txt", Error, sys::fs::F_None));
+ if (!Error.empty())
+ (*message)(LDPL_FATAL, "Unable to open apifile.txt for writing: %s",
+ Error.c_str());
}
for (std::list<claimed_file>::iterator I = Modules.begin(),
@@ -405,29 +429,18 @@ static ld_plugin_status all_symbols_read_hook(void) {
(*get_symbols)(I->handle, I->syms.size(), &I->syms[0]);
for (unsigned i = 0, e = I->syms.size(); i != e; i++) {
if (mustPreserve(*I, i)) {
- lto_codegen_add_must_preserve_symbol(code_gen, I->syms[i].name);
+ CodeGen->addMustPreserveSymbol(I->syms[i].name);
if (options::generate_api_file)
- api_file << I->syms[i].name << "\n";
+ (*api_file) << I->syms[i].name << "\n";
}
}
}
- if (options::generate_api_file)
- api_file.close();
-
- lto_codegen_set_pic_model(code_gen, output_type);
- lto_codegen_set_debug_model(code_gen, LTO_DEBUG_MODEL_DWARF);
+ CodeGen->setCodePICModel(output_type);
+ CodeGen->setDebugInfo(LTO_DEBUG_MODEL_DWARF);
if (!options::mcpu.empty())
- lto_codegen_set_cpu(code_gen, options::mcpu.c_str());
-
- // Pass through extra options to the code generator.
- if (!options::extra.empty()) {
- for (std::vector<std::string>::iterator it = options::extra.begin();
- it != options::extra.end(); ++it) {
- lto_codegen_debug_options(code_gen, (*it).c_str());
- }
- }
+ CodeGen->setCpu(options::mcpu.c_str());
if (options::generate_bc_file != options::BC_NO) {
std::string path;
@@ -437,11 +450,11 @@ static ld_plugin_status all_symbols_read_hook(void) {
path = options::bc_path;
else
path = output_name + ".bc";
- bool err = lto_codegen_write_merged_modules(code_gen, path.c_str());
- if (err)
+ std::string Error;
+ if (!CodeGen->writeMergedModules(path.c_str(), Error))
(*message)(LDPL_FATAL, "Failed to write the output file.");
if (options::generate_bc_file == options::BC_ONLY) {
- lto_codegen_dispose(code_gen);
+ delete CodeGen;
exit(0);
}
}
@@ -449,13 +462,14 @@ static ld_plugin_status all_symbols_read_hook(void) {
std::string ObjPath;
{
const char *Temp;
- if (lto_codegen_compile_to_file(code_gen, &Temp)) {
+ std::string Error;
+ if (!CodeGen->compile_to_file(&Temp, /*DisableOpt*/ false, /*DisableInline*/
+ false, /*DisableGVNLoadPRE*/ false, Error))
(*message)(LDPL_ERROR, "Could not produce a combined object file\n");
- }
ObjPath = Temp;
}
- lto_codegen_dispose(code_gen);
+ delete CodeGen;
for (std::list<claimed_file>::iterator I = Modules.begin(),
E = Modules.end(); I != E; ++I) {
for (unsigned i = 0; i != I->syms.size(); ++i) {
@@ -484,7 +498,7 @@ static ld_plugin_status all_symbols_read_hook(void) {
static ld_plugin_status cleanup_hook(void) {
for (int i = 0, e = Cleanup.size(); i != e; ++i) {
- error_code EC = sys::fs::remove(Cleanup[i]);
+ std::error_code EC = sys::fs::remove(Cleanup[i]);
if (EC)
(*message)(LDPL_ERROR, "Failed to delete '%s': %s", Cleanup[i].c_str(),
EC.message().c_str());
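
Taken together, the gold-plugin hunks above replace the C lto_* entry points with the C++ LTOModule/LTOCodeGenerator API. A condensed sketch of the resulting claim path, using only calls that appear in the hunks (isBitcodeFile, createFromBuffer, addModule); error reporting is trimmed, so this is illustrative rather than a drop-in replacement:

    #include "llvm/LTO/LTOCodeGenerator.h"
    #include "llvm/LTO/LTOModule.h"
    #include "llvm/Target/TargetOptions.h"
    #include <string>
    using namespace llvm;

    // Returns false on a hard link error, true otherwise (including the
    // benign "not a bitcode file" case, where gold keeps the input).
    static bool claimBuffer(LTOCodeGenerator &CG, const void *View,
                            size_t Size, TargetOptions &Opts,
                            std::string &Error) {
      if (!LTOModule::isBitcodeFile(View, Size))
        return true;                      // not ours; do not claim it
      LTOModule *M = LTOModule::createFromBuffer(View, Size, Opts, Error);
      if (!M)
        return true;                      // report via Error, do not claim
      bool Ok = CG.addModule(M, Error);   // the generator copies what it
      delete M;                           // needs, so M can be freed here
      return Ok;
    }
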
diff --git a/tools/lli/RemoteMemoryManager.cpp b/tools/lli/RemoteMemoryManager.cpp
index 200ab75..4816517 100644
--- a/tools/lli/RemoteMemoryManager.cpp
+++ b/tools/lli/RemoteMemoryManager.cpp
@@ -61,7 +61,7 @@ allocateDataSection(uintptr_t Size, unsigned Alignment,
}
sys::MemoryBlock RemoteMemoryManager::allocateSection(uintptr_t Size) {
- error_code ec;
+ std::error_code ec;
sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(Size,
&Near,
sys::Memory::MF_READ |
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 4cde105..48828c1 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -285,8 +285,8 @@ public:
if (!getCacheFilename(ModuleID, CacheName))
return nullptr;
// Load the object from the cache filename
- std::unique_ptr<MemoryBuffer> IRObjectBuffer;
- MemoryBuffer::getFile(CacheName.c_str(), IRObjectBuffer, -1, false);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> IRObjectBuffer =
+ MemoryBuffer::getFile(CacheName.c_str(), -1, false);
// If the file isn't there, that's OK.
if (!IRObjectBuffer)
return nullptr;
@@ -294,7 +294,7 @@ public:
// because the file has probably just been mmapped. Instead we make
// a copy. The file-based buffer will be released when it goes
// out of scope.
- return MemoryBuffer::getMemBufferCopy(IRObjectBuffer->getBuffer());
+ return MemoryBuffer::getMemBufferCopy(IRObjectBuffer.get()->getBuffer());
}
private:
@@ -415,7 +415,7 @@ int main(int argc, char **argv, char * const *envp) {
// If not jitting lazily, load the whole bitcode file eagerly too.
if (NoLazyCompilation) {
- if (error_code EC = Mod->materializeAllPermanently()) {
+ if (std::error_code EC = Mod->materializeAllPermanently()) {
errs() << argv[0] << ": bitcode didn't read correctly.\n";
errs() << "Reason: " << EC.message() << "\n";
exit(1);
@@ -538,15 +538,15 @@ int main(int argc, char **argv, char * const *envp) {
}
for (unsigned i = 0, e = ExtraArchives.size(); i != e; ++i) {
- std::unique_ptr<MemoryBuffer> ArBuf;
- error_code ec;
- ec = MemoryBuffer::getFileOrSTDIN(ExtraArchives[i], ArBuf);
- if (ec) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> ArBuf =
+ MemoryBuffer::getFileOrSTDIN(ExtraArchives[i]);
+ if (!ArBuf) {
Err.print(argv[0], errs());
return 1;
}
- object::Archive *Ar = new object::Archive(ArBuf.release(), ec);
- if (ec || !Ar) {
+ std::error_code EC;
+ object::Archive *Ar = new object::Archive(std::move(ArBuf.get()), EC);
+ if (EC || !Ar) {
Err.print(argv[0], errs());
return 1;
}
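
The lli hunks switch the MemoryBuffer factories from an out-parameter plus error_code to a single ErrorOr<std::unique_ptr<MemoryBuffer>> return value, which the patch tests with plain operator bool. A small sketch of that convention (the getFile signature matches the calls above; the helper name is invented):

    #include "llvm/Support/ErrorOr.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include <memory>
    using namespace llvm;

    static std::unique_ptr<MemoryBuffer> loadOrNull(const char *Path) {
      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
          MemoryBuffer::getFile(Path, /*FileSize=*/-1,
                                /*RequiresNullTerminator=*/false);
      if (!BufOrErr)           // failure: BufOrErr.getError() has the reason
        return nullptr;
      return std::move(BufOrErr.get());
    }
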
diff --git a/tools/llvm-ar/Android.mk b/tools/llvm-ar/Android.mk
index 24bad69..23cd857 100644
--- a/tools/llvm-ar/Android.mk
+++ b/tools/llvm-ar/Android.mk
@@ -16,6 +16,53 @@ llvm_ar_STATIC_LIBRARIES := \
libLLVMCore \
libLLVMSupport \
+# libLLVMAArch64CodeGen \
+ libLLVMAArch64Info \
+ libLLVMAArch64Desc \
+ libLLVMAArch64AsmParser \
+ libLLVMAArch64AsmPrinter \
+ libLLVMAArch64Disassembler \
+ libLLVMARMCodeGen \
+ libLLVMARMInfo \
+ libLLVMARMDesc \
+ libLLVMARMAsmParser \
+ libLLVMARMAsmPrinter \
+ libLLVMARMDisassembler \
+ libLLVMMipsCodeGen \
+ libLLVMMipsInfo \
+ libLLVMMipsDesc \
+ libLLVMMipsAsmParser \
+ libLLVMMipsAsmPrinter \
+ libLLVMMipsDisassembler \
+ libLLVMX86CodeGen \
+ libLLVMX86Info \
+ libLLVMX86Desc \
+ libLLVMX86AsmParser \
+ libLLVMX86AsmPrinter \
+ libLLVMX86Disassembler \
+ libLLVMAsmPrinter \
+ libLLVMSelectionDAG \
+ libLLVMCodeGen \
+ libLLVMObject \
+ libLLVMScalarOpts \
+ libLLVMInstCombine \
+ libLLVMInstrumentation \
+ libLLVMTransformObjCARC \
+ libLLVMTransformUtils \
+ libLLVMipa \
+ libLLVMAnalysis \
+ libLLVMTarget \
+ libLLVMMC \
+ libLLVMMCParser \
+ libLLVMCore \
+ libLLVMAsmParser \
+ libLLVMOption \
+ libLLVMSupport \
+ libLLVMVectorize \
+
+
+
+
include $(CLEAR_VARS)
LOCAL_MODULE := llvm-ar
@@ -26,6 +73,7 @@ LOCAL_IS_HOST_MODULE := true
LOCAL_SRC_FILES := $(llvm_ar_SRC_FILES)
LOCAL_STATIC_LIBRARIES := $(llvm_ar_STATIC_LIBRARIES)
+LOCAL_SHARED_LIBRARIES := libLLVM
LOCAL_LDLIBS += -lpthread -lm -ldl
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 9295efe..0e809a7 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
+ ${LLVM_TARGETS_TO_BUILD}
Object
Support
)
diff --git a/tools/llvm-ar/Makefile b/tools/llvm-ar/Makefile
index 16a8283..e10d6ac 100644
--- a/tools/llvm-ar/Makefile
+++ b/tools/llvm-ar/Makefile
@@ -10,7 +10,7 @@
LEVEL := ../..
TOOLNAME := llvm-ar
TOOLALIAS = llvm-ranlib
-LINK_COMPONENTS := bitreader support object
+LINK_COMPONENTS := all-targets bitreader support object
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index ed7291e..f638e55 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -17,12 +17,14 @@
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -53,7 +55,7 @@ LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
exit(1);
}
-static void failIfError(error_code EC, Twine Context = "") {
+static void failIfError(std::error_code EC, Twine Context = "") {
if (!EC)
return;
@@ -368,8 +370,9 @@ static void performReadOperation(ArchiveOperation Operation,
for (object::Archive::child_iterator I = OldArchive->child_begin(),
E = OldArchive->child_end();
I != E; ++I) {
- StringRef Name;
- failIfError(I->getName(Name));
+ ErrorOr<StringRef> NameOrErr = I->getName();
+ failIfError(NameOrErr.getError());
+ StringRef Name = NameOrErr.get();
if (!Members.empty() &&
std::find(Members.begin(), Members.end(), Name) == Members.end())
@@ -453,8 +456,7 @@ int NewArchiveIterator::getFD() const {
// Linux cannot open directories with open(2), although
// cygwin and *bsd can.
if (NewStatus.type() == sys::fs::file_type::directory_file)
- failIfError(error_code(errc::is_a_directory, posix_category()),
- NewFilename);
+ failIfError(make_error_code(errc::is_a_directory), NewFilename);
return NewFD;
}
@@ -544,8 +546,9 @@ computeNewArchiveMembers(ArchiveOperation Operation,
E = OldArchive->child_end();
I != E; ++I) {
int Pos = Ret.size();
- StringRef Name;
- failIfError(I->getName(Name));
+ ErrorOr<StringRef> NameOrErr = I->getName();
+ failIfError(NameOrErr.getError());
+ StringRef Name = NameOrErr.get();
if (Name == PosName) {
assert(AddAfter || AddBefore);
if (AddBefore)
@@ -681,61 +684,51 @@ static void writeStringTable(raw_fd_ostream &Out,
Out.seek(Pos);
}
-static void writeSymbolTable(
- raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
- ArrayRef<MemoryBuffer *> Buffers,
- std::vector<std::pair<unsigned, unsigned> > &MemberOffsetRefs) {
+static void
+writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
+ MutableArrayRef<std::unique_ptr<MemoryBuffer>> Buffers,
+ std::vector<std::pair<unsigned, unsigned>> &MemberOffsetRefs) {
unsigned StartOffset = 0;
unsigned MemberNum = 0;
std::string NameBuf;
raw_string_ostream NameOS(NameBuf);
unsigned NumSyms = 0;
- std::vector<object::SymbolicFile *> DeleteIt;
LLVMContext &Context = getGlobalContext();
for (ArrayRef<NewArchiveIterator>::iterator I = Members.begin(),
E = Members.end();
I != E; ++I, ++MemberNum) {
- MemoryBuffer *MemberBuffer = Buffers[MemberNum];
+ std::unique_ptr<MemoryBuffer> &MemberBuffer = Buffers[MemberNum];
ErrorOr<object::SymbolicFile *> ObjOrErr =
object::SymbolicFile::createSymbolicFile(
- MemberBuffer, false, sys::fs::file_magic::unknown, &Context);
+ MemberBuffer, sys::fs::file_magic::unknown, &Context);
if (!ObjOrErr)
continue; // FIXME: check only for "not an object file" errors.
- object::SymbolicFile *Obj = ObjOrErr.get();
+ std::unique_ptr<object::SymbolicFile> Obj(ObjOrErr.get());
- DeleteIt.push_back(Obj);
if (!StartOffset) {
printMemberHeader(Out, "", sys::TimeValue::now(), 0, 0, 0, 0);
StartOffset = Out.tell();
print32BE(Out, 0);
}
- for (object::basic_symbol_iterator I = Obj->symbol_begin(),
- E = Obj->symbol_end();
- I != E; ++I) {
- uint32_t Symflags = I->getFlags();
+ for (const object::BasicSymbolRef &S : Obj->symbols()) {
+ uint32_t Symflags = S.getFlags();
if (Symflags & object::SymbolRef::SF_FormatSpecific)
continue;
if (!(Symflags & object::SymbolRef::SF_Global))
continue;
if (Symflags & object::SymbolRef::SF_Undefined)
continue;
- failIfError(I->printName(NameOS));
+ failIfError(S.printName(NameOS));
NameOS << '\0';
++NumSyms;
MemberOffsetRefs.push_back(std::make_pair(Out.tell(), MemberNum));
print32BE(Out, 0);
}
+ MemberBuffer.reset(Obj->releaseBuffer());
}
Out << NameOS.str();
- for (std::vector<object::SymbolicFile *>::iterator I = DeleteIt.begin(),
- E = DeleteIt.end();
- I != E; ++I) {
- object::SymbolicFile *O = *I;
- delete O;
- }
-
if (StartOffset == 0)
return;
@@ -766,7 +759,7 @@ static void performWriteOperation(ArchiveOperation Operation,
std::vector<std::pair<unsigned, unsigned> > MemberOffsetRefs;
- std::vector<MemoryBuffer *> MemberBuffers;
+ std::vector<std::unique_ptr<MemoryBuffer>> MemberBuffers;
MemberBuffers.resize(NewMembers.size());
for (unsigned I = 0, N = NewMembers.size(); I < N; ++I) {
@@ -777,15 +770,18 @@ static void performWriteOperation(ArchiveOperation Operation,
const char *Filename = Member.getNew();
int FD = Member.getFD();
const sys::fs::file_status &Status = Member.getStatus();
- failIfError(MemoryBuffer::getOpenFile(FD, Filename, MemberBuffer,
- Status.getSize(), false),
- Filename);
-
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
+ MemoryBuffer::getOpenFile(FD, Filename, Status.getSize(), false);
+ failIfError(MemberBufferOrErr.getError(), Filename);
+ MemberBuffer = std::move(MemberBufferOrErr.get());
} else {
object::Archive::child_iterator OldMember = Member.getOld();
- failIfError(OldMember->getMemoryBuffer(MemberBuffer));
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
+ OldMember->getMemoryBuffer();
+ failIfError(MemberBufferOrErr.getError());
+ MemberBuffer = std::move(MemberBufferOrErr.get());
}
- MemberBuffers[I] = MemberBuffer.release();
+ MemberBuffers[I].reset(MemberBuffer.release());
}
if (Symtab) {
@@ -813,7 +809,7 @@ static void performWriteOperation(ArchiveOperation Operation,
}
Out.seek(Pos);
- const MemoryBuffer *File = MemberBuffers[MemberNum];
+ const MemoryBuffer *File = MemberBuffers[MemberNum].get();
if (I->isNewMember()) {
const char *FileName = I->getNew();
const sys::fs::file_status &Status = I->getStatus();
@@ -849,10 +845,6 @@ static void performWriteOperation(ArchiveOperation Operation,
Out << '\n';
}
- for (unsigned I = 0, N = MemberBuffers.size(); I < N; ++I) {
- delete MemberBuffers[I];
- }
-
Output.keep();
Out.close();
sys::fs::rename(TemporaryOutput, ArchiveName);
@@ -912,6 +904,10 @@ int main(int argc, char **argv) {
" This program archives bitcode files into single libraries\n"
);
+ llvm::InitializeAllTargetInfos();
+ llvm::InitializeAllTargetMCs();
+ llvm::InitializeAllAsmParsers();
+
StringRef Stem = sys::path::stem(ToolName);
if (Stem.find("ar") != StringRef::npos)
return ar_main(argv);
@@ -938,16 +934,17 @@ int ar_main(char **argv) {
static int performOperation(ArchiveOperation Operation) {
// Create or open the archive object.
- std::unique_ptr<MemoryBuffer> Buf;
- error_code EC = MemoryBuffer::getFile(ArchiveName, Buf, -1, false);
- if (EC && EC != llvm::errc::no_such_file_or_directory) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ MemoryBuffer::getFile(ArchiveName, -1, false);
+ std::error_code EC = Buf.getError();
+ if (EC && EC != errc::no_such_file_or_directory) {
errs() << ToolName << ": error opening '" << ArchiveName
<< "': " << EC.message() << "!\n";
return 1;
}
if (!EC) {
- object::Archive Archive(Buf.release(), EC);
+ object::Archive Archive(std::move(Buf.get()), EC);
if (EC) {
errs() << ToolName << ": error loading '" << ArchiveName
@@ -958,7 +955,7 @@ static int performOperation(ArchiveOperation Operation) {
return 0;
}
- assert(EC == llvm::errc::no_such_file_or_directory);
+ assert(EC == errc::no_such_file_or_directory);
if (!shouldCreateArchive(Operation)) {
failIfError(EC, Twine("error loading '") + ArchiveName + "'");
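
Beyond the error_code rename, the llvm-ar hunks replace raw MemoryBuffer*/SymbolicFile* vectors and their manual delete loops with std::unique_ptr ownership. The shape of that change, reduced to standard C++ with a placeholder Widget type:

    #include <memory>
    #include <vector>

    struct Widget { int Payload = 0; };

    int main() {
      // Before: std::vector<Widget *> plus an explicit delete loop at the
      // end. After: the container owns its elements and cleanup is automatic.
      std::vector<std::unique_ptr<Widget>> Buffers;
      Buffers.resize(3);
      for (auto &B : Buffers)
        B.reset(new Widget());  // or B = std::make_unique<Widget>() in C++14
      return 0;                 // no delete loop needed
    }
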
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index dfdaa03..15567cf 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -38,10 +38,10 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cctype>
#include <map>
+#include <system_error>
using namespace llvm;
static cl::opt<std::string>
@@ -478,11 +478,11 @@ static void PrintSize(uint64_t Bits) {
/// AnalyzeBitcode - Analyze the bitcode file specified by InputFilename.
static int AnalyzeBitcode() {
// Read the input file.
- std::unique_ptr<MemoryBuffer> MemBuf;
-
- if (error_code ec =
- MemoryBuffer::getFileOrSTDIN(InputFilename, MemBuf))
- return Error("Error reading '" + InputFilename + "': " + ec.message());
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr =
+ MemoryBuffer::getFileOrSTDIN(InputFilename);
+ if (std::error_code EC = MemBufOrErr.getError())
+ return Error("Error reading '" + InputFilename + "': " + EC.message());
+ std::unique_ptr<MemoryBuffer> MemBuf = std::move(MemBufOrErr.get());
if (MemBuf->getBufferSize() & 3)
return Error("Bitcode stream should be a multiple of 4 bytes in length");
diff --git a/tools/llvm-c-test/Android.mk b/tools/llvm-c-test/Android.mk
index f26c989..3ab8830 100644
--- a/tools/llvm-c-test/Android.mk
+++ b/tools/llvm-c-test/Android.mk
@@ -50,6 +50,7 @@ llvm_c_test_STATIC_LIBRARIES := \
libLLVMTarget \
libLLVMMC \
libLLVMObject \
+ libLLVMMCParser \
libLLVMCore \
libLLVMAsmParser \
libLLVMOption \
diff --git a/tools/llvm-config/Android.mk b/tools/llvm-config/Android.mk
index 628a62c..bdc409d 100644
--- a/tools/llvm-config/Android.mk
+++ b/tools/llvm-config/Android.mk
@@ -19,7 +19,7 @@ llvm_config_LOCAL_INCLUDES := \
LibraryDependencies.inc
llvm_config_DEPENDENCIES := \
- BuildVariables.inc
+ $(LOCAL_PATH)/BuildVariables.inc.in
include $(CLEAR_VARS)
diff --git a/tools/llvm-cov/llvm-cov.cpp b/tools/llvm-cov/llvm-cov.cpp
index 9463609..18cc1b1 100644
--- a/tools/llvm-cov/llvm-cov.cpp
+++ b/tools/llvm-cov/llvm-cov.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GCOV.h"
#include "llvm/Support/ManagedStatic.h"
@@ -20,11 +21,11 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
-static cl::opt<std::string> SourceFile(cl::Positional, cl::Required,
- cl::desc("SOURCEFILE"));
+static cl::list<std::string> SourceFiles(cl::Positional, cl::OneOrMore,
+ cl::desc("SOURCEFILE"));
static cl::opt<bool> AllBlocks("a", cl::Grouping, cl::init(false),
cl::desc("Display all basic blocks"));
@@ -75,15 +76,7 @@ static cl::opt<std::string> InputGCNO("gcno", cl::cat(DebugCat), cl::init(""),
static cl::opt<std::string> InputGCDA("gcda", cl::cat(DebugCat), cl::init(""),
cl::desc("Override inferred gcda file"));
-//===----------------------------------------------------------------------===//
-int main(int argc, char **argv) {
- // Print a stack trace if we signal out.
- sys::PrintStackTraceOnErrorSignal();
- PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
-
- cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
-
+void reportCoverage(StringRef SourceFile) {
SmallString<128> CoverageFileStem(ObjectDir);
if (CoverageFileStem.empty()) {
// If no directory was specified with -o, look next to the source file.
@@ -96,37 +89,40 @@ int main(int argc, char **argv) {
// A file was given. Ignore the source file and look next to this file.
sys::path::replace_extension(CoverageFileStem, "");
- if (InputGCNO.empty())
- InputGCNO = (CoverageFileStem.str() + ".gcno").str();
- if (InputGCDA.empty())
- InputGCDA = (CoverageFileStem.str() + ".gcda").str();
-
+ std::string GCNO = InputGCNO.empty()
+ ? std::string(CoverageFileStem.str()) + ".gcno"
+ : InputGCNO;
+ std::string GCDA = InputGCDA.empty()
+ ? std::string(CoverageFileStem.str()) + ".gcda"
+ : InputGCDA;
GCOVFile GF;
- std::unique_ptr<MemoryBuffer> GCNO_Buff;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputGCNO, GCNO_Buff)) {
- errs() << InputGCNO << ": " << ec.message() << "\n";
- return 1;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> GCNO_Buff =
+ MemoryBuffer::getFileOrSTDIN(GCNO);
+ if (std::error_code EC = GCNO_Buff.getError()) {
+ errs() << GCNO << ": " << EC.message() << "\n";
+ return;
}
- GCOVBuffer GCNO_GB(GCNO_Buff.get());
+ GCOVBuffer GCNO_GB(GCNO_Buff.get().get());
if (!GF.readGCNO(GCNO_GB)) {
errs() << "Invalid .gcno File!\n";
- return 1;
+ return;
}
- std::unique_ptr<MemoryBuffer> GCDA_Buff;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputGCDA, GCDA_Buff)) {
- if (ec != errc::no_such_file_or_directory) {
- errs() << InputGCDA << ": " << ec.message() << "\n";
- return 1;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> GCDA_Buff =
+ MemoryBuffer::getFileOrSTDIN(GCDA);
+ if (std::error_code EC = GCDA_Buff.getError()) {
+ if (EC != errc::no_such_file_or_directory) {
+ errs() << GCDA << ": " << EC.message() << "\n";
+ return;
}
// Clear the filename to make it clear we didn't read anything.
- InputGCDA = "-";
+ GCDA = "-";
} else {
- GCOVBuffer GCDA_GB(GCDA_Buff.get());
+ GCOVBuffer GCDA_GB(GCDA_Buff.get().get());
if (!GF.readGCDA(GCDA_GB)) {
errs() << "Invalid .gcda File!\n";
- return 1;
+ return;
}
}
@@ -137,6 +133,18 @@ int main(int argc, char **argv) {
PreservePaths, UncondBranch, LongNames, NoOutput);
FileInfo FI(Options);
GF.collectLineCounts(FI);
- FI.print(SourceFile, InputGCNO, InputGCDA);
+ FI.print(SourceFile, GCNO, GCDA);
+}
+
+int main(int argc, char **argv) {
+ // Print a stack trace if we signal out.
+ sys::PrintStackTraceOnErrorSignal();
+ PrettyStackTraceProgram X(argc, argv);
+ llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+ cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
+
+ for (const auto &SourceFile : SourceFiles)
+ reportCoverage(SourceFile);
return 0;
}
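
The llvm-cov rewrite above swaps the single required positional cl::opt<std::string> for a cl::list marked cl::OneOrMore, then loops reportCoverage over every argument. A minimal sketch of the cl::list idiom (tool banner and variable names are placeholders):

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static cl::list<std::string> Inputs(cl::Positional, cl::OneOrMore,
                                        cl::desc("<input files>"));

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv, "cl::list sketch\n");
      for (const auto &F : Inputs)   // one pass per positional argument
        outs() << F << "\n";
      return 0;
    }
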
diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp
index 0df7328..3b0f838 100644
--- a/tools/llvm-dis/llvm-dis.cpp
+++ b/tools/llvm-dis/llvm-dis.cpp
@@ -32,7 +32,7 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
static cl::opt<std::string>
@@ -137,7 +137,7 @@ int main(int argc, char **argv) {
M.reset(getStreamedBitcodeModule(DisplayFilename, streamer, Context,
&ErrorMessage));
if(M.get()) {
- if (error_code EC = M->materializeAllPermanently()) {
+ if (std::error_code EC = M->materializeAllPermanently()) {
ErrorMessage = EC.message();
M.reset();
}
diff --git a/tools/llvm-dwarfdump/Android.mk b/tools/llvm-dwarfdump/Android.mk
index 7908201..61049e8 100644
--- a/tools/llvm-dwarfdump/Android.mk
+++ b/tools/llvm-dwarfdump/Android.mk
@@ -14,6 +14,8 @@ llvm_dwarfdump_STATIC_LIBRARIES := \
libLLVMDebugInfo \
libLLVMObject \
libLLVMBitReader \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 46ac36e..f44b0e3 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -25,11 +25,11 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
#include <list>
#include <string>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -66,15 +66,16 @@ DumpType("debug-dump", cl::init(DIDT_All),
clEnumValEnd));
static void DumpInput(const StringRef &Filename) {
- std::unique_ptr<MemoryBuffer> Buff;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buff =
+ MemoryBuffer::getFileOrSTDIN(Filename);
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
- errs() << Filename << ": " << ec.message() << "\n";
+ if (std::error_code EC = Buff.getError()) {
+ errs() << Filename << ": " << EC.message() << "\n";
return;
}
- ErrorOr<ObjectFile*> ObjOrErr(ObjectFile::createObjectFile(Buff.release()));
- if (error_code EC = ObjOrErr.getError()) {
+ ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(Buff.get()));
+ if (std::error_code EC = ObjOrErr.getError()) {
errs() << Filename << ": " << EC.message() << '\n';
return;
}
diff --git a/tools/llvm-lto/Android.mk b/tools/llvm-lto/Android.mk
index a5782d9..1a9979e 100644
--- a/tools/llvm-lto/Android.mk
+++ b/tools/llvm-lto/Android.mk
@@ -19,9 +19,9 @@ llvm_lto_STATIC_LIBRARIES := \
libLLVMARMDisassembler \
libLLVMAArch64CodeGen \
libLLVMAArch64Info \
- libLLVMAArch64Desc \
- libLLVMAArch64AsmPrinter \
libLLVMAArch64AsmParser \
+ libLLVMAArch64AsmPrinter \
+ libLLVMAArch64Desc \
libLLVMAArch64Utils \
libLLVMAArch64Disassembler \
libLLVMMipsCodeGen \
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 8c2d1cd..8b39f12 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
for (unsigned i = BaseArg; i < InputFilenames.size(); ++i) {
std::string error;
std::unique_ptr<LTOModule> Module(
- LTOModule::makeLTOModule(InputFilenames[i].c_str(), Options, error));
+ LTOModule::createFromFile(InputFilenames[i].c_str(), Options, error));
if (!error.empty()) {
errs() << argv[0] << ": error loading file '" << InputFilenames[i]
<< "': " << error << "\n";
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 84d578b..4c5b230 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -52,7 +52,8 @@ static cl::opt<bool>
ShowEncoding("show-encoding", cl::desc("Show instruction encodings"));
static cl::opt<bool>
-CompressDebugSections("compress-debug-sections", cl::desc("Compress DWARF debug sections"));
+CompressDebugSections("compress-debug-sections",
+ cl::desc("Compress DWARF debug sections"));
static cl::opt<bool>
ShowInst("show-inst", cl::desc("Show internal instruction representation"));
@@ -65,6 +66,10 @@ static cl::opt<unsigned>
OutputAsmVariant("output-asm-variant",
cl::desc("Syntax variant to use for output printing"));
+static cl::opt<bool>
+PrintImmHex("print-imm-hex", cl::init(false),
+ cl::desc("Prefer hex format for immediate values"));
+
enum OutputFileType {
OFT_Null,
OFT_AssemblyFile,
@@ -145,9 +150,6 @@ static cl::opt<bool>
GenDwarfForAssembly("g", cl::desc("Generate dwarf debugging info for assembly "
"source files"));
-static cl::opt<int>
-DwarfVersion("dwarf-version", cl::desc("Dwarf version"), cl::init(4));
-
static cl::opt<std::string>
DebugCompilationDir("fdebug-compilation-dir",
cl::desc("Specifies the debug info's compilation dir"));
@@ -167,7 +169,6 @@ enum ActionType {
AC_Assemble,
AC_Disassemble,
AC_MDisassemble,
- AC_HDisassemble
};
static cl::opt<ActionType>
@@ -181,9 +182,6 @@ Action(cl::desc("Action to perform:"),
"Disassemble strings of hex bytes"),
clEnumValN(AC_MDisassemble, "mdis",
"Marked up disassembly of strings of hex bytes"),
- clEnumValN(AC_HDisassemble, "hdis",
- "Disassemble strings of hex bytes printing "
- "immediates as hex"),
clEnumValEnd));
static const Target *GetTarget(const char *ProgName) {
@@ -240,10 +238,11 @@ static void setDwarfDebugProducer(void) {
DwarfDebugProducer += getenv("DEBUG_PRODUCER");
}
-static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, tool_output_file *Out) {
+static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI,
+ tool_output_file *Out) {
AsmLexer Lexer(MAI);
- Lexer.setBuffer(SrcMgr.getMemoryBuffer(0));
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer());
bool Error = false;
while (Lexer.Lex().isNot(AsmToken::Eof)) {
@@ -320,12 +319,13 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, tool_output_file *Out)
static int AssembleInput(const char *ProgName, const Target *TheTarget,
SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
- MCAsmInfo &MAI, MCSubtargetInfo &STI, MCInstrInfo &MCII) {
+ MCAsmInfo &MAI, MCSubtargetInfo &STI,
+ MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
std::unique_ptr<MCAsmParser> Parser(
createMCAsmParser(SrcMgr, Ctx, Str, MAI));
std::unique_ptr<MCTargetAsmParser> TAP(
- TheTarget->createMCAsmParser(STI, *Parser, MCII,
- InitMCTargetOptionsFromFlags()));
+ TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions));
+
if (!TAP) {
errs() << ProgName
<< ": error: this target does not support assembly parsing.\n";
@@ -356,6 +356,7 @@ int main(int argc, char **argv) {
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n");
+ MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags();
TripleName = Triple::normalize(TripleName);
setDwarfDebugFlags(argc, argv);
@@ -366,12 +367,13 @@ int main(int argc, char **argv) {
if (!TheTarget)
return 1;
- std::unique_ptr<MemoryBuffer> BufferPtr;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename, BufferPtr)) {
- errs() << ProgName << ": " << ec.message() << '\n';
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+ MemoryBuffer::getFileOrSTDIN(InputFilename);
+ if (std::error_code EC = BufferPtr.getError()) {
+ errs() << ProgName << ": " << EC.message() << '\n';
return 1;
}
- MemoryBuffer *Buffer = BufferPtr.release();
+ MemoryBuffer *Buffer = BufferPtr->release();
SourceMgr SrcMgr;
@@ -390,7 +392,8 @@ int main(int argc, char **argv) {
if (CompressDebugSections) {
if (!zlib::isAvailable()) {
- errs() << ProgName << ": build tools with zlib to enable -compress-debug-sections";
+ errs() << ProgName
+ << ": build tools with zlib to enable -compress-debug-sections";
return 1;
}
MAI->setCompressDebugSections(true);
@@ -398,14 +401,16 @@ int main(int argc, char **argv) {
// FIXME: This is not pretty. MCContext has a ptr to MCObjectFileInfo and
// MCObjectFileInfo needs a MCContext reference in order to initialize itself.
- std::unique_ptr<MCObjectFileInfo> MOFI(new MCObjectFileInfo());
- MCContext Ctx(MAI.get(), MRI.get(), MOFI.get(), &SrcMgr);
- MOFI->InitMCObjectFileInfo(TripleName, RelocModel, CMModel, Ctx);
+ MCObjectFileInfo MOFI;
+ MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+ MOFI.InitMCObjectFileInfo(TripleName, RelocModel, CMModel, Ctx);
if (SaveTempLabels)
Ctx.setAllowTemporaryLabels(false);
Ctx.setGenDwarfForAssembly(GenDwarfForAssembly);
+ // Default to 4 for dwarf version.
+ unsigned DwarfVersion = MCOptions.DwarfVersion ? MCOptions.DwarfVersion : 4;
if (DwarfVersion < 2 || DwarfVersion > 4) {
errs() << ProgName << ": Dwarf version " << DwarfVersion
<< " is not supported." << '\n';
@@ -445,6 +450,11 @@ int main(int argc, char **argv) {
if (FileType == OFT_AssemblyFile) {
IP =
TheTarget->createMCInstPrinter(OutputAsmVariant, *MAI, *MCII, *MRI, *STI);
+
+ // Set the display preference for hex vs. decimal immediates.
+ IP->setPrintImmHex(PrintImmHex);
+
+ // Set up the AsmStreamer.
MCCodeEmitter *CE = nullptr;
MCAsmBackend *MAB = nullptr;
if (ShowEncoding) {
@@ -473,18 +483,14 @@ int main(int argc, char **argv) {
Res = AsLexInput(SrcMgr, *MAI, Out.get());
break;
case AC_Assemble:
- Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI, *MCII);
+ Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI,
+ *MCII, MCOptions);
break;
case AC_MDisassemble:
assert(IP && "Expected assembly output");
IP->setUseMarkup(1);
disassemble = true;
break;
- case AC_HDisassemble:
- assert(IP && "Expected assembly output");
- IP->setPrintImmHex(1);
- disassemble = true;
- break;
case AC_Disassemble:
disassemble = true;
break;
diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
index f3a3e45..a878f11 100644
--- a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
@@ -19,7 +19,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
static cl::list<std::string>
@@ -135,12 +135,13 @@ MarkupTag MarkupParser::parseTag() {
}
static void parseMCMarkup(StringRef Filename) {
- std::unique_ptr<MemoryBuffer> BufferPtr;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, BufferPtr)) {
- errs() << ToolName << ": " << ec.message() << '\n';
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = BufferPtr.getError()) {
+ errs() << ToolName << ": " << EC.message() << '\n';
return;
}
- MemoryBuffer *Buffer = BufferPtr.release();
+ MemoryBuffer *Buffer = BufferPtr->release();
SourceMgr SrcMgr;
diff --git a/tools/llvm-nm/Android.mk b/tools/llvm-nm/Android.mk
index 93bd9a3..98e7ba9 100644
--- a/tools/llvm-nm/Android.mk
+++ b/tools/llvm-nm/Android.mk
@@ -11,8 +11,36 @@ llvm_nm_SRC_FILES := \
llvm-nm.cpp
llvm_nm_STATIC_LIBRARIES := \
+ libLLVMARMCodeGen \
+ libLLVMARMInfo \
+ libLLVMARMDesc \
+ libLLVMARMAsmPrinter \
+ libLLVMARMAsmParser \
+ libLLVMARMDisassembler \
+ libLLVMAArch64CodeGen \
+ libLLVMAArch64Info \
+ libLLVMAArch64AsmParser \
+ libLLVMAArch64Desc \
+ libLLVMAArch64AsmPrinter \
+ libLLVMAArch64Utils \
+ libLLVMAArch64Disassembler \
+ libLLVMMipsCodeGen \
+ libLLVMMipsInfo \
+ libLLVMMipsAsmParser \
+ libLLVMMipsDesc \
+ libLLVMMipsAsmPrinter \
+ libLLVMMipsDisassembler \
+ libLLVMX86CodeGen \
+ libLLVMX86Info \
+ libLLVMX86Desc \
+ libLLVMX86AsmPrinter \
+ libLLVMX86AsmParser \
+ libLLVMX86Utils \
+ libLLVMX86Disassembler \
libLLVMObject \
libLLVMBitReader \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/llvm-nm/CMakeLists.txt b/tools/llvm-nm/CMakeLists.txt
index 6128bf9..1fe4a2d 100644
--- a/tools/llvm-nm/CMakeLists.txt
+++ b/tools/llvm-nm/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
+ ${LLVM_TARGETS_TO_BUILD}
Object
Support
)
diff --git a/tools/llvm-nm/Makefile b/tools/llvm-nm/Makefile
index b95e920..ec20cef 100644
--- a/tools/llvm-nm/Makefile
+++ b/tools/llvm-nm/Makefile
@@ -9,7 +9,7 @@
LEVEL := ../..
TOOLNAME := llvm-nm
-LINK_COMPONENTS := bitreader object
+LINK_COMPONENTS := all-targets bitreader object
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 3be9247..3bd9ef9 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -37,21 +37,23 @@
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include "llvm/Support/TargetSelect.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstring>
+#include <system_error>
#include <vector>
using namespace llvm;
using namespace object;
namespace {
-enum OutputFormatTy { bsd, sysv, posix };
+enum OutputFormatTy { bsd, sysv, posix, darwin };
cl::opt<OutputFormatTy> OutputFormat(
"format", cl::desc("Specify output format"),
cl::values(clEnumVal(bsd, "BSD format"), clEnumVal(sysv, "System V format"),
- clEnumVal(posix, "POSIX.2 format"), clEnumValEnd),
+ clEnumVal(posix, "POSIX.2 format"),
+ clEnumVal(darwin, "Darwin -m format"), clEnumValEnd),
cl::init(bsd));
cl::alias OutputFormat2("f", cl::desc("Alias for --format"),
cl::aliasopt(OutputFormat));
@@ -72,6 +74,8 @@ cl::alias DynamicSyms2("D", cl::desc("Alias for --dynamic"),
cl::opt<bool> DefinedOnly("defined-only",
cl::desc("Show only defined symbols"));
+cl::alias DefinedOnly2("U", cl::desc("Alias for --defined-only"),
+ cl::aliasopt(DefinedOnly));
cl::opt<bool> ExternalOnly("extern-only",
cl::desc("Show only external symbols"));
@@ -80,6 +84,12 @@ cl::alias ExternalOnly2("g", cl::desc("Alias for --extern-only"),
cl::opt<bool> BSDFormat("B", cl::desc("Alias for --format=bsd"));
cl::opt<bool> POSIXFormat("P", cl::desc("Alias for --format=posix"));
+cl::opt<bool> DarwinFormat("m", cl::desc("Alias for --format=darwin"));
+
+static cl::list<std::string>
+ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
+ cl::ZeroOrMore);
+bool ArchAll = false;
cl::opt<bool> PrintFileName(
"print-file-name",
@@ -104,6 +114,10 @@ cl::alias NumericSortv("v", cl::desc("Alias for --numeric-sort"),
cl::opt<bool> NoSort("no-sort", cl::desc("Show symbols in order encountered"));
cl::alias NoSortp("p", cl::desc("Alias for --no-sort"), cl::aliasopt(NoSort));
+cl::opt<bool> ReverseSort("reverse-sort", cl::desc("Sort in reverse order"));
+cl::alias ReverseSortr("r", cl::desc("Alias for --reverse-sort"),
+ cl::aliasopt(ReverseSort));
+
cl::opt<bool> PrintSize("print-size",
cl::desc("Show symbol size instead of address"));
cl::alias PrintSizeS("S", cl::desc("Alias for --print-size"),
@@ -115,8 +129,13 @@ cl::opt<bool> WithoutAliases("without-aliases", cl::Hidden,
cl::desc("Exclude aliases from output"));
cl::opt<bool> ArchiveMap("print-armap", cl::desc("Print the archive map"));
-cl::alias ArchiveMaps("s", cl::desc("Alias for --print-armap"),
+cl::alias ArchiveMaps("M", cl::desc("Alias for --print-armap"),
cl::aliasopt(ArchiveMap));
+
+cl::opt<bool> JustSymbolName("just-symbol-name",
+ cl::desc("Print just the symbol's name"));
+cl::alias JustSymbolNames("j", cl::desc("Alias for --just-symbol-name"),
+ cl::aliasopt(JustSymbolName));
bool PrintAddress = true;
bool MultipleFiles = false;
@@ -131,7 +150,7 @@ static void error(Twine Message, Twine Path = Twine()) {
errs() << ToolName << ": " << Path << ": " << Message << ".\n";
}
-static bool error(error_code EC, Twine Path = Twine()) {
+static bool error(std::error_code EC, Twine Path = Twine()) {
if (EC) {
error(EC.message(), Path);
return true;
@@ -145,40 +164,74 @@ struct NMSymbol {
uint64_t Size;
char TypeChar;
StringRef Name;
+ DataRefImpl Symb;
};
}
static bool compareSymbolAddress(const NMSymbol &A, const NMSymbol &B) {
- if (A.Address < B.Address)
- return true;
- else if (A.Address == B.Address && A.Name < B.Name)
- return true;
- else if (A.Address == B.Address && A.Name == B.Name && A.Size < B.Size)
- return true;
- else
- return false;
+ if (!ReverseSort) {
+ if (A.Address < B.Address)
+ return true;
+ else if (A.Address == B.Address && A.Name < B.Name)
+ return true;
+ else if (A.Address == B.Address && A.Name == B.Name && A.Size < B.Size)
+ return true;
+ else
+ return false;
+ } else {
+ if (A.Address > B.Address)
+ return true;
+ else if (A.Address == B.Address && A.Name > B.Name)
+ return true;
+ else if (A.Address == B.Address && A.Name == B.Name && A.Size > B.Size)
+ return true;
+ else
+ return false;
+ }
}
static bool compareSymbolSize(const NMSymbol &A, const NMSymbol &B) {
- if (A.Size < B.Size)
- return true;
- else if (A.Size == B.Size && A.Name < B.Name)
- return true;
- else if (A.Size == B.Size && A.Name == B.Name && A.Address < B.Address)
- return true;
- else
- return false;
+ if (!ReverseSort) {
+ if (A.Size < B.Size)
+ return true;
+ else if (A.Size == B.Size && A.Name < B.Name)
+ return true;
+ else if (A.Size == B.Size && A.Name == B.Name && A.Address < B.Address)
+ return true;
+ else
+ return false;
+ } else {
+ if (A.Size > B.Size)
+ return true;
+ else if (A.Size == B.Size && A.Name > B.Name)
+ return true;
+ else if (A.Size == B.Size && A.Name == B.Name && A.Address > B.Address)
+ return true;
+ else
+ return false;
+ }
}
static bool compareSymbolName(const NMSymbol &A, const NMSymbol &B) {
- if (A.Name < B.Name)
- return true;
- else if (A.Name == B.Name && A.Size < B.Size)
- return true;
- else if (A.Name == B.Name && A.Size == B.Size && A.Address < B.Address)
- return true;
- else
- return false;
+ if (!ReverseSort) {
+ if (A.Name < B.Name)
+ return true;
+ else if (A.Name == B.Name && A.Size < B.Size)
+ return true;
+ else if (A.Name == B.Name && A.Size == B.Size && A.Address < B.Address)
+ return true;
+ else
+ return false;
+ } else {
+ if (A.Name > B.Name)
+ return true;
+ else if (A.Name == B.Name && A.Size > B.Size)
+ return true;
+ else if (A.Name == B.Name && A.Size == B.Size && A.Address > B.Address)
+ return true;
+ else
+ return false;
+ }
}
static char isSymbolList64Bit(SymbolicFile *Obj) {
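
As an aside, the six branches in each comparator above could be folded with std::tie, since lexicographic tuple comparison encodes the same tie-breaking order; a sketch only, assuming this file's NMSymbol and ReverseSort, not part of the patch:

    #include <tuple>

    static bool lessByAddress(const NMSymbol &A, const NMSymbol &B) {
      auto KeyA = std::tie(A.Address, A.Name, A.Size);
      auto KeyB = std::tie(B.Address, B.Name, B.Size);
      // Swapping the operands reverses the order without duplicating logic.
      return ReverseSort ? KeyB < KeyA : KeyA < KeyB;
    }
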
@@ -194,7 +247,7 @@ static char isSymbolList64Bit(SymbolicFile *Obj) {
return true;
else if (isa<ELF32BEObjectFile>(Obj))
return false;
- else if(isa<ELF64BEObjectFile>(Obj))
+ else if (isa<ELF64BEObjectFile>(Obj))
return true;
else
return false;
@@ -204,7 +257,164 @@ static StringRef CurrentFilename;
typedef std::vector<NMSymbol> SymbolListT;
static SymbolListT SymbolList;
-static void sortAndPrintSymbolList(SymbolicFile *Obj) {
+// darwinPrintSymbol() is used to print a symbol from a Mach-O file when the
+// OutputFormat is darwin. It produces the same output as darwin's nm(1) -m
+// output.
+static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
+ char *SymbolAddrStr, const char *printBlanks) {
+ MachO::mach_header H;
+ MachO::mach_header_64 H_64;
+ uint32_t Filetype, Flags;
+ MachO::nlist_64 STE_64;
+ MachO::nlist STE;
+ uint8_t NType;
+ uint16_t NDesc;
+ uint64_t NValue;
+ if (MachO->is64Bit()) {
+ H_64 = MachO->MachOObjectFile::getHeader64();
+ Filetype = H_64.filetype;
+ Flags = H_64.flags;
+ STE_64 = MachO->getSymbol64TableEntry(I->Symb);
+ NType = STE_64.n_type;
+ NDesc = STE_64.n_desc;
+ NValue = STE_64.n_value;
+ } else {
+ H = MachO->MachOObjectFile::getHeader();
+ Filetype = H.filetype;
+ Flags = H.flags;
+ STE = MachO->getSymbolTableEntry(I->Symb);
+ NType = STE.n_type;
+ NDesc = STE.n_desc;
+ NValue = STE.n_value;
+ }
+
+ if (PrintAddress) {
+ if ((NType & MachO::N_TYPE) == MachO::N_INDR)
+ strcpy(SymbolAddrStr, printBlanks);
+ outs() << SymbolAddrStr << ' ';
+ }
+
+ switch (NType & MachO::N_TYPE) {
+ case MachO::N_UNDF:
+ if (NValue != 0) {
+ outs() << "(common) ";
+ if (MachO::GET_COMM_ALIGN(NDesc) != 0)
+ outs() << "(alignment 2^" << (int)MachO::GET_COMM_ALIGN(NDesc) << ") ";
+ } else {
+ if ((NType & MachO::N_TYPE) == MachO::N_PBUD)
+ outs() << "(prebound ";
+ else
+ outs() << "(";
+ if ((NDesc & MachO::REFERENCE_TYPE) ==
+ MachO::REFERENCE_FLAG_UNDEFINED_LAZY)
+ outs() << "undefined [lazy bound]) ";
+ else if ((NDesc & MachO::REFERENCE_TYPE) ==
+ MachO::REFERENCE_FLAG_PRIVATE_UNDEFINED_LAZY)
+ outs() << "undefined [private lazy bound]) ";
+ else if ((NDesc & MachO::REFERENCE_TYPE) ==
+ MachO::REFERENCE_FLAG_PRIVATE_UNDEFINED_NON_LAZY)
+ outs() << "undefined [private]) ";
+ else
+ outs() << "undefined) ";
+ }
+ break;
+ case MachO::N_ABS:
+ outs() << "(absolute) ";
+ break;
+ case MachO::N_INDR:
+ outs() << "(indirect) ";
+ break;
+ case MachO::N_SECT: {
+ section_iterator Sec = MachO->section_end();
+ MachO->getSymbolSection(I->Symb, Sec);
+ DataRefImpl Ref = Sec->getRawDataRefImpl();
+ StringRef SectionName;
+ MachO->getSectionName(Ref, SectionName);
+ StringRef SegmentName = MachO->getSectionFinalSegmentName(Ref);
+ outs() << "(" << SegmentName << "," << SectionName << ") ";
+ break;
+ }
+ default:
+ outs() << "(?) ";
+ break;
+ }
+
+ if (NType & MachO::N_EXT) {
+ if (NDesc & MachO::REFERENCED_DYNAMICALLY)
+ outs() << "[referenced dynamically] ";
+ if (NType & MachO::N_PEXT) {
+ if ((NDesc & MachO::N_WEAK_DEF) == MachO::N_WEAK_DEF)
+ outs() << "weak private external ";
+ else
+ outs() << "private external ";
+ } else {
+ if ((NDesc & MachO::N_WEAK_REF) == MachO::N_WEAK_REF ||
+ (NDesc & MachO::N_WEAK_DEF) == MachO::N_WEAK_DEF) {
+ if ((NDesc & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF)) ==
+ (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
+ outs() << "weak external automatically hidden ";
+ else
+ outs() << "weak external ";
+ } else
+ outs() << "external ";
+ }
+ } else {
+ if (NType & MachO::N_PEXT)
+ outs() << "non-external (was a private external) ";
+ else
+ outs() << "non-external ";
+ }
+
+ if (Filetype == MachO::MH_OBJECT &&
+ (NDesc & MachO::N_NO_DEAD_STRIP) == MachO::N_NO_DEAD_STRIP)
+ outs() << "[no dead strip] ";
+
+ if (Filetype == MachO::MH_OBJECT &&
+ ((NType & MachO::N_TYPE) != MachO::N_UNDF) &&
+ (NDesc & MachO::N_SYMBOL_RESOLVER) == MachO::N_SYMBOL_RESOLVER)
+ outs() << "[symbol resolver] ";
+
+ if (Filetype == MachO::MH_OBJECT &&
+ ((NType & MachO::N_TYPE) != MachO::N_UNDF) &&
+ (NDesc & MachO::N_ALT_ENTRY) == MachO::N_ALT_ENTRY)
+ outs() << "[alt entry] ";
+
+ if ((NDesc & MachO::N_ARM_THUMB_DEF) == MachO::N_ARM_THUMB_DEF)
+ outs() << "[Thumb] ";
+
+ if ((NType & MachO::N_TYPE) == MachO::N_INDR) {
+ outs() << I->Name << " (for ";
+ StringRef IndirectName;
+ if (MachO->getIndirectName(I->Symb, IndirectName))
+ outs() << "?)";
+ else
+ outs() << IndirectName << ")";
+ } else
+ outs() << I->Name;
+
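+  // For two-level namespace images, an undefined (or prebound) symbol records
+  // the ordinal of the library it was bound against, so the output can end
+  // with e.g. "_printf (from libSystem)" (illustrative).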
+ if ((Flags & MachO::MH_TWOLEVEL) == MachO::MH_TWOLEVEL &&
+ (((NType & MachO::N_TYPE) == MachO::N_UNDF && NValue == 0) ||
+ (NType & MachO::N_TYPE) == MachO::N_PBUD)) {
+ uint32_t LibraryOrdinal = MachO::GET_LIBRARY_ORDINAL(NDesc);
+ if (LibraryOrdinal != 0) {
+ if (LibraryOrdinal == MachO::EXECUTABLE_ORDINAL)
+ outs() << " (from executable)";
+ else if (LibraryOrdinal == MachO::DYNAMIC_LOOKUP_ORDINAL)
+ outs() << " (dynamically looked up)";
+ else {
+ StringRef LibraryName;
+ if (MachO->getLibraryShortNameByIndex(LibraryOrdinal - 1, LibraryName))
+ outs() << " (from bad library ordinal " << LibraryOrdinal << ")";
+ else
+ outs() << " (from " << LibraryName << ")";
+ }
+ }
+ }
+
+ outs() << "\n";
+}
+
+static void sortAndPrintSymbolList(SymbolicFile *Obj, bool printName) {
if (!NoSort) {
if (NumericSort)
std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolAddress);
@@ -214,9 +424,9 @@ static void sortAndPrintSymbolList(SymbolicFile *Obj) {
std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolName);
}
- if (OutputFormat == posix && MultipleFiles) {
+ if (OutputFormat == posix && MultipleFiles && printName) {
outs() << '\n' << CurrentFilename << ":\n";
- } else if (OutputFormat == bsd && MultipleFiles) {
+ } else if (OutputFormat == bsd && MultipleFiles && printName) {
outs() << "\n" << CurrentFilename << ":\n";
} else if (OutputFormat == sysv) {
outs() << "\n\nSymbols from " << CurrentFilename << ":\n\n"
@@ -241,6 +451,10 @@ static void sortAndPrintSymbolList(SymbolicFile *Obj) {
continue;
if (SizeSort && !PrintAddress && I->Size == UnknownAddressOrSize)
continue;
+ if (JustSymbolName) {
+ outs() << I->Name << "\n";
+ continue;
+ }
char SymbolAddrStr[18] = "";
char SymbolSizeStr[18] = "";
@@ -256,10 +470,16 @@ static void sortAndPrintSymbolList(SymbolicFile *Obj) {
if (I->Size != UnknownAddressOrSize)
format(printFormat, I->Size).print(SymbolSizeStr, sizeof(SymbolSizeStr));
- if (OutputFormat == posix) {
+  // If OutputFormat is darwin and we have a MachOObjectFile, print as
+  // darwin's nm(1) -m output; if OutputFormat is darwin but this is not a
+  // Mach-O object, fall back to the bsd OutputFormat (see below).
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj);
+ if (OutputFormat == darwin && MachO) {
+ darwinPrintSymbol(MachO, I, SymbolAddrStr, printBlanks);
+ } else if (OutputFormat == posix) {
outs() << I->Name << " " << I->TypeChar << " " << SymbolAddrStr
<< SymbolSizeStr << "\n";
- } else if (OutputFormat == bsd) {
+ } else if (OutputFormat == bsd || (OutputFormat == darwin && !MachO)) {
if (PrintAddress)
outs() << SymbolAddrStr << ' ';
if (PrintSize) {
@@ -299,14 +519,14 @@ static char getSymbolNMTypeChar(ELFObjectFile<ELFT> &Obj,
case ELF::SHT_PROGBITS:
case ELF::SHT_DYNAMIC:
switch (ESec->sh_flags) {
- case(ELF::SHF_ALLOC | ELF::SHF_EXECINSTR) :
+ case (ELF::SHF_ALLOC | ELF::SHF_EXECINSTR):
return 't';
- case(ELF::SHF_TLS | ELF::SHF_ALLOC | ELF::SHF_WRITE) :
- case(ELF::SHF_ALLOC | ELF::SHF_WRITE) :
+ case (ELF::SHF_TLS | ELF::SHF_ALLOC | ELF::SHF_WRITE):
+ case (ELF::SHF_ALLOC | ELF::SHF_WRITE):
return 'd';
case ELF::SHF_ALLOC:
- case(ELF::SHF_ALLOC | ELF::SHF_MERGE) :
- case(ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS) :
+ case (ELF::SHF_ALLOC | ELF::SHF_MERGE):
+ case (ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS):
return 'r';
}
break;
@@ -395,6 +615,8 @@ static char getSymbolNMTypeChar(MachOObjectFile &Obj, basic_symbol_iterator I) {
switch (NType & MachO::N_TYPE) {
case MachO::N_ABS:
return 's';
+ case MachO::N_INDR:
+ return 'i';
case MachO::N_SECT: {
section_iterator Sec = Obj.section_end();
Obj.getSymbolSection(Symb, Sec);
@@ -404,6 +626,10 @@ static char getSymbolNMTypeChar(MachOObjectFile &Obj, basic_symbol_iterator I) {
StringRef SegmentName = Obj.getSectionFinalSegmentName(Ref);
if (SegmentName == "__TEXT" && SectionName == "__text")
return 't';
+ else if (SegmentName == "__DATA" && SectionName == "__data")
+ return 'd';
+ else if (SegmentName == "__DATA" && SectionName == "__bss")
+ return 'b';
else
return 's';
}
@@ -413,20 +639,18 @@ static char getSymbolNMTypeChar(MachOObjectFile &Obj, basic_symbol_iterator I) {
}
static char getSymbolNMTypeChar(const GlobalValue &GV) {
- if (isa<Function>(GV))
+ if (GV.getType()->getElementType()->isFunctionTy())
return 't';
// FIXME: should we print 'b'? At the IR level we cannot be sure if this
// will be in bss or not, but we could approximate.
- if (isa<GlobalVariable>(GV))
- return 'd';
- const GlobalAlias *GA = cast<GlobalAlias>(&GV);
- const GlobalValue *AliasedGV = GA->getAliasee();
- return getSymbolNMTypeChar(*AliasedGV);
+ return 'd';
}
static char getSymbolNMTypeChar(IRObjectFile &Obj, basic_symbol_iterator I) {
- const GlobalValue &GV = Obj.getSymbolGV(I->getRawDataRefImpl());
- return getSymbolNMTypeChar(GV);
+ const GlobalValue *GV = Obj.getSymbolGV(I->getRawDataRefImpl());
+ if (!GV)
+ return 't';
+ return getSymbolNMTypeChar(*GV);
}
template <class ELFT>
@@ -490,7 +714,7 @@ static char getNMTypeChar(SymbolicFile *Obj, basic_symbol_iterator I) {
return Ret;
}
-static void dumpSymbolNamesFromObject(SymbolicFile *Obj) {
+static void dumpSymbolNamesFromObject(SymbolicFile *Obj, bool printName) {
basic_symbol_iterator IBegin = Obj->symbol_begin();
basic_symbol_iterator IEnd = Obj->symbol_end();
if (DynamicSyms) {
@@ -511,8 +735,8 @@ static void dumpSymbolNamesFromObject(SymbolicFile *Obj) {
continue;
if (WithoutAliases) {
if (IRObjectFile *IR = dyn_cast<IRObjectFile>(Obj)) {
- const GlobalValue &GV = IR->getSymbolGV(I->getRawDataRefImpl());
- if(isa<GlobalAlias>(GV))
+ const GlobalValue *GV = IR->getSymbolGV(I->getRawDataRefImpl());
+ if (GV && isa<GlobalAlias>(GV))
continue;
}
}
@@ -531,6 +755,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile *Obj) {
if (error(I->printName(OS)))
break;
OS << '\0';
+ S.Symb = I->getRawDataRefImpl();
SymbolList.push_back(S);
}
@@ -542,18 +767,55 @@ static void dumpSymbolNamesFromObject(SymbolicFile *Obj) {
}
CurrentFilename = Obj->getFileName();
- sortAndPrintSymbolList(Obj);
+ sortAndPrintSymbolList(Obj, printName);
+}
+
+// checkMachOAndArchFlags() checks to see if the SymbolicFile is a Mach-O file
+// and, if it is and a list of architecture flags was specified, makes sure
+// this Mach-O file is one of those architectures or that all architectures
+// were specified. If not, an error is generated and this routine returns
+// false. Otherwise it returns true.
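+// For example, "llvm-nm -arch armv7 foo.o" (a hypothetical invocation) would
+// report an error here if foo.o is an x86_64 Mach-O file.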
+static bool checkMachOAndArchFlags(SymbolicFile *O, std::string &Filename) {
+ if (isa<MachOObjectFile>(O) && !ArchAll && ArchFlags.size() != 0) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(O);
+ bool ArchFound = false;
+ MachO::mach_header H;
+ MachO::mach_header_64 H_64;
+ Triple T;
+ if (MachO->is64Bit()) {
+ H_64 = MachO->MachOObjectFile::getHeader64();
+ T = MachOObjectFile::getArch(H_64.cputype, H_64.cpusubtype);
+ } else {
+ H = MachO->MachOObjectFile::getHeader();
+ T = MachOObjectFile::getArch(H.cputype, H.cpusubtype);
+ }
+    for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+      if (ArchFlags[i] == T.getArchName()) {
+        ArchFound = true;
+        break;
+      }
+    }
+    if (!ArchFound) {
+      error(T.getArchName(),
+            "file: " + Filename + " does not contain architecture");
+ return false;
+ }
+ }
+ return true;
}
static void dumpSymbolNamesFromFile(std::string &Filename) {
- std::unique_ptr<MemoryBuffer> Buffer;
- if (error(MemoryBuffer::getFileOrSTDIN(Filename, Buffer), Filename))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (error(BufferOrErr.getError(), Filename))
return;
+ std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
LLVMContext &Context = getGlobalContext();
- ErrorOr<Binary *> BinaryOrErr = createBinary(Buffer.release(), &Context);
+ ErrorOr<Binary *> BinaryOrErr = createBinary(Buffer, &Context);
if (error(BinaryOrErr.getError(), Filename))
return;
+ Buffer.release();
std::unique_ptr<Binary> Bin(BinaryOrErr.get());
if (Archive *A = dyn_cast<Archive>(Bin.get())) {
@@ -563,16 +825,14 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
if (I != E) {
outs() << "Archive map\n";
for (; I != E; ++I) {
- Archive::child_iterator C;
- StringRef SymName;
- StringRef FileName;
- if (error(I->getMember(C)))
+ ErrorOr<Archive::child_iterator> C = I->getMember();
+ if (error(C.getError()))
return;
- if (error(I->getName(SymName)))
+ ErrorOr<StringRef> FileNameOrErr = C.get()->getName();
+ if (error(FileNameOrErr.getError()))
return;
- if (error(C->getName(FileName)))
- return;
- outs() << SymName << " in " << FileName << "\n";
+ StringRef SymName = I->getName();
+ outs() << SymName << " in " << FileNameOrErr.get() << "\n";
}
outs() << "\n";
}
@@ -580,36 +840,145 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
for (Archive::child_iterator I = A->child_begin(), E = A->child_end();
I != E; ++I) {
- std::unique_ptr<Binary> Child;
- if (I->getAsBinary(Child, &Context))
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = I->getAsBinary(&Context);
+ if (ChildOrErr.getError())
continue;
- if (SymbolicFile *O = dyn_cast<SymbolicFile>(Child.get())) {
- outs() << O->getFileName() << ":\n";
- dumpSymbolNamesFromObject(O);
+ if (SymbolicFile *O = dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
+ if (!checkMachOAndArchFlags(O, Filename))
+ return;
+ outs() << "\n";
+ if (isa<MachOObjectFile>(O)) {
+ outs() << Filename << "(" << O->getFileName() << ")";
+ } else
+ outs() << O->getFileName();
+ outs() << ":\n";
+ dumpSymbolNamesFromObject(O, false);
}
}
return;
}
if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin.get())) {
+    // If we have a list of architecture flags specified, dump only those.
+ if (!ArchAll && ArchFlags.size() != 0) {
+ // Look for a slice in the universal binary that matches each ArchFlag.
+ bool ArchFound;
+ for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+ ArchFound = false;
+ for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+ E = UB->end_objects();
+ I != E; ++I) {
+ if (ArchFlags[i] == I->getArchTypeName()) {
+ ArchFound = true;
+ ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+ I->getAsObjectFile();
+ std::unique_ptr<Archive> A;
+ if (ObjOrErr) {
+ std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
+ if (ArchFlags.size() > 1) {
+ outs() << "\n" << Obj->getFileName() << " (for architecture "
+ << I->getArchTypeName() << ")"
+ << ":\n";
+ }
+ dumpSymbolNamesFromObject(Obj.get(), false);
+ } else if (!I->getAsArchive(A)) {
+ for (Archive::child_iterator AI = A->child_begin(),
+ AE = A->child_end();
+ AI != AE; ++AI) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr =
+ AI->getAsBinary(&Context);
+ if (ChildOrErr.getError())
+ continue;
+ if (SymbolicFile *O =
+ dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
+ outs() << "\n" << A->getFileName();
+ outs() << "(" << O->getFileName() << ")";
+ if (ArchFlags.size() > 1) {
+ outs() << " (for architecture " << I->getArchTypeName()
+ << ")";
+ }
+ outs() << ":\n";
+ dumpSymbolNamesFromObject(O, false);
+ }
+ }
+ }
+ }
+ }
+ if (!ArchFound) {
+ error(ArchFlags[i],
+ "file: " + Filename + " does not contain architecture");
+ return;
+ }
+ }
+ return;
+ }
+    // No architecture flags were specified, so if this universal binary
+    // contains a slice that matches the host architecture, dump only that.
+ if (!ArchAll) {
+ StringRef HostArchName = MachOObjectFile::getHostArch().getArchName();
+ for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+ E = UB->end_objects();
+ I != E; ++I) {
+ if (HostArchName == I->getArchTypeName()) {
+ ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
+ std::unique_ptr<Archive> A;
+ if (ObjOrErr) {
+ std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
+ dumpSymbolNamesFromObject(Obj.get(), false);
+ } else if (!I->getAsArchive(A)) {
+ for (Archive::child_iterator AI = A->child_begin(),
+ AE = A->child_end();
+ AI != AE; ++AI) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr =
+ AI->getAsBinary(&Context);
+ if (ChildOrErr.getError())
+ continue;
+ if (SymbolicFile *O =
+ dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
+ outs() << "\n" << A->getFileName() << "(" << O->getFileName()
+ << ")"
+ << ":\n";
+ dumpSymbolNamesFromObject(O, false);
+ }
+ }
+ }
+ return;
+ }
+ }
+ }
+    // Either all architectures were specified, or none were and this binary
+    // does not contain a slice for the host architecture, so dump all slices.
+ bool moreThanOneArch = UB->getNumberOfObjects() > 1;
for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
E = UB->end_objects();
I != E; ++I) {
- std::unique_ptr<ObjectFile> Obj;
+ ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
std::unique_ptr<Archive> A;
- if (!I->getAsObjectFile(Obj)) {
- outs() << Obj->getFileName() << ":\n";
- dumpSymbolNamesFromObject(Obj.get());
- }
- else if (!I->getAsArchive(A)) {
+ if (ObjOrErr) {
+ std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
+ if (moreThanOneArch)
+ outs() << "\n";
+ outs() << Obj->getFileName();
+ if (isa<MachOObjectFile>(Obj.get()) && moreThanOneArch)
+ outs() << " (for architecture " << I->getArchTypeName() << ")";
+ outs() << ":\n";
+ dumpSymbolNamesFromObject(Obj.get(), false);
+ } else if (!I->getAsArchive(A)) {
for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end();
AI != AE; ++AI) {
- std::unique_ptr<Binary> Child;
- if (AI->getAsBinary(Child, &Context))
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr =
+ AI->getAsBinary(&Context);
+ if (ChildOrErr.getError())
continue;
- if (SymbolicFile *O = dyn_cast<SymbolicFile>(Child.get())) {
- outs() << A->getFileName() << ":";
- outs() << O->getFileName() << ":\n";
- dumpSymbolNamesFromObject(O);
+ if (SymbolicFile *O = dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
+ outs() << "\n" << A->getFileName();
+ if (isa<MachOObjectFile>(O)) {
+ outs() << "(" << O->getFileName() << ")";
+ if (moreThanOneArch)
+ outs() << " (for architecture " << I->getArchTypeName() << ")";
+ } else
+ outs() << ":" << O->getFileName();
+ outs() << ":\n";
+ dumpSymbolNamesFromObject(O, false);
}
}
}
@@ -617,7 +986,9 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
return;
}
if (SymbolicFile *O = dyn_cast<SymbolicFile>(Bin.get())) {
- dumpSymbolNamesFromObject(O);
+ if (!checkMachOAndArchFlags(O, Filename))
+ return;
+ dumpSymbolNamesFromObject(O, true);
return;
}
error("unrecognizable file type", Filename);
@@ -636,11 +1007,17 @@ int main(int argc, char **argv) {
if (error(sys::ChangeStdinToBinary()))
return 1;
+ llvm::InitializeAllTargetInfos();
+ llvm::InitializeAllTargetMCs();
+ llvm::InitializeAllAsmParsers();
+
ToolName = argv[0];
if (BSDFormat)
OutputFormat = bsd;
if (POSIXFormat)
OutputFormat = posix;
+ if (DarwinFormat)
+ OutputFormat = darwin;
// The relative order of these is important. If you pass --size-sort it should
// only print out the size. However, if you pass -S --size-sort, it should
@@ -652,13 +1029,24 @@ int main(int argc, char **argv) {
switch (InputFilenames.size()) {
case 0:
- InputFilenames.push_back("-");
+ InputFilenames.push_back("a.out");
case 1:
break;
default:
MultipleFiles = true;
}
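+  // Validate any -arch flags up front. As a hypothetical example,
+  // "llvm-nm -arch armv7 -arch x86_64 fat.dylib" dumps just those two slices,
+  // while "-arch all" dumps every slice.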
+ for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+ if (ArchFlags[i] == "all") {
+ ArchAll = true;
+ } else {
+ Triple T = MachOObjectFile::getArch(ArchFlags[i]);
+ if (T.getArch() == Triple::UnknownArch)
+ error("Unknown architecture named '" + ArchFlags[i] + "'",
+ "for the -arch option");
+ }
+ }
+
std::for_each(InputFilenames.begin(), InputFilenames.end(),
dumpSymbolNamesFromFile);
diff --git a/tools/llvm-objdump/Android.mk b/tools/llvm-objdump/Android.mk
index ea738f4..8105ebf 100644
--- a/tools/llvm-objdump/Android.mk
+++ b/tools/llvm-objdump/Android.mk
@@ -39,15 +39,15 @@ llvm_objdump_STATIC_LIBRARIES := \
libLLVMX86Disassembler \
libLLVMAsmPrinter \
libLLVMTarget \
+ libLLVMObject \
libLLVMMCParser \
+ libLLVMMCAnalysis \
libLLVMMC \
libLLVMMCDisassembler \
- libLLVMObject \
libLLVMBitReader \
libLLVMCore \
libLLVMAsmParser \
libLLVMSupport \
- libLLVMMCDisassembler \
include $(CLEAR_VARS)
diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt
index 413cb9b..d63602b 100644
--- a/tools/llvm-objdump/CMakeLists.txt
+++ b/tools/llvm-objdump/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS
${LLVM_TARGETS_TO_BUILD}
DebugInfo
MC
+ MCAnalysis
Object
Support
)
diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp
index 49f2755..39d8e8e 100644
--- a/tools/llvm-objdump/COFFDump.cpp
+++ b/tools/llvm-objdump/COFFDump.cpp
@@ -22,9 +22,9 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Win64EH.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -157,14 +157,14 @@ static void printAllUnwindCodes(ArrayRef<UnwindCode> UCs) {
}
// Given a symbol sym this functions returns the address and section of it.
-static error_code resolveSectionAndAddress(const COFFObjectFile *Obj,
- const SymbolRef &Sym,
- const coff_section *&ResolvedSection,
- uint64_t &ResolvedAddr) {
- if (error_code EC = Sym.getAddress(ResolvedAddr))
+static std::error_code
+resolveSectionAndAddress(const COFFObjectFile *Obj, const SymbolRef &Sym,
+ const coff_section *&ResolvedSection,
+ uint64_t &ResolvedAddr) {
+ if (std::error_code EC = Sym.getAddress(ResolvedAddr))
return EC;
section_iterator iter(Obj->section_begin());
- if (error_code EC = Sym.getSection(iter))
+ if (std::error_code EC = Sym.getSection(iter))
return EC;
ResolvedSection = Obj->getCOFFSection(*iter);
return object_error::success;
@@ -172,13 +172,13 @@ static error_code resolveSectionAndAddress(const COFFObjectFile *Obj,
// Given a vector of relocations for a section and an offset into this section
// the function returns the symbol used for the relocation at the offset.
-static error_code resolveSymbol(const std::vector<RelocationRef> &Rels,
- uint64_t Offset, SymbolRef &Sym) {
+static std::error_code resolveSymbol(const std::vector<RelocationRef> &Rels,
+ uint64_t Offset, SymbolRef &Sym) {
for (std::vector<RelocationRef>::const_iterator I = Rels.begin(),
E = Rels.end();
I != E; ++I) {
uint64_t Ofs;
- if (error_code EC = I->getOffset(Ofs))
+ if (std::error_code EC = I->getOffset(Ofs))
return EC;
if (Ofs == Offset) {
Sym = *I->getSymbol();
@@ -192,18 +192,17 @@ static error_code resolveSymbol(const std::vector<RelocationRef> &Rels,
// the function resolves the symbol used for the relocation at the offset and
// returns the section content and the address inside the content pointed to
// by the symbol.
-static error_code getSectionContents(const COFFObjectFile *Obj,
- const std::vector<RelocationRef> &Rels,
- uint64_t Offset,
- ArrayRef<uint8_t> &Contents,
- uint64_t &Addr) {
+static std::error_code
+getSectionContents(const COFFObjectFile *Obj,
+ const std::vector<RelocationRef> &Rels, uint64_t Offset,
+ ArrayRef<uint8_t> &Contents, uint64_t &Addr) {
SymbolRef Sym;
- if (error_code EC = resolveSymbol(Rels, Offset, Sym))
+ if (std::error_code EC = resolveSymbol(Rels, Offset, Sym))
return EC;
const coff_section *Section;
- if (error_code EC = resolveSectionAndAddress(Obj, Sym, Section, Addr))
+ if (std::error_code EC = resolveSectionAndAddress(Obj, Sym, Section, Addr))
return EC;
- if (error_code EC = Obj->getSectionContents(Section, Contents))
+ if (std::error_code EC = Obj->getSectionContents(Section, Contents))
return EC;
return object_error::success;
}
@@ -211,12 +210,12 @@ static error_code getSectionContents(const COFFObjectFile *Obj,
// Given a vector of relocations for a section and an offset into this section
// the function returns the name of the symbol used for the relocation at the
// offset.
-static error_code resolveSymbolName(const std::vector<RelocationRef> &Rels,
- uint64_t Offset, StringRef &Name) {
+static std::error_code resolveSymbolName(const std::vector<RelocationRef> &Rels,
+ uint64_t Offset, StringRef &Name) {
SymbolRef Sym;
- if (error_code EC = resolveSymbol(Rels, Offset, Sym))
+ if (std::error_code EC = resolveSymbol(Rels, Offset, Sym))
return EC;
- if (error_code EC = Sym.getName(Name))
+ if (std::error_code EC = Sym.getName(Name))
return EC;
return object_error::success;
}
diff --git a/tools/llvm-objdump/LLVMBuild.txt b/tools/llvm-objdump/LLVMBuild.txt
index d16c501..d9c09b6 100644
--- a/tools/llvm-objdump/LLVMBuild.txt
+++ b/tools/llvm-objdump/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Tool
name = llvm-objdump
parent = Tools
-required_libraries = DebugInfo MC MCDisassembler MCParser Object all-targets
+required_libraries = DebugInfo MC MCAnalysis MCDisassembler MCParser Object all-targets
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 3ca582f..4b46ac4 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -37,9 +37,9 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -195,15 +195,15 @@ static void DisassembleInputMachO2(StringRef Filename,
MachOObjectFile *MachOOF);
void llvm::DisassembleInputMachO(StringRef Filename) {
- std::unique_ptr<MemoryBuffer> Buff;
-
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
- errs() << "llvm-objdump: " << Filename << ": " << ec.message() << "\n";
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buff =
+ MemoryBuffer::getFileOrSTDIN(Filename);
+ if (std::error_code EC = Buff.getError()) {
+ errs() << "llvm-objdump: " << Filename << ": " << EC.message() << "\n";
return;
}
std::unique_ptr<MachOObjectFile> MachOOF(static_cast<MachOObjectFile *>(
- ObjectFile::createMachOObjectFile(Buff.release()).get()));
+ ObjectFile::createMachOObjectFile(Buff.get()).get()));
DisassembleInputMachO2(Filename, MachOOF.get());
}
@@ -288,12 +288,13 @@ static void DisassembleInputMachO2(StringRef Filename,
// A separate DSym file path was specified, parse it as a macho file,
// get the sections and supply it to the section name parsing machinery.
if (!DSYMFile.empty()) {
- std::unique_ptr<MemoryBuffer> Buf;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(DSYMFile, Buf)) {
- errs() << "llvm-objdump: " << Filename << ": " << ec.message() << '\n';
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ MemoryBuffer::getFileOrSTDIN(DSYMFile);
+ if (std::error_code EC = Buf.getError()) {
+ errs() << "llvm-objdump: " << Filename << ": " << EC.message() << '\n';
return;
}
- DbgObj = ObjectFile::createMachOObjectFile(Buf.release()).get();
+ DbgObj = ObjectFile::createMachOObjectFile(Buf.get()).get();
}
// Setup the DIContext
diff --git a/tools/llvm-objdump/Makefile b/tools/llvm-objdump/Makefile
index 4616b78..c3601eb 100644
--- a/tools/llvm-objdump/Makefile
+++ b/tools/llvm-objdump/Makefile
@@ -9,7 +9,7 @@
LEVEL := ../..
TOOLNAME := llvm-objdump
-LINK_COMPONENTS := all-targets DebugInfo MC MCParser MCDisassembler Object
+LINK_COMPONENTS := all-targets DebugInfo MC MCAnalysis MCParser MCDisassembler Object
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index a4fc6d0..309bf23 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -20,17 +20,17 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCFunction.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
+#include "llvm/MC/MCAnalysis/MCModuleYAML.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCModule.h"
-#include "llvm/MC/MCModuleYAML.h"
#include "llvm/MC/MCObjectDisassembler.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectSymbolizer.h"
@@ -57,10 +57,10 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cctype>
#include <cstring>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -148,7 +148,7 @@ YAMLCFG("yaml-cfg",
static StringRef ToolName;
-bool llvm::error(error_code EC) {
+bool llvm::error(std::error_code EC) {
if (!EC)
return false;
@@ -395,7 +395,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
// Create a mapping, RelocSecs = SectionRelocMap[S], where sections
// in RelocSecs contain the relocations for section S.
- error_code EC;
+ std::error_code EC;
std::map<SectionRef, SmallVector<SectionRef, 1>> SectionRelocMap;
for (const SectionRef &Section : Obj->sections()) {
section_iterator Sec2 = Section.getRelocatedSection();
@@ -620,7 +620,7 @@ static void PrintSectionHeaders(const ObjectFile *Obj) {
}
static void PrintSectionContents(const ObjectFile *Obj) {
- error_code EC;
+ std::error_code EC;
for (const SectionRef &Section : Obj->sections()) {
StringRef Name;
StringRef Contents;
@@ -850,15 +850,15 @@ static void DumpObject(const ObjectFile *o) {
static void DumpArchive(const Archive *a) {
for (Archive::child_iterator i = a->child_begin(), e = a->child_end(); i != e;
++i) {
- std::unique_ptr<Binary> child;
- if (error_code EC = i->getAsBinary(child)) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = i->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
// Ignore non-object files.
if (EC != object_error::invalid_file_type)
errs() << ToolName << ": '" << a->getFileName() << "': " << EC.message()
<< ".\n";
continue;
}
- if (ObjectFile *o = dyn_cast<ObjectFile>(child.get()))
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
DumpObject(o);
else
errs() << ToolName << ": '" << a->getFileName() << "': "
@@ -881,7 +881,7 @@ static void DumpInput(StringRef file) {
// Attempt to open the binary.
ErrorOr<Binary *> BinaryOrErr = createBinary(file);
- if (error_code EC = BinaryOrErr.getError()) {
+ if (std::error_code EC = BinaryOrErr.getError()) {
errs() << ToolName << ": '" << file << "': " << EC.message() << ".\n";
return;
}
diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index b716a26..80f8f58 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h
@@ -16,19 +16,17 @@
#include "llvm/Support/StringRefMemoryObject.h"
namespace llvm {
-
namespace object {
class COFFObjectFile;
class ObjectFile;
class RelocationRef;
}
-class error_code;
extern cl::opt<std::string> TripleName;
extern cl::opt<std::string> ArchName;
// Various helper functions.
-bool error(error_code ec);
+bool error(std::error_code ec);
bool RelocAddressLess(object::RelocationRef a, object::RelocationRef b);
void DumpBytes(StringRef bytes);
void DisassembleInputMachO(StringRef Filename);
diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp
index fdde32a..ba88aad 100644
--- a/tools/llvm-profdata/llvm-profdata.cpp
+++ b/tools/llvm-profdata/llvm-profdata.cpp
@@ -56,11 +56,12 @@ int merge_main(int argc, const char *argv[]) {
InstrProfWriter Writer;
for (const auto &Filename : Inputs) {
std::unique_ptr<InstrProfReader> Reader;
- if (error_code ec = InstrProfReader::create(Filename, Reader))
+ if (std::error_code ec = InstrProfReader::create(Filename, Reader))
exitWithError(ec.message(), Filename);
for (const auto &I : *Reader)
- if (error_code EC = Writer.addFunctionCounts(I.Name, I.Hash, I.Counts))
+ if (std::error_code EC =
+ Writer.addFunctionCounts(I.Name, I.Hash, I.Counts))
errs() << Filename << ": " << I.Name << ": " << EC.message() << "\n";
if (Reader->hasError())
exitWithError(Reader->getError().message(), Filename);
@@ -90,7 +91,7 @@ int show_main(int argc, const char *argv[]) {
cl::ParseCommandLineOptions(argc, argv, "LLVM profile data summary\n");
std::unique_ptr<InstrProfReader> Reader;
- if (error_code EC = InstrProfReader::create(Filename, Reader))
+ if (std::error_code EC = InstrProfReader::create(Filename, Reader))
exitWithError(EC.message(), Filename);
if (OutputFilename.empty())
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
new file mode 100644
index 0000000..b486e4a
--- /dev/null
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -0,0 +1,744 @@
+//===-- ARMWinEHPrinter.cpp - Windows on ARM EH Data Printer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Windows on ARM uses a series of serialised data structures (RuntimeFunction)
+// to create a table of information for unwinding. In order to conserve space,
+// there are two different ways that this data is represented.
+//
+// For functions with canonical forms for the prologue and epilogue, the data
+// can be stored in a "packed" form. In this case, the data is packed into the
+// RuntimeFunction's remaining 30 bits and can fully describe the entire frame.
+//
+// +---------------------------------------+
+// | Function Entry Address |
+// +---------------------------------------+
+// | Packed Form Data |
+// +---------------------------------------+
+//
+// This layout is parsed by Decoder::dumpPackedEntry. No unwind bytecode is
+// associated with such a frame, as it can be derived from the provided data.
+// The decoder does not synthesize this data as it is unnecessary for the
+// purposes of validation, with the synthesis being required only by a proper
+// unwinder.
+//
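+// As a rough sketch (field widths per llvm/Support/ARMWinEH.h; illustrative,
+// not normative), the packed word subdivides as:
+//
+//   Flag:2 FunctionLength:11 Ret:2 H:1 Reg:3 R:1 L:1 C:1 StackAdjust:10
+//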
+// For functions that are large or do not match canonical forms, the data is
+// split up into two portions, with the actual data residing in the "exception
+// data" table (.xdata) with a reference to the entry from the "procedure data"
+// (.pdata) entry.
+//
+// The exception data contains information about the frame setup, all of the
+// epilogue scopes (for functions for which there are multiple exit points) and
+// the associated exception handler. Additionally, the entry contains byte-code
+// describing how to unwind the function (c.f. Decoder::decodeOpcodes).
+//
+// +---------------------------------------+
+// | Function Entry Address |
+// +---------------------------------------+
+// | Exception Data Entry Address |
+// +---------------------------------------+
+//
+// This layout is parsed by Decoder::dumpUnpackedEntry. Such an entry must
+// first resolve the exception data entry address. This structure
+// (ExceptionDataRecord) has a variable sized header
+// (c.f. ARM::WinEH::HeaderWords) and encodes most of the same information as
+// the packed form. However, because this information is insufficient to
+// synthesize the unwinding, there is associated unwind bytecode which makes
+// up the bulk of the Decoder.
+//
+// The decoder itself is table-driven, using the first byte to determine the
+// opcode and dispatching to the associated printing routine. The bytecode
+// itself is a variable length instruction encoding that can fully describe the
+// state of the stack and the necessary operations for unwinding to the
+// beginning of the frame.
+//
+// The byte-code maintains a 1-1 instruction mapping, indicating the width of
+// each instruction (Thumb2 instructions are variable length, 16 or 32 bits
+// wide), which allows the program to unwind from any point in the prologue,
+// body, or epilogue of the function.
+
+#include "ARMWinEHPrinter.h"
+#include "Error.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ARMWinEH.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::support;
+
+namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS, const ARM::WinEH::ReturnType &RT) {
+ switch (RT) {
+ case ARM::WinEH::ReturnType::RT_POP:
+ OS << "pop {pc}";
+ break;
+ case ARM::WinEH::ReturnType::RT_B:
+ OS << "b target";
+ break;
+ case ARM::WinEH::ReturnType::RT_BW:
+ OS << "b.w target";
+ break;
+ case ARM::WinEH::ReturnType::RT_NoEpilogue:
+ OS << "(no epilogue)";
+ break;
+ }
+ return OS;
+}
+}
+
+static std::string formatSymbol(StringRef Name, uint64_t Address,
+ uint64_t Offset = 0) {
+ std::string Buffer;
+ raw_string_ostream OS(Buffer);
+
+ if (!Name.empty())
+ OS << Name << " ";
+
+ if (Offset)
+ OS << format("+0x%X (0x%" PRIX64 ")", Offset, Address);
+ else if (!Name.empty())
+ OS << format("(0x%" PRIX64 ")", Address);
+ else
+ OS << format("0x%" PRIX64, Address);
+
+ return OS.str();
+}
+
+namespace llvm {
+namespace ARM {
+namespace WinEH {
+const size_t Decoder::PDataEntrySize = sizeof(RuntimeFunction);
+
+// TODO name the uops more appropriately
+const Decoder::RingEntry Decoder::Ring[] = {
+ { 0x80, 0x00, &Decoder::opcode_0xxxxxxx }, // UOP_STACK_FREE (16-bit)
+ { 0xc0, 0x80, &Decoder::opcode_10Lxxxxx }, // UOP_POP (32-bit)
+ { 0xf0, 0xc0, &Decoder::opcode_1100xxxx }, // UOP_STACK_SAVE (16-bit)
+ { 0xf8, 0xd0, &Decoder::opcode_11010Lxx }, // UOP_POP (16-bit)
+ { 0xf8, 0xd8, &Decoder::opcode_11011Lxx }, // UOP_POP (32-bit)
+ { 0xf8, 0xe0, &Decoder::opcode_11100xxx }, // UOP_VPOP (32-bit)
+ { 0xfc, 0xe8, &Decoder::opcode_111010xx }, // UOP_STACK_FREE (32-bit)
+ { 0xfe, 0xec, &Decoder::opcode_1110110L }, // UOP_POP (16-bit)
+ { 0xff, 0xee, &Decoder::opcode_11101110 }, // UOP_MICROSOFT_SPECIFIC (16-bit)
+ // UOP_PUSH_MACHINE_FRAME
+ // UOP_PUSH_CONTEXT
+ // UOP_PUSH_TRAP_FRAME
+ // UOP_REDZONE_RESTORE_LR
+ { 0xff, 0xef, &Decoder::opcode_11101111 }, // UOP_LDRPC_POSTINC (32-bit)
+ { 0xff, 0xf5, &Decoder::opcode_11110101 }, // UOP_VPOP (32-bit)
+ { 0xff, 0xf6, &Decoder::opcode_11110110 }, // UOP_VPOP (32-bit)
+ { 0xff, 0xf7, &Decoder::opcode_11110111 }, // UOP_STACK_RESTORE (16-bit)
+ { 0xff, 0xf8, &Decoder::opcode_11111000 }, // UOP_STACK_RESTORE (16-bit)
+ { 0xff, 0xf9, &Decoder::opcode_11111001 }, // UOP_STACK_RESTORE (32-bit)
+ { 0xff, 0xfa, &Decoder::opcode_11111010 }, // UOP_STACK_RESTORE (32-bit)
+ { 0xff, 0xfb, &Decoder::opcode_11111011 }, // UOP_NOP (16-bit)
+ { 0xff, 0xfc, &Decoder::opcode_11111100 }, // UOP_NOP (32-bit)
+ { 0xff, 0xfd, &Decoder::opcode_11111101 }, // UOP_NOP (16-bit) / END
+ { 0xff, 0xfe, &Decoder::opcode_11111110 }, // UOP_NOP (32-bit) / END
+ { 0xff, 0xff, &Decoder::opcode_11111111 }, // UOP_END
+};
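+
+// To illustrate the dispatch: an opcode byte B selects the first ring entry
+// for which (B & Mask) == Value. For example, B = 0x14 matches the first
+// entry ((0x14 & 0x80) == 0x00) and opcode_0xxxxxxx prints it as
+// "sub sp, #(20 * 4)" in a prologue.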
+
+void Decoder::printRegisters(const std::pair<uint16_t, uint32_t> &RegisterMask) {
+ static const char * const GPRRegisterNames[16] = {
+ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+ "r11", "ip", "sp", "lr", "pc",
+ };
+
+ const uint16_t GPRMask = std::get<0>(RegisterMask);
+  const uint32_t VFPMask = std::get<1>(RegisterMask);
+
+ OS << '{';
+ bool Comma = false;
+ for (unsigned RI = 0, RE = 11; RI < RE; ++RI) {
+ if (GPRMask & (1 << RI)) {
+ if (Comma)
+ OS << ", ";
+ OS << GPRRegisterNames[RI];
+ Comma = true;
+ }
+ }
+ for (unsigned RI = 0, RE = 32; RI < RE; ++RI) {
+ if (VFPMask & (1 << RI)) {
+ if (Comma)
+ OS << ", ";
+ OS << "d" << unsigned(RI);
+ Comma = true;
+ }
+ }
+ for (unsigned RI = 11, RE = 16; RI < RE; ++RI) {
+ if (GPRMask & (1 << RI)) {
+ if (Comma)
+ OS << ", ";
+ OS << GPRRegisterNames[RI];
+ Comma = true;
+ }
+ }
+ OS << '}';
+}
+
+ErrorOr<object::SectionRef>
+Decoder::getSectionContaining(const COFFObjectFile &COFF, uint64_t VA) {
+ for (const auto &Section : COFF.sections()) {
+ uint64_t Address;
+ uint64_t Size;
+
+ if (std::error_code EC = Section.getAddress(Address))
+ return EC;
+ if (std::error_code EC = Section.getSize(Size))
+ return EC;
+
+    if (VA >= Address && (VA - Address) < Size)
+ return Section;
+ }
+ return readobj_error::unknown_symbol;
+}
+
+ErrorOr<object::SymbolRef> Decoder::getSymbol(const COFFObjectFile &COFF,
+ uint64_t VA, bool FunctionOnly) {
+ for (const auto &Symbol : COFF.symbols()) {
+ if (FunctionOnly) {
+ SymbolRef::Type Type;
+ if (std::error_code EC = Symbol.getType(Type))
+ return EC;
+ if (Type != SymbolRef::ST_Function)
+ continue;
+ }
+
+ uint64_t Address;
+ if (std::error_code EC = Symbol.getAddress(Address))
+ return EC;
+ if (Address == VA)
+ return Symbol;
+ }
+ return readobj_error::unknown_symbol;
+}
+
+ErrorOr<SymbolRef> Decoder::getRelocatedSymbol(const COFFObjectFile &,
+ const SectionRef &Section,
+ uint64_t Offset) {
+ for (const auto &Relocation : Section.relocations()) {
+ uint64_t RelocationOffset;
+ if (auto Error = Relocation.getOffset(RelocationOffset))
+ return Error;
+ if (RelocationOffset == Offset)
+ return *Relocation.getSymbol();
+ }
+ return readobj_error::unknown_symbol;
+}
+
+bool Decoder::opcode_0xxxxxxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint8_t Imm = OC[Offset] & 0x7f;
+ SW.startLine() << format("0x%02x ; %s sp, #(%u * 4)\n",
+ OC[Offset],
+ static_cast<const char *>(Prologue ? "sub" : "add"),
+ Imm);
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_10Lxxxxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ unsigned Link = (OC[Offset] & 0x20) >> 5;
+ uint16_t RegisterMask = (Link << (Prologue ? 14 : 15))
+ | ((OC[Offset + 0] & 0x1f) << 8)
+ | ((OC[Offset + 1] & 0xff) << 0);
+ assert((~RegisterMask & (1 << 13)) && "sp must not be set");
+ assert((~RegisterMask & (1 << (Prologue ? 15 : 14))) && "pc must not be set");
+
+ SW.startLine() << format("0x%02x 0x%02x ; %s.w ",
+ OC[Offset + 0], OC[Offset + 1],
+ Prologue ? "push" : "pop");
+ printRegisters(std::make_pair(RegisterMask, 0));
+ OS << '\n';
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_1100xxxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ if (Prologue)
+ SW.startLine() << format("0x%02x ; mov r%u, sp\n",
+ OC[Offset], OC[Offset] & 0xf);
+ else
+ SW.startLine() << format("0x%02x ; mov sp, r%u\n",
+ OC[Offset], OC[Offset] & 0xf);
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11010Lxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+  unsigned Link = (OC[Offset] & 0x4) >> 2;
+ unsigned Count = (OC[Offset] & 0x3);
+
+ uint16_t GPRMask = (Link << (Prologue ? 14 : 15))
+ | (((1 << (Count + 1)) - 1) << 4);
+
+ SW.startLine() << format("0x%02x ; %s ", OC[Offset],
+ Prologue ? "push" : "pop");
+ printRegisters(std::make_pair(GPRMask, 0));
+ OS << '\n';
+
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11011Lxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ unsigned Link = (OC[Offset] & 0x4) >> 2;
+ unsigned Count = (OC[Offset] & 0x3) + 4;
+
+ uint16_t GPRMask = (Link << (Prologue ? 14 : 15))
+ | (((1 << (Count + 1)) - 1) << 4);
+
+ SW.startLine() << format("0x%02x ; %s.w ", OC[Offset],
+ Prologue ? "push" : "pop");
+ printRegisters(std::make_pair(GPRMask, 0));
+ OS << '\n';
+
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11100xxx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ unsigned High = (OC[Offset] & 0x7);
+ uint32_t VFPMask = (((1 << (High + 1)) - 1) << 8);
+
+ SW.startLine() << format("0x%02x ; %s ", OC[Offset],
+ Prologue ? "vpush" : "vpop");
+ printRegisters(std::make_pair(0, VFPMask));
+ OS << '\n';
+
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_111010xx(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint16_t Imm = ((OC[Offset + 0] & 0x03) << 8) | ((OC[Offset + 1] & 0xff) << 0);
+
+ SW.startLine() << format("0x%02x 0x%02x ; %s.w sp, #(%u * 4)\n",
+ OC[Offset + 0], OC[Offset + 1],
+ static_cast<const char *>(Prologue ? "sub" : "add"),
+ Imm);
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_1110110L(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+  uint16_t GPRMask = ((OC[Offset + 0] & 0x01) << (Prologue ? 14 : 15))
+                   | ((OC[Offset + 1] & 0xff) << 0);
+
+ SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0],
+ OC[Offset + 1], Prologue ? "push" : "pop");
+ printRegisters(std::make_pair(GPRMask, 0));
+ OS << '\n';
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11101110(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ assert(!Prologue && "may not be used in prologue");
+
+ if (OC[Offset + 1] & 0xf0)
+ SW.startLine() << format("0x%02x 0x%02x ; reserved\n",
+ OC[Offset + 0], OC[Offset + 1]);
+ else
+ SW.startLine()
+ << format("0x%02x 0x%02x ; microsoft-specific (type: %u)\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 1] & 0x0f);
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11101111(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ assert(!Prologue && "may not be used in prologue");
+
+ if (OC[Offset + 1] & 0xf0)
+ SW.startLine() << format("0x%02x 0x%02x ; reserved\n",
+ OC[Offset + 0], OC[Offset + 1]);
+ else
+ SW.startLine()
+ << format("0x%02x 0x%02x ; ldr.w lr, [sp], #%u\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 1] << 2);
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11110101(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ unsigned Start = (OC[Offset + 1] & 0xf0) >> 4;
+ unsigned End = (OC[Offset + 1] & 0x0f) >> 0;
+  uint32_t VFPMask = ((1 << (End + 1 - Start)) - 1) << Start;
+
+ SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0],
+ OC[Offset + 1], Prologue ? "vpush" : "vpop");
+ printRegisters(std::make_pair(0, VFPMask));
+ OS << '\n';
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11110110(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ unsigned Start = (OC[Offset + 1] & 0xf0) >> 4;
+ unsigned End = (OC[Offset + 1] & 0x0f) >> 0;
+  uint32_t VFPMask = ((1 << (End + 1 - Start)) - 1) << (16 + Start);
+
+ SW.startLine() << format("0x%02x 0x%02x ; %s ", OC[Offset + 0],
+ OC[Offset + 1], Prologue ? "vpush" : "vpop");
+ printRegisters(std::make_pair(0, VFPMask));
+ OS << '\n';
+
+ ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11110111(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint32_t Imm = (OC[Offset + 1] << 8) | (OC[Offset + 2] << 0);
+
+ SW.startLine() << format("0x%02x 0x%02x 0x%02x ; %s sp, sp, #(%u * 4)\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 2],
+ static_cast<const char *>(Prologue ? "sub" : "add"),
+ Imm);
+
+ ++Offset, ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111000(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint32_t Imm = (OC[Offset + 1] << 16)
+ | (OC[Offset + 2] << 8)
+ | (OC[Offset + 3] << 0);
+
+ SW.startLine()
+ << format("0x%02x 0x%02x 0x%02x 0x%02x ; %s sp, sp, #(%u * 4)\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 2], OC[Offset + 3],
+ static_cast<const char *>(Prologue ? "sub" : "add"), Imm);
+
+ ++Offset, ++Offset, ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111001(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint32_t Imm = (OC[Offset + 1] << 8) | (OC[Offset + 2] << 0);
+
+ SW.startLine()
+ << format("0x%02x 0x%02x 0x%02x ; %s.w sp, sp, #(%u * 4)\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 2],
+ static_cast<const char *>(Prologue ? "sub" : "add"), Imm);
+
+ ++Offset, ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111010(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ uint32_t Imm = (OC[Offset + 1] << 16)
+ | (OC[Offset + 2] << 8)
+ | (OC[Offset + 3] << 0);
+
+ SW.startLine()
+ << format("0x%02x 0x%02x 0x%02x 0x%02x ; %s.w sp, sp, #(%u * 4)\n",
+ OC[Offset + 0], OC[Offset + 1], OC[Offset + 2], OC[Offset + 3],
+ static_cast<const char *>(Prologue ? "sub" : "add"), Imm);
+
+ ++Offset, ++Offset, ++Offset, ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111011(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ SW.startLine() << format("0x%02x ; nop\n", OC[Offset]);
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111100(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ SW.startLine() << format("0x%02x ; nop.w\n", OC[Offset]);
+ ++Offset;
+ return false;
+}
+
+bool Decoder::opcode_11111101(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ SW.startLine() << format("0x%02x ; b\n", OC[Offset]);
+ ++Offset;
+ return true;
+}
+
+bool Decoder::opcode_11111110(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ SW.startLine() << format("0x%02x ; b.w\n", OC[Offset]);
+ ++Offset;
+ return true;
+}
+
+bool Decoder::opcode_11111111(const ulittle8_t *OC, unsigned &Offset,
+ unsigned Length, bool Prologue) {
+ ++Offset;
+ return true;
+}
+
+void Decoder::decodeOpcodes(ArrayRef<ulittle8_t> Opcodes, unsigned Offset,
+ bool Prologue) {
+ assert((!Prologue || Offset == 0) && "prologue should always use offset 0");
+
+ bool Terminated = false;
+ for (unsigned OI = Offset, OE = Opcodes.size(); !Terminated && OI < OE; ) {
+    for (unsigned DI = 0;; ++DI) {
+      assert(DI < array_lengthof(Ring) && "unhandled opcode");
+      if ((Opcodes[OI] & Ring[DI].Mask) == Ring[DI].Value) {
+        Terminated = (this->*Ring[DI].Routine)(Opcodes.data(), OI, 0, Prologue);
+        break;
+      }
+ }
+ }
+}
+
+bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
+ const SectionRef &Section,
+ uint64_t FunctionAddress, uint64_t VA) {
+ ArrayRef<uint8_t> Contents;
+ if (COFF.getSectionContents(COFF.getCOFFSection(Section), Contents))
+ return false;
+
+ uint64_t SectionVA;
+ if (Section.getAddress(SectionVA))
+ return false;
+
+ uint64_t Offset = VA - SectionVA;
+ const ulittle32_t *Data =
+ reinterpret_cast<const ulittle32_t *>(Contents.data() + Offset);
+ const ExceptionDataRecord XData(Data);
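+  // For reference (per the Windows on ARM EH documentation), the first header
+  // word packs: FunctionLength:18 Vers:2 X:1 E:1 F:1 EpilogueCount:5
+  // CodeWords:4; a second header word carries wider EpilogueCount/CodeWords
+  // fields when these overflow (c.f. ARM::WinEH::HeaderWords).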
+
+ DictScope XRS(SW, "ExceptionData");
+ SW.printNumber("FunctionLength", XData.FunctionLength() << 1);
+ SW.printNumber("Version", XData.Vers());
+ SW.printBoolean("ExceptionData", XData.X());
+ SW.printBoolean("EpiloguePacked", XData.E());
+ SW.printBoolean("Fragment", XData.F());
+ SW.printNumber(XData.E() ? "EpilogueOffset" : "EpilogueScopes",
+ XData.EpilogueCount());
+ SW.printNumber("ByteCodeLength",
+ static_cast<uint64_t>(XData.CodeWords() * sizeof(uint32_t)));
+
+ if (XData.E()) {
+ ArrayRef<ulittle8_t> UC = XData.UnwindByteCode();
+ if (!XData.F()) {
+ ListScope PS(SW, "Prologue");
+ decodeOpcodes(UC, 0, /*Prologue=*/true);
+ }
+ if (XData.EpilogueCount()) {
+ ListScope ES(SW, "Epilogue");
+ decodeOpcodes(UC, XData.EpilogueCount(), /*Prologue=*/false);
+ }
+ } else {
+ ArrayRef<ulittle32_t> EpilogueScopes = XData.EpilogueScopes();
+ ListScope ESS(SW, "EpilogueScopes");
+ for (const EpilogueScope ES : EpilogueScopes) {
+ DictScope ESES(SW, "EpilogueScope");
+ SW.printNumber("StartOffset", ES.EpilogueStartOffset());
+ SW.printNumber("Condition", ES.Condition());
+ SW.printNumber("EpilogueStartIndex", ES.EpilogueStartIndex());
+
+ ListScope Opcodes(SW, "Opcodes");
+ decodeOpcodes(XData.UnwindByteCode(), ES.EpilogueStartIndex(),
+ /*Prologue=*/false);
+ }
+ }
+
+ if (XData.X()) {
+ const uint32_t Address = XData.ExceptionHandlerRVA();
+ const uint32_t Parameter = XData.ExceptionHandlerParameter();
+ const size_t HandlerOffset = HeaderWords(XData)
+ + (XData.E() ? 0 : XData.EpilogueCount())
+ + XData.CodeWords();
+
+ ErrorOr<SymbolRef> Symbol =
+ getRelocatedSymbol(COFF, Section, HandlerOffset * sizeof(uint32_t));
+ if (!Symbol)
+ Symbol = getSymbol(COFF, Address, /*FunctionOnly=*/true);
+
+ StringRef Name;
+ if (Symbol)
+ Symbol->getName(Name);
+
+ ListScope EHS(SW, "ExceptionHandler");
+ SW.printString("Routine", formatSymbol(Name, Address));
+ SW.printHex("Parameter", Parameter);
+ }
+
+ return true;
+}
+
+bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
+ const SectionRef Section, uint64_t Offset,
+ unsigned Index, const RuntimeFunction &RF) {
+ assert(RF.Flag() == RuntimeFunctionFlag::RFF_Unpacked &&
+ "packed entry cannot be treated as an unpacked entry");
+
+ ErrorOr<SymbolRef> Function = getRelocatedSymbol(COFF, Section, Offset);
+ if (!Function)
+ Function = getSymbol(COFF, RF.BeginAddress, /*FunctionOnly=*/true);
+
+ ErrorOr<SymbolRef> XDataRecord = getRelocatedSymbol(COFF, Section, Offset + 4);
+ if (!XDataRecord)
+ XDataRecord = getSymbol(COFF, RF.ExceptionInformationRVA());
+
+ if (!RF.BeginAddress && !Function)
+ return false;
+ if (!RF.UnwindData && !XDataRecord)
+ return false;
+
+ StringRef FunctionName;
+ uint64_t FunctionAddress;
+ if (Function) {
+ Function->getName(FunctionName);
+ Function->getAddress(FunctionAddress);
+ } else {
+ const pe32_header *PEHeader;
+ if (COFF.getPE32Header(PEHeader))
+ return false;
+ FunctionAddress = PEHeader->ImageBase + RF.BeginAddress;
+ }
+
+ SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
+
+ if (XDataRecord) {
+ StringRef Name;
+ uint64_t Address;
+
+ XDataRecord->getName(Name);
+ XDataRecord->getAddress(Address);
+
+ SW.printString("ExceptionRecord", formatSymbol(Name, Address));
+
+ section_iterator SI = COFF.section_end();
+ if (XDataRecord->getSection(SI))
+ return false;
+
+ return dumpXDataRecord(COFF, *SI, FunctionAddress, Address);
+ } else {
+ const pe32_header *PEHeader;
+ if (COFF.getPE32Header(PEHeader))
+ return false;
+
+ uint64_t Address = PEHeader->ImageBase + RF.ExceptionInformationRVA();
+ SW.printString("ExceptionRecord", formatSymbol("", Address));
+
+ ErrorOr<SectionRef> Section =
+ getSectionContaining(COFF, RF.ExceptionInformationRVA());
+ if (!Section)
+ return false;
+
+ return dumpXDataRecord(COFF, *Section, FunctionAddress,
+ RF.ExceptionInformationRVA());
+ }
+}
+
+bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
+ const SectionRef Section, uint64_t Offset,
+ unsigned Index, const RuntimeFunction &RF) {
+ assert((RF.Flag() == RuntimeFunctionFlag::RFF_Packed ||
+ RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment) &&
+ "unpacked entry cannot be treated as a packed entry");
+
+ ErrorOr<SymbolRef> Function = getRelocatedSymbol(COFF, Section, Offset);
+ if (!Function)
+ Function = getSymbol(COFF, RF.BeginAddress, /*FunctionOnly=*/true);
+
+ StringRef FunctionName;
+ uint64_t FunctionAddress;
+ if (Function) {
+ Function->getName(FunctionName);
+ Function->getAddress(FunctionAddress);
+ } else {
+ const pe32_header *PEHeader;
+ if (COFF.getPE32Header(PEHeader))
+ return false;
+ FunctionAddress = PEHeader->ImageBase + RF.BeginAddress;
+ }
+
+ SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
+ SW.printBoolean("Fragment",
+ RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment);
+ SW.printNumber("FunctionLength", RF.FunctionLength());
+ SW.startLine() << "ReturnType: " << RF.Ret() << '\n';
+ SW.printBoolean("HomedParameters", RF.H());
+ SW.startLine() << "SavedRegisters: ";
+ printRegisters(SavedRegisterMask(RF));
+ OS << '\n';
+ SW.printNumber("StackAdjustment", StackAdjustment(RF) << 2);
+
+ return true;
+}
+
+bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF,
+ const SectionRef Section, unsigned Index,
+ ArrayRef<uint8_t> Contents) {
+ uint64_t Offset = PDataEntrySize * Index;
+ const ulittle32_t *Data =
+ reinterpret_cast<const ulittle32_t *>(Contents.data() + Offset);
+
+ const RuntimeFunction Entry(Data);
+ DictScope RFS(SW, "RuntimeFunction");
+ if (Entry.Flag() == RuntimeFunctionFlag::RFF_Unpacked)
+ return dumpUnpackedEntry(COFF, Section, Offset, Index, Entry);
+ return dumpPackedEntry(COFF, Section, Offset, Index, Entry);
+}
+
+void Decoder::dumpProcedureData(const COFFObjectFile &COFF,
+ const SectionRef Section) {
+ ArrayRef<uint8_t> Contents;
+ if (COFF.getSectionContents(COFF.getCOFFSection(Section), Contents))
+ return;
+
+ if (Contents.size() % PDataEntrySize) {
+ errs() << ".pdata content is not " << PDataEntrySize << "-byte aligned\n";
+ return;
+ }
+
+ for (unsigned EI = 0, EE = Contents.size() / PDataEntrySize; EI < EE; ++EI)
+ if (!dumpProcedureDataEntry(COFF, Section, EI, Contents))
+ break;
+}
+
+std::error_code Decoder::dumpProcedureData(const COFFObjectFile &COFF) {
+ for (const auto &Section : COFF.sections()) {
+ StringRef SectionName;
+ if (std::error_code EC =
+ COFF.getSectionName(COFF.getCOFFSection(Section), SectionName))
+ return EC;
+
+ if (SectionName.startswith(".pdata"))
+ dumpProcedureData(COFF, Section);
+ }
+ return std::error_code();
+}
+}
+}
+}
+
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.h b/tools/llvm-readobj/ARMWinEHPrinter.h
new file mode 100644
index 0000000..740c8b5
--- /dev/null
+++ b/tools/llvm-readobj/ARMWinEHPrinter.h
@@ -0,0 +1,119 @@
+//===--- ARMWinEHPrinter.h - Windows on ARM Unwind Information Printer ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_READOBJ_ARMWINEHPRINTER_H
+#define LLVM_READOBJ_ARMWINEHPRINTER_H
+
+#include "StreamWriter.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/ErrorOr.h"
+
+namespace llvm {
+namespace ARM {
+namespace WinEH {
+class RuntimeFunction;
+
+class Decoder {
+ static const size_t PDataEntrySize;
+
+ StreamWriter &SW;
+ raw_ostream &OS;
+
+ struct RingEntry {
+ uint8_t Mask;
+ uint8_t Value;
+ bool (Decoder::*Routine)(const support::ulittle8_t *, unsigned &, unsigned,
+ bool);
+ };
+ static const RingEntry Ring[];
+
+ bool opcode_0xxxxxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_10Lxxxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_1100xxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11010Lxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11011Lxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11100xxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_111010xx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_1110110L(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11101110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11101111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11110101(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11110110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11110111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111000(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111001(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111010(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111011(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111100(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111101(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+ bool opcode_11111111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+ unsigned Length, bool Prologue);
+
+ void decodeOpcodes(ArrayRef<support::ulittle8_t> Opcodes, unsigned Offset,
+ bool Prologue);
+
+ void printRegisters(const std::pair<uint16_t, uint32_t> &RegisterMask);
+
+ ErrorOr<object::SectionRef>
+ getSectionContaining(const object::COFFObjectFile &COFF, uint64_t Address);
+
+ ErrorOr<object::SymbolRef>
+ getSymbol(const object::COFFObjectFile &COFF, uint64_t Address,
+ bool FunctionOnly = false);
+
+ ErrorOr<object::SymbolRef>
+ getRelocatedSymbol(const object::COFFObjectFile &COFF,
+ const object::SectionRef &Section, uint64_t Offset);
+
+ bool dumpXDataRecord(const object::COFFObjectFile &COFF,
+ const object::SectionRef &Section,
+ uint64_t FunctionAddress, uint64_t VA);
+ bool dumpUnpackedEntry(const object::COFFObjectFile &COFF,
+ const object::SectionRef Section, uint64_t Offset,
+ unsigned Index, const RuntimeFunction &Entry);
+ bool dumpPackedEntry(const object::COFFObjectFile &COFF,
+ const object::SectionRef Section, uint64_t Offset,
+ unsigned Index, const RuntimeFunction &Entry);
+ bool dumpProcedureDataEntry(const object::COFFObjectFile &COFF,
+ const object::SectionRef Section, unsigned Entry,
+ ArrayRef<uint8_t> Contents);
+ void dumpProcedureData(const object::COFFObjectFile &COFF,
+ const object::SectionRef Section);
+
+public:
+ Decoder(StreamWriter &SW) : SW(SW), OS(SW.getOStream()) {}
+ std::error_code dumpProcedureData(const object::COFFObjectFile &COFF);
+};
+}
+}
+}
+
+#endif
+
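For readers following the decoder interface above: each RingEntry pairs a bit mask and an expected value with a member-function handler, and the decoder walks the Ring until the first entry whose mask/value matches the current opcode byte, then dispatches to the matching opcode_* routine. Below is a minimal standalone C++ sketch of that classification step; the table entries and label strings are illustrative only and are not part of this patch.

#include <cstdint>
#include <cstdio>

namespace {
// Same shape as Decoder::RingEntry, minus the member-function pointer.
struct DemoRingEntry {
  uint8_t Mask;      // bits that must match
  uint8_t Value;     // required pattern of the masked bits
  const char *Label; // stand-in for the handler routine
};

// First-match-wins classification of the leading opcode byte.
const DemoRingEntry DemoRing[] = {
    {0x80, 0x00, "opcode_0xxxxxxx"}, // e.g. a small stack allocation
    {0xc0, 0x80, "opcode_10Lxxxxx"}, // e.g. a pop of a register mask
    {0xf0, 0xc0, "opcode_1100xxxx"},
};

const char *classify(uint8_t Opcode) {
  for (const auto &E : DemoRing)
    if ((Opcode & E.Mask) == E.Value)
      return E.Label;
  return "unknown";
}
} // namespace

int main() {
  std::printf("0x04 -> %s\n", classify(0x04)); // 0xxxxxxx family
  std::printf("0x9f -> %s\n", classify(0x9f)); // 10Lxxxxx family
  return 0;
}

In the real Decoder the matched entry's Routine member is invoked instead, advancing Offset as each handler consumes its operand bytes.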
diff --git a/tools/llvm-readobj/Android.mk b/tools/llvm-readobj/Android.mk
index 10c99db..219e6a9 100644
--- a/tools/llvm-readobj/Android.mk
+++ b/tools/llvm-readobj/Android.mk
@@ -9,6 +9,7 @@ LLVM_ROOT_PATH := $(LOCAL_PATH)/../..
llvm_readobj_SRC_FILES := \
ARMAttributeParser.cpp \
+ ARMWinEHPrinter.cpp \
COFFDumper.cpp \
ELFDumper.cpp \
Error.cpp \
@@ -25,6 +26,8 @@ llvm_readobj_STATIC_LIBRARIES := \
libLLVMX86Info \
libLLVMObject \
libLLVMBitReader \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt
index b057dcd..30f336f 100644
--- a/tools/llvm-readobj/CMakeLists.txt
+++ b/tools/llvm-readobj/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_tool(llvm-readobj
ARMAttributeParser.cpp
+ ARMWinEHPrinter.cpp
COFFDumper.cpp
ELFDumper.cpp
Error.cpp
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 91f2a57..7842cd4 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm-readobj.h"
+#include "ARMWinEHPrinter.h"
#include "Error.h"
#include "ObjDumper.h"
#include "StreamWriter.h"
@@ -29,9 +30,9 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Win64EH.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
+#include <system_error>
#include <time.h>
using namespace llvm;
@@ -68,10 +69,10 @@ private:
void cacheRelocations();
- error_code resolveSymbol(const coff_section *Section, uint64_t Offset,
- SymbolRef &Sym);
- error_code resolveSymbolName(const coff_section *Section, uint64_t Offset,
- StringRef &Name);
+ std::error_code resolveSymbol(const coff_section *Section, uint64_t Offset,
+ SymbolRef &Sym);
+ std::error_code resolveSymbolName(const coff_section *Section,
+ uint64_t Offset, StringRef &Name);
typedef DenseMap<const coff_section*, std::vector<RelocationRef> > RelocMapTy;
@@ -84,8 +85,9 @@ private:
namespace llvm {
-error_code createCOFFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result) {
+std::error_code createCOFFDumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result) {
const COFFObjectFile *COFFObj = dyn_cast<COFFObjectFile>(Obj);
if (!COFFObj)
return readobj_error::unsupported_obj_file_format;
@@ -98,12 +100,12 @@ error_code createCOFFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
// Given a section and an offset into this section the function returns the
// symbol used for the relocation at the offset.
-error_code COFFDumper::resolveSymbol(const coff_section *Section,
- uint64_t Offset, SymbolRef &Sym) {
+std::error_code COFFDumper::resolveSymbol(const coff_section *Section,
+ uint64_t Offset, SymbolRef &Sym) {
const auto &Relocations = RelocMap[Section];
for (const auto &Relocation : Relocations) {
uint64_t RelocationOffset;
- if (error_code EC = Relocation.getOffset(RelocationOffset))
+ if (std::error_code EC = Relocation.getOffset(RelocationOffset))
return EC;
if (RelocationOffset == Offset) {
@@ -116,12 +118,13 @@ error_code COFFDumper::resolveSymbol(const coff_section *Section,
// Given a section and an offset into this section the function returns the name
// of the symbol used for the relocation at the offset.
-error_code COFFDumper::resolveSymbolName(const coff_section *Section,
- uint64_t Offset, StringRef &Name) {
+std::error_code COFFDumper::resolveSymbolName(const coff_section *Section,
+ uint64_t Offset,
+ StringRef &Name) {
SymbolRef Symbol;
- if (error_code EC = resolveSymbol(Section, Offset, Symbol))
+ if (std::error_code EC = resolveSymbol(Section, Offset, Symbol))
return EC;
- if (error_code EC = Symbol.getName(Name))
+ if (std::error_code EC = Symbol.getName(Name))
return EC;
return object_error::success;
}
@@ -190,7 +193,9 @@ static const EnumEntry<COFF::DLLCharacteristics> PEDLLCharacteristics[] = {
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_NO_SEH ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_NO_BIND ),
+ LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_APPCONTAINER ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER ),
+ LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_GUARD_CF ),
LLVM_READOBJ_ENUM_ENT(COFF, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE),
};
@@ -306,9 +311,10 @@ WeakExternalCharacteristics[] = {
{ "Alias" , COFF::IMAGE_WEAK_EXTERN_SEARCH_ALIAS }
};
-template<typename T>
-static error_code getSymbolAuxData(const COFFObjectFile *Obj,
- const coff_symbol *Symbol, const T* &Aux) {
+template <typename T>
+static std::error_code getSymbolAuxData(const COFFObjectFile *Obj,
+ const coff_symbol *Symbol,
+ const T *&Aux) {
ArrayRef<uint8_t> AuxData = Obj->getSymbolAuxData(Symbol);
Aux = reinterpret_cast<const T*>(AuxData.data());
return readobj_error::success;
@@ -718,7 +724,7 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) {
const coff_symbol *Symbol = Obj->getCOFFSymbol(Sym);
const coff_section *Section;
- if (error_code EC = Obj->getSection(Symbol->SectionNumber, Section)) {
+ if (std::error_code EC = Obj->getSection(Symbol->SectionNumber, Section)) {
W.startLine() << "Invalid section number: " << EC.message() << "\n";
W.flush();
return;
@@ -762,7 +768,7 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) {
const coff_symbol *Linked;
StringRef LinkedName;
- error_code EC;
+ std::error_code EC;
if ((EC = Obj->getSymbol(Aux->TagIndex, Linked)) ||
(EC = Obj->getSymbolName(Linked, LinkedName))) {
LinkedName = "";
@@ -804,7 +810,7 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) {
&& Aux->Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
const coff_section *Assoc;
StringRef AssocName;
- error_code EC;
+ std::error_code EC;
if ((EC = Obj->getSection(Aux->Number, Assoc)) ||
(EC = Obj->getSectionName(Assoc, AssocName))) {
AssocName = "";
@@ -820,7 +826,7 @@ void COFFDumper::printSymbol(const SymbolRef &Sym) {
const coff_symbol *ReferredSym;
StringRef ReferredName;
- error_code EC;
+ std::error_code EC;
if ((EC = Obj->getSymbol(Aux->SymbolTableIndex, ReferredSym)) ||
(EC = Obj->getSymbolName(ReferredSym, ReferredName))) {
ReferredName = "";
@@ -848,16 +854,21 @@ void COFFDumper::printUnwindInfo() {
switch (Header->Machine) {
case COFF::IMAGE_FILE_MACHINE_AMD64: {
Win64EH::Dumper Dumper(W);
- Win64EH::Dumper::SymbolResolver Resolver =
- [](const object::coff_section *Section, uint64_t Offset,
- SymbolRef &Symbol, void *user_data) -> error_code {
- COFFDumper *Dumper = reinterpret_cast<COFFDumper*>(user_data);
- return Dumper->resolveSymbol(Section, Offset, Symbol);
- };
+ Win64EH::Dumper::SymbolResolver
+ Resolver = [](const object::coff_section *Section, uint64_t Offset,
+ SymbolRef &Symbol, void *user_data) -> std::error_code {
+ COFFDumper *Dumper = reinterpret_cast<COFFDumper *>(user_data);
+ return Dumper->resolveSymbol(Section, Offset, Symbol);
+ };
Win64EH::Dumper::Context Ctx(*Obj, Resolver, this);
Dumper.printData(Ctx);
break;
}
+ case COFF::IMAGE_FILE_MACHINE_ARMNT: {
+ ARM::WinEH::Decoder Decoder(W);
+ Decoder.dumpProcedureData(*Obj);
+ break;
+ }
default:
W.printEnum("unsupported Image Machine", Header->Machine,
makeArrayRef(ImageFileMachineType));
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index de4c207..5df51e2 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -18,6 +18,7 @@
#include "Error.h"
#include "ObjDumper.h"
#include "StreamWriter.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Object/ELFObjectFile.h"
@@ -54,6 +55,7 @@ public:
void printProgramHeaders() override;
void printAttributes() override;
+ void printMipsPLTGOT() override;
private:
typedef ELFFile<ELFT> ELFO;
@@ -81,15 +83,16 @@ template <class T> T errorOrDefault(ErrorOr<T> Val, T Default = T()) {
namespace llvm {
template <class ELFT>
-static error_code createELFDumper(const ELFFile<ELFT> *Obj,
- StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result) {
+static std::error_code createELFDumper(const ELFFile<ELFT> *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result) {
Result.reset(new ELFDumper<ELFT>(Obj, Writer));
return readobj_error::success;
}
-error_code createELFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result) {
+std::error_code createELFDumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result) {
// Little-endian 32-bit
if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
return createELFDumper(ELFObj->getELFFile(), Writer, Result);
@@ -111,6 +114,62 @@ error_code createELFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
} // namespace llvm
+template <typename ELFO>
+static std::string getFullSymbolName(const ELFO &Obj,
+ typename ELFO::Elf_Sym_Iter Symbol) {
+ StringRef SymbolName = errorOrDefault(Obj.getSymbolName(Symbol));
+ if (!Symbol.isDynamic())
+ return SymbolName;
+
+ std::string FullSymbolName(SymbolName);
+
+ bool IsDefault;
+ ErrorOr<StringRef> Version =
+ Obj.getSymbolVersion(nullptr, &*Symbol, IsDefault);
+ if (Version) {
+ FullSymbolName += (IsDefault ? "@@" : "@");
+ FullSymbolName += *Version;
+ } else
+ error(Version.getError());
+ return FullSymbolName;
+}
+
+template <typename ELFO>
+static void
+getSectionNameIndex(const ELFO &Obj, typename ELFO::Elf_Sym_Iter Symbol,
+ StringRef &SectionName, unsigned &SectionIndex) {
+ SectionIndex = Symbol->st_shndx;
+ if (SectionIndex == SHN_UNDEF) {
+ SectionName = "Undefined";
+ } else if (SectionIndex >= SHN_LOPROC && SectionIndex <= SHN_HIPROC) {
+ SectionName = "Processor Specific";
+ } else if (SectionIndex >= SHN_LOOS && SectionIndex <= SHN_HIOS) {
+ SectionName = "Operating System Specific";
+ } else if (SectionIndex > SHN_HIOS && SectionIndex < SHN_ABS) {
+ SectionName = "Reserved";
+ } else if (SectionIndex == SHN_ABS) {
+ SectionName = "Absolute";
+ } else if (SectionIndex == SHN_COMMON) {
+ SectionName = "Common";
+ } else {
+ if (SectionIndex == SHN_XINDEX)
+ SectionIndex = Obj.getSymbolTableIndex(&*Symbol);
+ assert(SectionIndex != SHN_XINDEX &&
+ "getSymbolTableIndex should handle this");
+ const typename ELFO::Elf_Shdr *Sec = Obj.getSection(SectionIndex);
+ SectionName = errorOrDefault(Obj.getSectionName(Sec));
+ }
+}
+
+template <class ELFT>
+static const typename ELFFile<ELFT>::Elf_Shdr *
+findSectionByAddress(const ELFFile<ELFT> *Obj, uint64_t Addr) {
+ for (const auto &Shdr : Obj->sections())
+ if (Shdr.sh_addr == Addr)
+ return &Shdr;
+ return nullptr;
+}
+
static const EnumEntry<unsigned> ElfClass[] = {
{ "None", ELF::ELFCLASSNONE },
{ "32-bit", ELF::ELFCLASS32 },
@@ -651,42 +710,10 @@ void ELFDumper<ELFT>::printDynamicSymbols() {
template <class ELFT>
void ELFDumper<ELFT>::printSymbol(typename ELFO::Elf_Sym_Iter Symbol) {
- StringRef SymbolName = errorOrDefault(Obj->getSymbolName(Symbol));
-
- unsigned SectionIndex = Symbol->st_shndx;
+ unsigned SectionIndex = 0;
StringRef SectionName;
- if (SectionIndex == SHN_UNDEF) {
- SectionName = "Undefined";
- } else if (SectionIndex >= SHN_LOPROC && SectionIndex <= SHN_HIPROC) {
- SectionName = "Processor Specific";
- } else if (SectionIndex >= SHN_LOOS && SectionIndex <= SHN_HIOS) {
- SectionName = "Operating System Specific";
- } else if (SectionIndex > SHN_HIOS && SectionIndex < SHN_ABS) {
- SectionName = "Reserved";
- } else if (SectionIndex == SHN_ABS) {
- SectionName = "Absolute";
- } else if (SectionIndex == SHN_COMMON) {
- SectionName = "Common";
- } else {
- if (SectionIndex == SHN_XINDEX)
- SectionIndex = Obj->getSymbolTableIndex(&*Symbol);
- assert(SectionIndex != SHN_XINDEX &&
- "getSymbolTableIndex should handle this");
- const Elf_Shdr *Sec = Obj->getSection(SectionIndex);
- SectionName = errorOrDefault(Obj->getSectionName(Sec));
- }
-
- std::string FullSymbolName(SymbolName);
- if (Symbol.isDynamic()) {
- bool IsDefault;
- ErrorOr<StringRef> Version = Obj->getSymbolVersion(nullptr, &*Symbol,
- IsDefault);
- if (Version) {
- FullSymbolName += (IsDefault ? "@@" : "@");
- FullSymbolName += *Version;
- } else
- error(Version.getError());
- }
+ getSectionNameIndex(*Obj, Symbol, SectionName, SectionIndex);
+ std::string FullSymbolName = getFullSymbolName(*Obj, Symbol);
DictScope D(W, "Symbol");
W.printNumber("Name", FullSymbolName, Symbol->st_name);
@@ -902,13 +929,12 @@ void ELFDumper<ELFType<support::little, 2, false> >::printUnwindInfo() {
template<class ELFT>
void ELFDumper<ELFT>::printDynamicTable() {
- typedef typename ELFO::Elf_Dyn_Iter EDI;
- EDI Start = Obj->begin_dynamic_table(), End = Obj->end_dynamic_table(true);
+ auto DynTable = Obj->dynamic_table(true);
- if (Start == End)
+ ptrdiff_t Total = std::distance(DynTable.begin(), DynTable.end());
+ if (Total == 0)
return;
- ptrdiff_t Total = std::distance(Start, End);
raw_ostream &OS = W.getOStream();
W.startLine() << "DynamicSection [ (" << Total << " entries)\n";
@@ -917,12 +943,12 @@ void ELFDumper<ELFT>::printDynamicTable() {
W.startLine()
<< " Tag" << (Is64 ? " " : " ") << "Type"
<< " " << "Name/Value\n";
- for (; Start != End; ++Start) {
+ for (const auto &Entry : DynTable) {
W.startLine()
<< " "
- << format(Is64 ? "0x%016" PRIX64 : "0x%08" PRIX64, Start->getTag())
- << " " << format("%-21s", getTypeString(Start->getTag()));
- printValue(Obj, Start->getTag(), Start->getVal(), Is64, OS);
+ << format(Is64 ? "0x%016" PRIX64 : "0x%08" PRIX64, Entry.getTag())
+ << " " << format("%-21s", getTypeString(Entry.getTag()));
+ printValue(Obj, Entry.getTag(), Entry.getVal(), Is64, OS);
OS << "\n";
}
@@ -936,11 +962,9 @@ void ELFDumper<ELFT>::printNeededLibraries() {
typedef std::vector<StringRef> LibsTy;
LibsTy Libs;
- for (typename ELFO::Elf_Dyn_Iter DynI = Obj->begin_dynamic_table(),
- DynE = Obj->end_dynamic_table();
- DynI != DynE; ++DynI)
- if (DynI->d_tag == ELF::DT_NEEDED)
- Libs.push_back(Obj->getDynamicString(DynI->d_un.d_val));
+ for (const auto &Entry : Obj->dynamic_table())
+ if (Entry.d_tag == ELF::DT_NEEDED)
+ Libs.push_back(Obj->getDynamicString(Entry.d_un.d_val));
std::stable_sort(Libs.begin(), Libs.end());
@@ -1008,3 +1032,209 @@ void ELFDumper<ELFType<support::little, 2, false> >::printAttributes() {
}
}
+namespace {
+template <class ELFT> class MipsGOTParser {
+public:
+ typedef object::ELFFile<ELFT> ObjectFile;
+ typedef typename ObjectFile::Elf_Shdr Elf_Shdr;
+
+ MipsGOTParser(const ObjectFile *Obj, StreamWriter &W) : Obj(Obj), W(W) {}
+
+ void parseGOT(const Elf_Shdr &GOTShdr);
+
+private:
+ typedef typename ObjectFile::Elf_Sym_Iter Elf_Sym_Iter;
+ typedef typename ObjectFile::Elf_Addr GOTEntry;
+ typedef typename ObjectFile::template ELFEntityIterator<const GOTEntry>
+ GOTIter;
+
+ const ObjectFile *Obj;
+ StreamWriter &W;
+
+ std::size_t getGOTTotal(ArrayRef<uint8_t> GOT) const;
+ GOTIter makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum);
+
+ bool getGOTTags(uint64_t &LocalGotNum, uint64_t &GotSym);
+ void printGotEntry(uint64_t GotAddr, GOTIter BeginIt, GOTIter It);
+ void printGlobalGotEntry(uint64_t GotAddr, GOTIter BeginIt, GOTIter It,
+ Elf_Sym_Iter Sym);
+};
+}
+
+template <class ELFT>
+void MipsGOTParser<ELFT>::parseGOT(const Elf_Shdr &GOTShdr) {
+ // See "Global Offset Table" in Chapter 5 in the following document
+ // for detailed GOT description.
+ // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf
+
+ ErrorOr<ArrayRef<uint8_t>> GOT = Obj->getSectionContents(&GOTShdr);
+ if (!GOT) {
+ W.startLine() << "The .got section is empty.\n";
+ return;
+ }
+
+ uint64_t DtLocalGotNum;
+ uint64_t DtGotSym;
+ if (!getGOTTags(DtLocalGotNum, DtGotSym))
+ return;
+
+ if (DtLocalGotNum > getGOTTotal(*GOT)) {
+ W.startLine() << "MIPS_LOCAL_GOTNO exceeds a number of GOT entries.\n";
+ return;
+ }
+
+ Elf_Sym_Iter DynSymBegin = Obj->begin_dynamic_symbols();
+ Elf_Sym_Iter DynSymEnd = Obj->end_dynamic_symbols();
+ std::size_t DynSymTotal = std::size_t(std::distance(DynSymBegin, DynSymEnd));
+
+ if (DtGotSym > DynSymTotal) {
+ W.startLine() << "MIPS_GOTSYM exceeds a number of dynamic symbols.\n";
+ return;
+ }
+
+ std::size_t GlobalGotNum = DynSymTotal - DtGotSym;
+
+ if (DtLocalGotNum + GlobalGotNum > getGOTTotal(*GOT)) {
+ W.startLine() << "Number of global GOT entries exceeds the size of GOT.\n";
+ return;
+ }
+
+ GOTIter GotBegin = makeGOTIter(*GOT, 0);
+ GOTIter GotLocalEnd = makeGOTIter(*GOT, DtLocalGotNum);
+ GOTIter It = GotBegin;
+
+ DictScope GS(W, "Primary GOT");
+
+ W.printHex("Canonical gp value", GOTShdr.sh_addr + 0x7ff0);
+ {
+ ListScope RS(W, "Reserved entries");
+
+ {
+ DictScope D(W, "Entry");
+ printGotEntry(GOTShdr.sh_addr, GotBegin, It++);
+ W.printString("Purpose", StringRef("Lazy resolver"));
+ }
+
+ if (It != GotLocalEnd && (*It >> (sizeof(GOTEntry) * 8 - 1)) != 0) {
+ DictScope D(W, "Entry");
+ printGotEntry(GOTShdr.sh_addr, GotBegin, It++);
+ W.printString("Purpose", StringRef("Module pointer (GNU extension)"));
+ }
+ }
+ {
+ ListScope LS(W, "Local entries");
+ for (; It != GotLocalEnd; ++It) {
+ DictScope D(W, "Entry");
+ printGotEntry(GOTShdr.sh_addr, GotBegin, It);
+ }
+ }
+ {
+ ListScope GS(W, "Global entries");
+
+ GOTIter GotGlobalEnd = makeGOTIter(*GOT, DtLocalGotNum + GlobalGotNum);
+ Elf_Sym_Iter GotDynSym = DynSymBegin + DtGotSym;
+ for (; It != GotGlobalEnd; ++It) {
+ DictScope D(W, "Entry");
+ printGlobalGotEntry(GOTShdr.sh_addr, GotBegin, It, GotDynSym++);
+ }
+ }
+
+ std::size_t SpecGotNum = getGOTTotal(*GOT) - DtLocalGotNum - GlobalGotNum;
+ W.printNumber("Number of TLS and multi-GOT entries", uint64_t(SpecGotNum));
+}
+
+template <class ELFT>
+std::size_t MipsGOTParser<ELFT>::getGOTTotal(ArrayRef<uint8_t> GOT) const {
+ return GOT.size() / sizeof(GOTEntry);
+}
+
+template <class ELFT>
+typename MipsGOTParser<ELFT>::GOTIter
+MipsGOTParser<ELFT>::makeGOTIter(ArrayRef<uint8_t> GOT, std::size_t EntryNum) {
+ const char *Data = reinterpret_cast<const char *>(GOT.data());
+ return GOTIter(sizeof(GOTEntry), Data + EntryNum * sizeof(GOTEntry));
+}
+
+template <class ELFT>
+bool MipsGOTParser<ELFT>::getGOTTags(uint64_t &LocalGotNum, uint64_t &GotSym) {
+ bool FoundLocalGotNum = false;
+ bool FoundGotSym = false;
+ for (const auto &Entry : Obj->dynamic_table()) {
+ switch (Entry.getTag()) {
+ case ELF::DT_MIPS_LOCAL_GOTNO:
+ LocalGotNum = Entry.getVal();
+ FoundLocalGotNum = true;
+ break;
+ case ELF::DT_MIPS_GOTSYM:
+ GotSym = Entry.getVal();
+ FoundGotSym = true;
+ break;
+ }
+ }
+
+ if (!FoundLocalGotNum) {
+ W.startLine() << "Cannot find MIPS_LOCAL_GOTNO dynamic table tag.\n";
+ return false;
+ }
+
+ if (!FoundGotSym) {
+ W.startLine() << "Cannot find MIPS_GOTSYM dynamic table tag.\n";
+ return false;
+ }
+
+ return true;
+}
+
+template <class ELFT>
+void MipsGOTParser<ELFT>::printGotEntry(uint64_t GotAddr, GOTIter BeginIt,
+ GOTIter It) {
+ int64_t Offset = std::distance(BeginIt, It) * sizeof(GOTEntry);
+ W.printHex("Address", GotAddr + Offset);
+ W.printNumber("Access", Offset - 0x7ff0);
+ W.printHex("Initial", *It);
+}
+
+template <class ELFT>
+void MipsGOTParser<ELFT>::printGlobalGotEntry(uint64_t GotAddr, GOTIter BeginIt,
+ GOTIter It, Elf_Sym_Iter Sym) {
+ printGotEntry(GotAddr, BeginIt, It);
+
+ W.printHex("Value", Sym->st_value);
+ W.printEnum("Type", Sym->getType(), makeArrayRef(ElfSymbolTypes));
+
+ unsigned SectionIndex = 0;
+ StringRef SectionName;
+ getSectionNameIndex(*Obj, Sym, SectionName, SectionIndex);
+ W.printHex("Section", SectionName, SectionIndex);
+
+ std::string FullSymbolName = getFullSymbolName(*Obj, Sym);
+ W.printNumber("Name", FullSymbolName, Sym->st_name);
+}
+
+template <class ELFT> void ELFDumper<ELFT>::printMipsPLTGOT() {
+ if (Obj->getHeader()->e_machine != EM_MIPS) {
+ W.startLine() << "MIPS PLT GOT is available for MIPS targets only.\n";
+ return;
+ }
+
+ llvm::Optional<uint64_t> DtPltGot;
+ for (const auto &Entry : Obj->dynamic_table()) {
+ if (Entry.getTag() == ELF::DT_PLTGOT) {
+ DtPltGot = Entry.getVal();
+ break;
+ }
+ }
+
+ if (!DtPltGot) {
+ W.startLine() << "Cannot find PLTGOT dynamic table tag.\n";
+ return;
+ }
+
+ const Elf_Shdr *GotShdr = findSectionByAddress(Obj, *DtPltGot);
+ if (!GotShdr) {
+ W.startLine() << "There is no .got section in the file.\n";
+ return;
+ }
+
+ MipsGOTParser<ELFT>(Obj, W).parseGOT(*GotShdr);
+}
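A note on the arithmetic parseGOT relies on: the first DT_MIPS_LOCAL_GOTNO entries of .got are the reserved and local entries, the final DynSymTotal - DT_MIPS_GOTSYM entries correspond one-to-one with the tail of the dynamic symbol table, and anything remaining is TLS or multi-GOT space. A toy calculation with hypothetical numbers, using the same formulas as the code above:

#include <cstdint>
#include <cstdio>

int main() {
  // All values below are made up for illustration.
  uint64_t GotBytes = 96;      // size of the .got section contents
  uint64_t EntrySize = 4;      // sizeof(GOTEntry) on a 32-bit target
  uint64_t DtLocalGotNum = 10; // DT_MIPS_LOCAL_GOTNO
  uint64_t DtGotSym = 22;      // DT_MIPS_GOTSYM
  uint64_t DynSymTotal = 30;   // number of dynamic symbols

  uint64_t TotalEntries = GotBytes / EntrySize;    // 24, cf. getGOTTotal()
  uint64_t GlobalGotNum = DynSymTotal - DtGotSym;  // 8 global entries
  uint64_t SpecGotNum =
      TotalEntries - DtLocalGotNum - GlobalGotNum; // 6 TLS/multi-GOT entries

  std::printf("local=%llu global=%llu special=%llu\n",
              (unsigned long long)DtLocalGotNum,
              (unsigned long long)GlobalGotNum,
              (unsigned long long)SpecGotNum);
  return 0;
}

The sanity checks in parseGOT correspond to the failure modes of this arithmetic: DT_MIPS_LOCAL_GOTNO or DT_MIPS_GOTSYM exceeding the available entries or symbols would make one of the subtractions underflow.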
diff --git a/tools/llvm-readobj/Error.cpp b/tools/llvm-readobj/Error.cpp
index 83ed6a7..a078f5c 100644
--- a/tools/llvm-readobj/Error.cpp
+++ b/tools/llvm-readobj/Error.cpp
@@ -17,11 +17,10 @@
using namespace llvm;
namespace {
-class _readobj_error_category : public error_category {
+class _readobj_error_category : public std::error_category {
public:
- const char* name() const override;
+ const char* name() const LLVM_NOEXCEPT override;
std::string message(int ev) const override;
- error_condition default_error_condition(int ev) const override;
};
} // namespace
@@ -29,8 +28,8 @@ const char *_readobj_error_category::name() const {
return "llvm.readobj";
}
-std::string _readobj_error_category::message(int ev) const {
- switch (ev) {
+std::string _readobj_error_category::message(int EV) const {
+ switch (static_cast<readobj_error>(EV)) {
case readobj_error::success: return "Success";
case readobj_error::file_not_found:
return "No such file.";
@@ -42,20 +41,13 @@ std::string _readobj_error_category::message(int ev) const {
return "Unsupported object file format.";
case readobj_error::unknown_symbol:
return "Unknown symbol.";
- default:
- llvm_unreachable("An enumerator of readobj_error does not have a message "
- "defined.");
}
-}
-
-error_condition _readobj_error_category::default_error_condition(int ev) const {
- if (ev == readobj_error::success)
- return errc::success;
- return errc::invalid_argument;
+ llvm_unreachable("An enumerator of readobj_error does not have a message "
+ "defined.");
}
namespace llvm {
-const error_category &readobj_category() {
+const std::error_category &readobj_category() {
static _readobj_error_category o;
return o;
}
diff --git a/tools/llvm-readobj/Error.h b/tools/llvm-readobj/Error.h
index 5129b4e..81ce408 100644
--- a/tools/llvm-readobj/Error.h
+++ b/tools/llvm-readobj/Error.h
@@ -14,35 +14,28 @@
#ifndef LLVM_READOBJ_ERROR_H
#define LLVM_READOBJ_ERROR_H
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
-
-const error_category &readobj_category();
-
-struct readobj_error {
- enum _ {
- success = 0,
- file_not_found,
- unsupported_file_format,
- unrecognized_file_format,
- unsupported_obj_file_format,
- unknown_symbol
- };
- _ v_;
-
- readobj_error(_ v) : v_(v) {}
- explicit readobj_error(int v) : v_(_(v)) {}
- operator int() const {return v_;}
+const std::error_category &readobj_category();
+
+enum class readobj_error {
+ success = 0,
+ file_not_found,
+ unsupported_file_format,
+ unrecognized_file_format,
+ unsupported_obj_file_format,
+ unknown_symbol
};
-inline error_code make_error_code(readobj_error e) {
- return error_code(static_cast<int>(e), readobj_category());
+inline std::error_code make_error_code(readobj_error e) {
+ return std::error_code(static_cast<int>(e), readobj_category());
}
-template <> struct is_error_code_enum<readobj_error> : std::true_type { };
-template <> struct is_error_code_enum<readobj_error::_> : std::true_type { };
-
} // namespace llvm
+namespace std {
+template <> struct is_error_code_enum<llvm::readobj_error> : std::true_type {};
+}
+
#endif
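The Error.h rewrite above is the standard C++11 recipe for plugging a custom enum into <system_error>: define an enum class, provide a make_error_code overload in the enum's namespace so argument-dependent lookup finds it, and specialize std::is_error_code_enum so std::error_code accepts the enum implicitly. A self-contained sketch of the same pattern with a hypothetical demo_error enum (not part of this patch):

#include <iostream>
#include <string>
#include <system_error>

namespace demo {
enum class demo_error { success = 0, bad_header, truncated };

class demo_category_t : public std::error_category {
public:
  const char *name() const noexcept override { return "demo"; }
  std::string message(int EV) const override {
    switch (static_cast<demo_error>(EV)) {
    case demo_error::success:    return "Success";
    case demo_error::bad_header: return "Bad header.";
    case demo_error::truncated:  return "Truncated input.";
    }
    return "Unknown error.";
  }
};

inline const std::error_category &demo_category() {
  static demo_category_t C; // one instance per process, like readobj_category()
  return C;
}

// Found via ADL when a std::error_code is constructed from demo_error.
inline std::error_code make_error_code(demo_error E) {
  return std::error_code(static_cast<int>(E), demo_category());
}
} // namespace demo

namespace std {
template <> struct is_error_code_enum<demo::demo_error> : std::true_type {};
}

int main() {
  std::error_code EC = demo::demo_error::bad_header; // implicit conversion
  std::cout << EC.category().name() << ": " << EC.message() << "\n";
  return 0;
}

LLVM_NOEXCEPT in the patch is simply LLVM's portability macro for the noexcept used here.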
diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp
index 2fd5d4a..a5e5cf8 100644
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp
@@ -16,6 +16,7 @@
#include "ObjDumper.h"
#include "StreamWriter.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Object/MachO.h"
#include "llvm/Support/Casting.h"
@@ -54,9 +55,9 @@ private:
namespace llvm {
-error_code createMachODumper(const object::ObjectFile *Obj,
- StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result) {
+std::error_code createMachODumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result) {
const MachOObjectFile *MachOObj = dyn_cast<MachOObjectFile>(Obj);
if (!MachOObj)
return readobj_error::unsupported_obj_file_format;
@@ -277,7 +278,7 @@ void MachODumper::printSections(const MachOObjectFile *Obj) {
void MachODumper::printRelocations() {
ListScope D(W, "Relocations");
- error_code EC;
+ std::error_code EC;
for (const SectionRef &Section : Obj->sections()) {
StringRef Name;
if (error(Section.getName(Name)))
@@ -309,18 +310,29 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj,
const RelocationRef &Reloc) {
uint64_t Offset;
SmallString<32> RelocName;
- StringRef SymbolName;
if (error(Reloc.getOffset(Offset)))
return;
if (error(Reloc.getTypeName(RelocName)))
return;
- symbol_iterator Symbol = Reloc.getSymbol();
- if (Symbol != Obj->symbol_end() && error(Symbol->getName(SymbolName)))
- return;
DataRefImpl DR = Reloc.getRawDataRefImpl();
MachO::any_relocation_info RE = Obj->getRelocation(DR);
bool IsScattered = Obj->isRelocationScattered(RE);
+ SmallString<32> SymbolNameOrOffset("0x");
+ if (IsScattered) {
+ // Scattered relocations don't really have an associated symbol
+ // for some reason, even if one exists in the symtab at the correct address.
+ SymbolNameOrOffset += utohexstr(Obj->getScatteredRelocationValue(RE));
+ } else {
+ symbol_iterator Symbol = Reloc.getSymbol();
+ if (Symbol != Obj->symbol_end()) {
+ StringRef SymbolName;
+ if (error(Symbol->getName(SymbolName)))
+ return;
+ SymbolNameOrOffset = SymbolName;
+ } else
+ SymbolNameOrOffset += utohexstr(Obj->getPlainRelocationSymbolNum(RE));
+ }
if (opts::ExpandRelocs) {
DictScope Group(W, "Relocation");
@@ -332,7 +344,7 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj,
else
W.printNumber("Extern", Obj->getPlainRelocationExternal(RE));
W.printNumber("Type", RelocName, Obj->getAnyRelocationType(RE));
- W.printString("Symbol", SymbolName.size() > 0 ? SymbolName : "-");
+ W.printString("Symbol", SymbolNameOrOffset);
W.printNumber("Scattered", IsScattered);
} else {
raw_ostream& OS = W.startLine();
@@ -345,7 +357,7 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj,
OS << " " << Obj->getPlainRelocationExternal(RE);
OS << " " << RelocName
<< " " << IsScattered
- << " " << (SymbolName.size() > 0 ? SymbolName : "-")
+ << " " << SymbolNameOrOffset
<< "\n";
}
}
diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index 9e0fd2f..f80a28b 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h
@@ -11,15 +11,13 @@
#define LLVM_READOBJ_OBJDUMPER_H
#include <memory>
+#include <system_error>
namespace llvm {
-
namespace object {
class ObjectFile;
}
-class error_code;
-
class StreamWriter;
class ObjDumper {
@@ -42,19 +40,24 @@ public:
// Only implemented for ARM ELF at this time.
virtual void printAttributes() { }
+ // Only implemented for MIPS ELF at this time.
+ virtual void printMipsPLTGOT() { }
+
protected:
StreamWriter& W;
};
-error_code createCOFFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result);
+std::error_code createCOFFDumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result);
-error_code createELFDumper(const object::ObjectFile *Obj, StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result);
+std::error_code createELFDumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result);
-error_code createMachODumper(const object::ObjectFile *Obj,
- StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result);
+std::error_code createMachODumper(const object::ObjectFile *Obj,
+ StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result);
} // namespace llvm
diff --git a/tools/llvm-readobj/StreamWriter.h b/tools/llvm-readobj/StreamWriter.h
index 9282dcc..04b38fb 100644
--- a/tools/llvm-readobj/StreamWriter.h
+++ b/tools/llvm-readobj/StreamWriter.h
@@ -169,6 +169,10 @@ public:
startLine() << Label << ": " << int(Value) << "\n";
}
+ void printBoolean(StringRef Label, bool Value) {
+ startLine() << Label << ": " << (Value ? "Yes" : "No") << '\n';
+ }
+
template <typename T_>
void printList(StringRef Label, const SmallVectorImpl<T_> &List) {
startLine() << Label << ": [";
diff --git a/tools/llvm-readobj/Win64EHDumper.cpp b/tools/llvm-readobj/Win64EHDumper.cpp
index c64d362..f058632 100644
--- a/tools/llvm-readobj/Win64EHDumper.cpp
+++ b/tools/llvm-readobj/Win64EHDumper.cpp
@@ -134,20 +134,21 @@ static std::string formatSymbol(const Dumper::Context &Ctx,
return OS.str();
}
-static error_code resolveRelocation(const Dumper::Context &Ctx,
- const coff_section *Section,
- uint64_t Offset,
- const coff_section *&ResolvedSection,
- uint64_t &ResolvedAddress) {
+static std::error_code resolveRelocation(const Dumper::Context &Ctx,
+ const coff_section *Section,
+ uint64_t Offset,
+ const coff_section *&ResolvedSection,
+ uint64_t &ResolvedAddress) {
SymbolRef Symbol;
- if (error_code EC = Ctx.ResolveSymbol(Section, Offset, Symbol, Ctx.UserData))
+ if (std::error_code EC =
+ Ctx.ResolveSymbol(Section, Offset, Symbol, Ctx.UserData))
return EC;
- if (error_code EC = Symbol.getAddress(ResolvedAddress))
+ if (std::error_code EC = Symbol.getAddress(ResolvedAddress))
return EC;
section_iterator SI = Ctx.COFF.section_begin();
- if (error_code EC = Symbol.getSection(SI))
+ if (std::error_code EC = Symbol.getSection(SI))
return EC;
ResolvedSection = Ctx.COFF.getCOFFSection(*SI);
diff --git a/tools/llvm-readobj/Win64EHDumper.h b/tools/llvm-readobj/Win64EHDumper.h
index 2eac810..9ce4d39 100644
--- a/tools/llvm-readobj/Win64EHDumper.h
+++ b/tools/llvm-readobj/Win64EHDumper.h
@@ -26,8 +26,9 @@ class Dumper {
raw_ostream &OS;
public:
- typedef error_code (*SymbolResolver)(const object::coff_section *, uint64_t,
- object::SymbolRef &, void *);
+ typedef std::error_code (*SymbolResolver)(const object::coff_section *,
+ uint64_t, object::SymbolRef &,
+ void *);
struct Context {
const object::COFFObjectFile &COFF;
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 5be959f..8d2a997 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -35,8 +35,8 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/system_error.h"
#include <string>
+#include <system_error>
using namespace llvm;
@@ -135,13 +135,18 @@ namespace opts {
cl::desc("Display the ARM attributes section"));
cl::alias ARMAttributesShort("-a", cl::desc("Alias for --arm-attributes"),
cl::aliasopt(ARMAttributes));
+
+ // -mips-plt-got
+ cl::opt<bool>
+ MipsPLTGOT("mips-plt-got",
+ cl::desc("Display the MIPS GOT and PLT GOT sections"));
} // namespace opts
static int ReturnValue = EXIT_SUCCESS;
namespace llvm {
-bool error(error_code EC) {
+bool error(std::error_code EC) {
if (!EC)
return false;
@@ -160,8 +165,7 @@ bool relocAddressLess(RelocationRef a, RelocationRef b) {
} // namespace llvm
-
-static void reportError(StringRef Input, error_code EC) {
+static void reportError(StringRef Input, std::error_code EC) {
if (Input == "-")
Input = "<stdin>";
@@ -178,9 +182,21 @@ static void reportError(StringRef Input, StringRef Message) {
ReturnValue = EXIT_FAILURE;
}
+static bool isMipsArch(unsigned Arch) {
+ switch (Arch) {
+ case llvm::Triple::mips:
+ case llvm::Triple::mipsel:
+ case llvm::Triple::mips64:
+ case llvm::Triple::mips64el:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// @brief Creates a format-specific object file dumper.
-static error_code createDumper(const ObjectFile *Obj, StreamWriter &Writer,
- std::unique_ptr<ObjDumper> &Result) {
+static std::error_code createDumper(const ObjectFile *Obj, StreamWriter &Writer,
+ std::unique_ptr<ObjDumper> &Result) {
if (!Obj)
return readobj_error::unsupported_file_format;
@@ -199,7 +215,7 @@ static error_code createDumper(const ObjectFile *Obj, StreamWriter &Writer,
static void dumpObject(const ObjectFile *Obj) {
StreamWriter Writer(outs());
std::unique_ptr<ObjDumper> Dumper;
- if (error_code EC = createDumper(Obj, Writer, Dumper)) {
+ if (std::error_code EC = createDumper(Obj, Writer, Dumper)) {
reportError(Obj->getFileName(), EC);
return;
}
@@ -235,6 +251,9 @@ static void dumpObject(const ObjectFile *Obj) {
if (Obj->getArch() == llvm::Triple::arm && Obj->isELF())
if (opts::ARMAttributes)
Dumper->printAttributes();
+ if (isMipsArch(Obj->getArch()) && Obj->isELF())
+ if (opts::MipsPLTGOT)
+ Dumper->printMipsPLTGOT();
}
@@ -243,15 +262,15 @@ static void dumpArchive(const Archive *Arc) {
for (Archive::child_iterator ArcI = Arc->child_begin(),
ArcE = Arc->child_end();
ArcI != ArcE; ++ArcI) {
- std::unique_ptr<Binary> child;
- if (error_code EC = ArcI->getAsBinary(child)) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = ArcI->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
// Ignore non-object files.
if (EC != object_error::invalid_file_type)
reportError(Arc->getFileName(), EC.message());
continue;
}
- if (ObjectFile *Obj = dyn_cast<ObjectFile>(child.get()))
+ if (ObjectFile *Obj = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
dumpObject(Obj);
else
reportError(Arc->getFileName(), readobj_error::unrecognized_file_format);
@@ -269,7 +288,7 @@ static void dumpInput(StringRef File) {
// Attempt to open the binary.
ErrorOr<Binary *> BinaryOrErr = createBinary(File);
- if (error_code EC = BinaryOrErr.getError()) {
+ if (std::error_code EC = BinaryOrErr.getError()) {
reportError(File, EC);
return;
}
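Several hunks in this file and below follow the same migration: an out-parameter API returning error_code becomes a factory returning ErrorOr<T>, and callers test getError() before touching get(). A toy stand-in for llvm::ErrorOr (deliberately simplified; the real class lives in llvm/Support/ErrorOr.h) showing the calling convention:

#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <system_error>

template <typename T> class ErrorOrLite {
  std::error_code EC;
  T Val;

public:
  ErrorOrLite(T V) : Val(std::move(V)) {}
  ErrorOrLite(std::error_code E) : EC(E) {}
  std::error_code getError() const { return EC; }
  T &get() {
    assert(!EC && "accessing the value of a failed result");
    return Val;
  }
};

// Old style: std::error_code open(StringRef, std::unique_ptr<...> &Out);
// New style: the result and the error travel in one return value.
static ErrorOrLite<std::unique_ptr<std::string>> open(bool Succeed) {
  if (!Succeed)
    return std::make_error_code(std::errc::no_such_file_or_directory);
  return std::unique_ptr<std::string>(new std::string("contents"));
}

int main() {
  auto Result = open(true);
  if (std::error_code EC = Result.getError()) { // same shape as the patch
    std::cerr << "error: " << EC.message() << "\n";
    return 1;
  }
  std::cout << *Result.get() << "\n";
  return 0;
}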
diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h
index cc5c85d..0413948 100644
--- a/tools/llvm-readobj/llvm-readobj.h
+++ b/tools/llvm-readobj/llvm-readobj.h
@@ -18,10 +18,8 @@ namespace llvm {
class RelocationRef;
}
- class error_code;
-
// Various helper functions.
- bool error(error_code ec);
+ bool error(std::error_code ec);
bool relocAddressLess(object::RelocationRef A,
object::RelocationRef B);
} // namespace llvm
@@ -40,6 +38,7 @@ namespace opts {
extern llvm::cl::opt<bool> ExpandRelocs;
extern llvm::cl::opt<bool> CodeViewLineTables;
extern llvm::cl::opt<bool> ARMAttributes;
+ extern llvm::cl::opt<bool> MipsPLTGOT;
} // namespace opts
#define LLVM_READOBJ_ENUM_ENT(ns, enum) \
diff --git a/tools/llvm-rtdyld/Android.mk b/tools/llvm-rtdyld/Android.mk
index 54a612a..6f902d3 100644
--- a/tools/llvm-rtdyld/Android.mk
+++ b/tools/llvm-rtdyld/Android.mk
@@ -11,11 +11,38 @@ llvm_rtdyld_SRC_FILES := \
llvm-rtdyld.cpp
llvm_rtdyld_STATIC_LIBRARIES := \
+ libLLVMARMCodeGen \
+ libLLVMARMInfo \
+ libLLVMARMDesc \
+ libLLVMARMAsmPrinter \
+ libLLVMARMAsmParser \
+ libLLVMARMDisassembler \
+ libLLVMAArch64CodeGen \
+ libLLVMAArch64Info \
+ libLLVMAArch64AsmParser \
+ libLLVMAArch64Desc \
+ libLLVMAArch64AsmPrinter \
+ libLLVMAArch64Utils \
+ libLLVMAArch64Disassembler \
+ libLLVMMipsCodeGen \
+ libLLVMMipsInfo \
+ libLLVMMipsDesc \
+ libLLVMMipsAsmPrinter \
+ libLLVMMipsAsmParser \
+ libLLVMMipsDisassembler \
+ libLLVMX86CodeGen \
+ libLLVMX86Info \
+ libLLVMX86Desc \
+ libLLVMX86AsmPrinter \
+ libLLVMX86AsmParser \
+ libLLVMX86Utils \
+ libLLVMX86Disassembler \
libLLVMDebugInfo \
libLLVMExecutionEngine \
+ libLLVMObject \
libLLVMMC \
+ libLLVMMCParser \
libLLVMRuntimeDyld \
- libLLVMObject \
libLLVMBitReader \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/llvm-rtdyld/CMakeLists.txt b/tools/llvm-rtdyld/CMakeLists.txt
index 3ad127f..feb2134 100644
--- a/tools/llvm-rtdyld/CMakeLists.txt
+++ b/tools/llvm-rtdyld/CMakeLists.txt
@@ -1,6 +1,8 @@
set(LLVM_LINK_COMPONENTS
+ ${LLVM_TARGETS_TO_BUILD}
DebugInfo
ExecutionEngine
+ MC
RuntimeDyld
Support
)
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index be5c345..45734f4 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -16,6 +16,13 @@
#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Object/MachO.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DynamicLibrary.h"
@@ -23,9 +30,12 @@
#include "llvm/Support/Memory.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include <system_error>
+
using namespace llvm;
using namespace llvm::object;
@@ -35,7 +45,8 @@ InputFileList(cl::Positional, cl::ZeroOrMore,
enum ActionType {
AC_Execute,
- AC_PrintLineInfo
+ AC_PrintLineInfo,
+ AC_Verify
};
static cl::opt<ActionType>
@@ -45,6 +56,8 @@ Action(cl::desc("Action to perform:"),
"Load, link, and execute the inputs."),
clEnumValN(AC_PrintLineInfo, "printline",
"Load, link, and print line information for each function."),
+ clEnumValN(AC_Verify, "verify",
+ "Load, link and verify the resulting memory image."),
clEnumValEnd));
static cl::opt<std::string>
@@ -57,6 +70,14 @@ Dylibs("dylib",
cl::desc("Add library."),
cl::ZeroOrMore);
+static cl::opt<std::string>
+TripleName("triple", cl::desc("Target triple for disassembler"));
+
+static cl::list<std::string>
+CheckFiles("check",
+ cl::desc("File containing RuntimeDyld verifier checks."),
+ cl::ZeroOrMore);
+
/* *** */
// A trivial memory manager that doesn't do anything fancy, just uses the
@@ -139,7 +160,6 @@ static void loadDylibs() {
}
}
-
/* *** */
static int printLineInfoForInput() {
@@ -155,14 +175,16 @@ static int printLineInfoForInput() {
RuntimeDyld Dyld(&MemMgr);
// Load the input memory buffer.
- std::unique_ptr<MemoryBuffer> InputBuffer;
- std::unique_ptr<ObjectImage> LoadedObject;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFileList[i],
- InputBuffer))
- return Error("unable to read input: '" + ec.message() + "'");
+ ErrorOr<std::unique_ptr<MemoryBuffer>> InputBuffer =
+ MemoryBuffer::getFileOrSTDIN(InputFileList[i]);
+ if (std::error_code EC = InputBuffer.getError())
+ return Error("unable to read input: '" + EC.message() + "'");
+
+ std::unique_ptr<ObjectImage> LoadedObject;
// Load the object file
- LoadedObject.reset(Dyld.loadObject(new ObjectBuffer(InputBuffer.release())));
+ LoadedObject.reset(
+ Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
if (!LoadedObject) {
return Error(Dyld.getErrorString());
}
@@ -216,14 +238,14 @@ static int executeInput() {
InputFileList.push_back("-");
for(unsigned i = 0, e = InputFileList.size(); i != e; ++i) {
// Load the input memory buffer.
- std::unique_ptr<MemoryBuffer> InputBuffer;
+ ErrorOr<std::unique_ptr<MemoryBuffer>> InputBuffer =
+ MemoryBuffer::getFileOrSTDIN(InputFileList[i]);
+ if (std::error_code EC = InputBuffer.getError())
+ return Error("unable to read input: '" + EC.message() + "'");
std::unique_ptr<ObjectImage> LoadedObject;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFileList[i],
- InputBuffer))
- return Error("unable to read input: '" + ec.message() + "'");
-
// Load the object file
- LoadedObject.reset(Dyld.loadObject(new ObjectBuffer(InputBuffer.release())));
+ LoadedObject.reset(
+ Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
if (!LoadedObject) {
return Error(Dyld.getErrorString());
}
@@ -263,6 +285,96 @@ static int executeInput() {
return Main(1, Argv);
}
+static int checkAllExpressions(RuntimeDyldChecker &Checker) {
+ for (const auto& CheckerFileName : CheckFiles) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> CheckerFileBuf =
+ MemoryBuffer::getFileOrSTDIN(CheckerFileName);
+ if (std::error_code EC = CheckerFileBuf.getError())
+ return Error("unable to read input '" + CheckerFileName + "': " +
+ EC.message());
+
+ if (!Checker.checkAllRulesInBuffer("# rtdyld-check:",
+ CheckerFileBuf.get().get()))
+ return Error("some checks in '" + CheckerFileName + "' failed");
+ }
+ return 0;
+}
+
+static int linkAndVerify() {
+
+ // Check for missing triple.
+ if (TripleName == "") {
+ llvm::errs() << "Error: -triple required when running in -verify mode.\n";
+ return 1;
+ }
+
+ // Look up the target and build the disassembler.
+ Triple TheTriple(Triple::normalize(TripleName));
+ std::string ErrorStr;
+ const Target *TheTarget =
+ TargetRegistry::lookupTarget("", TheTriple, ErrorStr);
+ if (!TheTarget) {
+ llvm::errs() << "Error accessing target '" << TripleName << "': "
+ << ErrorStr << "\n";
+ return 1;
+ }
+ TripleName = TheTriple.getTriple();
+
+ std::unique_ptr<MCSubtargetInfo> STI(
+ TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+ assert(STI && "Unable to create subtarget info!");
+
+ std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+ assert(MRI && "Unable to create target register info!");
+
+ std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+ assert(MAI && "Unable to create target asm info!");
+
+ MCContext Ctx(MAI.get(), MRI.get(), nullptr);
+
+ std::unique_ptr<MCDisassembler> Disassembler(
+ TheTarget->createMCDisassembler(*STI, Ctx));
+ assert(Disassembler && "Unable to create disassembler!");
+
+ std::unique_ptr<MCInstrInfo> MII(TheTarget->createMCInstrInfo());
+
+ std::unique_ptr<MCInstPrinter> InstPrinter(
+ TheTarget->createMCInstPrinter(0, *MAI, *MII, *MRI, *STI));
+
+ // Load any dylibs requested on the command line.
+ loadDylibs();
+
+ // Instantiate a dynamic linker.
+ TrivialMemoryManager MemMgr;
+ RuntimeDyld Dyld(&MemMgr);
+
+ // If we don't have any input files, read from stdin.
+ if (!InputFileList.size())
+ InputFileList.push_back("-");
+ for(unsigned i = 0, e = InputFileList.size(); i != e; ++i) {
+ // Load the input memory buffer.
+ ErrorOr<std::unique_ptr<MemoryBuffer>> InputBuffer =
+ MemoryBuffer::getFileOrSTDIN(InputFileList[i]);
+ if (std::error_code EC = InputBuffer.getError())
+ return Error("unable to read input: '" + EC.message() + "'");
+
+ std::unique_ptr<ObjectImage> LoadedObject;
+ // Load the object file
+ LoadedObject.reset(
+ Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
+ if (!LoadedObject) {
+ return Error(Dyld.getErrorString());
+ }
+ }
+
+ // Resolve all the relocations we can.
+ Dyld.resolveRelocations();
+
+ RuntimeDyldChecker Checker(Dyld, Disassembler.get(), InstPrinter.get(),
+ llvm::dbgs());
+ return checkAllExpressions(Checker);
+}
+
int main(int argc, char **argv) {
sys::PrintStackTraceOnErrorSignal();
PrettyStackTraceProgram X(argc, argv);
@@ -270,6 +382,10 @@ int main(int argc, char **argv) {
ProgramName = argv[0];
llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ llvm::InitializeAllTargetInfos();
+ llvm::InitializeAllTargetMCs();
+ llvm::InitializeAllDisassemblers();
+
cl::ParseCommandLineOptions(argc, argv, "llvm MC-JIT tool\n");
switch (Action) {
@@ -277,5 +393,7 @@ int main(int argc, char **argv) {
return executeInput();
case AC_PrintLineInfo:
return printLineInfoForInput();
+ case AC_Verify:
+ return linkAndVerify();
}
}
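A usage note for the new action: with a target registered (hence the InitializeAll* calls above), linkAndVerify loads each input object, resolves relocations, builds a disassembler for -triple, and hands everything to RuntimeDyldChecker, which scans the -check files for lines beginning with the '# rtdyld-check:' prefix and evaluates each rule against the linked image. A hypothetical invocation (file names invented for illustration) might look like:

  llvm-rtdyld -verify -triple=x86_64-unknown-linux -check=foo_checks.s foo.o

The tool exits non-zero if any rule in foo_checks.s fails, which is what makes the mode usable from lit-style regression tests.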
diff --git a/tools/llvm-size/Android.mk b/tools/llvm-size/Android.mk
index 0efca96..4c26cce 100644
--- a/tools/llvm-size/Android.mk
+++ b/tools/llvm-size/Android.mk
@@ -11,10 +11,12 @@ llvm_size_SRC_FILES := \
llvm-size.cpp
llvm_size_STATIC_LIBRARIES := \
- libLLVMObject \
- libLLVMBitReader \
- libLLVMCore \
- libLLVMSupport \
+ libLLVMObject \
+ libLLVMMC \
+ libLLVMMCParser \
+ libLLVMBitReader \
+ libLLVMCore \
+ libLLVMSupport
include $(CLEAR_VARS)
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index 58eafd4..50b5220 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -16,6 +16,8 @@
#include "llvm/ADT/APInt.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
@@ -25,51 +27,61 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <string>
+#include <system_error>
using namespace llvm;
using namespace object;
-enum OutputFormatTy {berkeley, sysv};
+enum OutputFormatTy { berkeley, sysv, darwin };
static cl::opt<OutputFormatTy>
- OutputFormat("format",
- cl::desc("Specify output format"),
- cl::values(clEnumVal(sysv, "System V format"),
- clEnumVal(berkeley, "Berkeley format"),
- clEnumValEnd),
- cl::init(berkeley));
+OutputFormat("format", cl::desc("Specify output format"),
+ cl::values(clEnumVal(sysv, "System V format"),
+ clEnumVal(berkeley, "Berkeley format"),
+ clEnumVal(darwin, "Darwin -m format"), clEnumValEnd),
+ cl::init(berkeley));
+
+static cl::opt<OutputFormatTy> OutputFormatShort(
+ cl::desc("Specify output format"),
+ cl::values(clEnumValN(sysv, "A", "System V format"),
+ clEnumValN(berkeley, "B", "Berkeley format"),
+ clEnumValN(darwin, "m", "Darwin -m format"), clEnumValEnd),
+ cl::init(berkeley));
+
+static bool berkeleyHeaderPrinted = false;
+static bool moreThanOneFile = false;
+
+cl::opt<bool>
+DarwinLongFormat("l", cl::desc("When format is darwin, use long format "
+ "to include addresses and offsets."));
-static cl::opt<OutputFormatTy>
- OutputFormatShort(cl::desc("Specify output format"),
- cl::values(clEnumValN(sysv, "A", "System V format"),
- clEnumValN(berkeley, "B", "Berkeley format"),
- clEnumValEnd),
- cl::init(berkeley));
+static cl::list<std::string>
+ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
+ cl::ZeroOrMore);
+bool ArchAll = false;
-enum RadixTy {octal = 8, decimal = 10, hexadecimal = 16};
+enum RadixTy { octal = 8, decimal = 10, hexadecimal = 16 };
static cl::opt<unsigned int>
- Radix("-radix",
- cl::desc("Print size in radix. Only 8, 10, and 16 are valid"),
- cl::init(decimal));
+Radix("-radix", cl::desc("Print size in radix. Only 8, 10, and 16 are valid"),
+ cl::init(decimal));
static cl::opt<RadixTy>
- RadixShort(cl::desc("Print size in radix:"),
- cl::values(clEnumValN(octal, "o", "Print size in octal"),
- clEnumValN(decimal, "d", "Print size in decimal"),
- clEnumValN(hexadecimal, "x", "Print size in hexadecimal"),
- clEnumValEnd),
- cl::init(decimal));
+RadixShort(cl::desc("Print size in radix:"),
+ cl::values(clEnumValN(octal, "o", "Print size in octal"),
+ clEnumValN(decimal, "d", "Print size in decimal"),
+ clEnumValN(hexadecimal, "x", "Print size in hexadecimal"),
+ clEnumValEnd),
+ cl::init(decimal));
static cl::list<std::string>
- InputFilenames(cl::Positional, cl::desc("<input files>"),
- cl::ZeroOrMore);
+InputFilenames(cl::Positional, cl::desc("<input files>"), cl::ZeroOrMore);
static std::string ToolName;
/// @brief If ec is not success, print the error and return true.
-static bool error(error_code ec) {
- if (!ec) return false;
+static bool error(std::error_code ec) {
+ if (!ec)
+ return false;
outs() << ToolName << ": error reading file: " << ec.message() << ".\n";
outs().flush();
@@ -85,6 +97,180 @@ static size_t getNumLengthAsString(uint64_t num) {
return result.size();
}
+/// @brief Return the printing format for the Radix.
+static const char *getRadixFmt(void) {
+ switch (Radix) {
+ case octal:
+ return PRIo64;
+ case decimal:
+ return PRIu64;
+ case hexadecimal:
+ return PRIx64;
+ }
+ return nullptr;
+}
+
+/// @brief Print the size of each Mach-O segment and section in @p MachO.
+///
+/// This is used when @c OutputFormat is darwin and produces the same
+/// output as darwin's size(1) -m output.
+static void PrintDarwinSectionSizes(MachOObjectFile *MachO) {
+ std::string fmtbuf;
+ raw_string_ostream fmt(fmtbuf);
+ const char *radix_fmt = getRadixFmt();
+ if (Radix == hexadecimal)
+ fmt << "0x";
+ fmt << "%" << radix_fmt;
+
+ uint32_t LoadCommandCount = MachO->getHeader().ncmds;
+ uint32_t Filetype = MachO->getHeader().filetype;
+ MachOObjectFile::LoadCommandInfo Load = MachO->getFirstLoadCommandInfo();
+
+ uint64_t total = 0;
+ for (unsigned I = 0;; ++I) {
+ if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+ MachO::segment_command_64 Seg = MachO->getSegment64LoadCommand(Load);
+ outs() << "Segment " << Seg.segname << ": "
+ << format(fmt.str().c_str(), Seg.vmsize);
+ if (DarwinLongFormat)
+ outs() << " (vmaddr 0x" << format("%" PRIx64, Seg.vmaddr) << " fileoff "
+ << Seg.fileoff << ")";
+ outs() << "\n";
+ total += Seg.vmsize;
+ uint64_t sec_total = 0;
+ for (unsigned J = 0; J < Seg.nsects; ++J) {
+ MachO::section_64 Sec = MachO->getSection64(Load, J);
+ if (Filetype == MachO::MH_OBJECT)
+ outs() << "\tSection (" << format("%.16s", &Sec.segname) << ", "
+ << format("%.16s", &Sec.sectname) << "): ";
+ else
+ outs() << "\tSection " << format("%.16s", &Sec.sectname) << ": ";
+ outs() << format(fmt.str().c_str(), Sec.size);
+ if (DarwinLongFormat)
+ outs() << " (addr 0x" << format("%" PRIx64, Sec.addr) << " offset "
+ << Sec.offset << ")";
+ outs() << "\n";
+ sec_total += Sec.size;
+ }
+ if (Seg.nsects != 0)
+ outs() << "\ttotal " << format(fmt.str().c_str(), sec_total) << "\n";
+ } else if (Load.C.cmd == MachO::LC_SEGMENT) {
+ MachO::segment_command Seg = MachO->getSegmentLoadCommand(Load);
+ outs() << "Segment " << Seg.segname << ": "
+ << format(fmt.str().c_str(), Seg.vmsize);
+ if (DarwinLongFormat)
+ outs() << " (vmaddr 0x" << format("%" PRIx64, Seg.vmaddr) << " fileoff "
+ << Seg.fileoff << ")";
+ outs() << "\n";
+ total += Seg.vmsize;
+ uint64_t sec_total = 0;
+ for (unsigned J = 0; J < Seg.nsects; ++J) {
+ MachO::section Sec = MachO->getSection(Load, J);
+ if (Filetype == MachO::MH_OBJECT)
+ outs() << "\tSection (" << format("%.16s", &Sec.segname) << ", "
+ << format("%.16s", &Sec.sectname) << "): ";
+ else
+ outs() << "\tSection " << format("%.16s", &Sec.sectname) << ": ";
+ outs() << format(fmt.str().c_str(), Sec.size);
+ if (DarwinLongFormat)
+ outs() << " (addr 0x" << format("%" PRIx64, Sec.addr) << " offset "
+ << Sec.offset << ")";
+ outs() << "\n";
+ sec_total += Sec.size;
+ }
+ if (Seg.nsects != 0)
+ outs() << "\ttotal " << format(fmt.str().c_str(), sec_total) << "\n";
+ }
+ if (I == LoadCommandCount - 1)
+ break;
+ else
+ Load = MachO->getNextLoadCommandInfo(Load);
+ }
+ outs() << "total " << format(fmt.str().c_str(), total) << "\n";
+}
+
+/// @brief Print the summary sizes of the standard Mach-O segments in @p MachO.
+///
+/// This is used when @c OutputFormat is berkeley with a Mach-O file and
+/// produces the same output as darwin's size(1) default output.
+static void PrintDarwinSegmentSizes(MachOObjectFile *MachO) {
+ uint32_t LoadCommandCount = MachO->getHeader().ncmds;
+ MachOObjectFile::LoadCommandInfo Load = MachO->getFirstLoadCommandInfo();
+
+ uint64_t total_text = 0;
+ uint64_t total_data = 0;
+ uint64_t total_objc = 0;
+ uint64_t total_others = 0;
+ for (unsigned I = 0;; ++I) {
+ if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+ MachO::segment_command_64 Seg = MachO->getSegment64LoadCommand(Load);
+ if (MachO->getHeader().filetype == MachO::MH_OBJECT) {
+ for (unsigned J = 0; J < Seg.nsects; ++J) {
+ MachO::section_64 Sec = MachO->getSection64(Load, J);
+ StringRef SegmentName = StringRef(Sec.segname);
+ if (SegmentName == "__TEXT")
+ total_text += Sec.size;
+ else if (SegmentName == "__DATA")
+ total_data += Sec.size;
+ else if (SegmentName == "__OBJC")
+ total_objc += Sec.size;
+ else
+ total_others += Sec.size;
+ }
+ } else {
+ StringRef SegmentName = StringRef(Seg.segname);
+ if (SegmentName == "__TEXT")
+ total_text += Seg.vmsize;
+ else if (SegmentName == "__DATA")
+ total_data += Seg.vmsize;
+ else if (SegmentName == "__OBJC")
+ total_objc += Seg.vmsize;
+ else
+ total_others += Seg.vmsize;
+ }
+ } else if (Load.C.cmd == MachO::LC_SEGMENT) {
+ MachO::segment_command Seg = MachO->getSegmentLoadCommand(Load);
+ if (MachO->getHeader().filetype == MachO::MH_OBJECT) {
+ for (unsigned J = 0; J < Seg.nsects; ++J) {
+ MachO::section Sec = MachO->getSection(Load, J);
+ StringRef SegmentName = StringRef(Sec.segname);
+ if (SegmentName == "__TEXT")
+ total_text += Sec.size;
+ else if (SegmentName == "__DATA")
+ total_data += Sec.size;
+ else if (SegmentName == "__OBJC")
+ total_objc += Sec.size;
+ else
+ total_others += Sec.size;
+ }
+ } else {
+ StringRef SegmentName = StringRef(Seg.segname);
+ if (SegmentName == "__TEXT")
+ total_text += Seg.vmsize;
+ else if (SegmentName == "__DATA")
+ total_data += Seg.vmsize;
+ else if (SegmentName == "__OBJC")
+ total_objc += Seg.vmsize;
+ else
+ total_others += Seg.vmsize;
+ }
+ }
+ if (I == LoadCommandCount - 1)
+ break;
+ else
+ Load = MachO->getNextLoadCommandInfo(Load);
+ }
+ uint64_t total = total_text + total_data + total_objc + total_others;
+
+ if (!berkeleyHeaderPrinted) {
+ outs() << "__TEXT\t__DATA\t__OBJC\tothers\tdec\thex\n";
+ berkeleyHeaderPrinted = true;
+ }
+ outs() << total_text << "\t" << total_data << "\t" << total_objc << "\t"
+ << total_others << "\t" << total << "\t" << format("%" PRIx64, total)
+ << "\t";
+}
+
/// @brief Print the size of each section in @p Obj.
///
/// The format used is determined by @c OutputFormat and @c Radix.
@@ -92,20 +278,19 @@ static void PrintObjectSectionSizes(ObjectFile *Obj) {
uint64_t total = 0;
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
-
- const char *radix_fmt = nullptr;
- switch (Radix) {
- case octal:
- radix_fmt = PRIo64;
- break;
- case decimal:
- radix_fmt = PRIu64;
- break;
- case hexadecimal:
- radix_fmt = PRIx64;
- break;
- }
- if (OutputFormat == sysv) {
+ const char *radix_fmt = getRadixFmt();
+
+ // If OutputFormat is darwin and we have a MachOObjectFile, print as darwin's
+ // size(1) -m output; if OutputFormat is darwin but this is not a Mach-O
+ // object, fall through to the berkeley format.
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj);
+ if (OutputFormat == darwin && MachO)
+ PrintDarwinSectionSizes(MachO);
+ // If we have a MachOObjectFile and the OutputFormat is berkeley print as
+ // darwin's default berkeley format for Mach-O files.
+ else if (MachO && OutputFormat == berkeley)
+ PrintDarwinSegmentSizes(MachO);
+ else if (OutputFormat == sysv) {
// Run two passes over all sections. The first gets the lengths needed for
// formatting the output. The second actually does the output.
std::size_t max_name_len = strlen("section");
@@ -139,10 +324,9 @@ static void PrintObjectSectionSizes(ObjectFile *Obj) {
<< "%" << max_addr_len << "s\n";
// Print header
- outs() << format(fmt.str().c_str(),
- static_cast<const char*>("section"),
- static_cast<const char*>("size"),
- static_cast<const char*>("addr"));
+ outs() << format(fmt.str().c_str(), static_cast<const char *>("section"),
+ static_cast<const char *>("size"),
+ static_cast<const char *>("addr"));
fmtbuf.clear();
// Setup per section format.
@@ -170,8 +354,7 @@ static void PrintObjectSectionSizes(ObjectFile *Obj) {
fmtbuf.clear();
fmt << "%-" << max_name_len << "s "
<< "%#" << max_size_len << radix_fmt << "\n";
- outs() << format(fmt.str().c_str(),
- static_cast<const char*>("Total"),
+ outs() << format(fmt.str().c_str(), static_cast<const char *>("Total"),
total);
} else {
// The Berkeley format does not display individual section sizes. It
@@ -204,21 +387,56 @@ static void PrintObjectSectionSizes(ObjectFile *Obj) {
total = total_text + total_data + total_bss;
+ if (!berkeleyHeaderPrinted) {
+ outs() << " text data bss "
+ << (Radix == octal ? "oct" : "dec") << " hex filename\n";
+ berkeleyHeaderPrinted = true;
+ }
+
// Print result.
fmt << "%#7" << radix_fmt << " "
<< "%#7" << radix_fmt << " "
<< "%#7" << radix_fmt << " ";
- outs() << format(fmt.str().c_str(),
- total_text,
- total_data,
- total_bss);
+ outs() << format(fmt.str().c_str(), total_text, total_data, total_bss);
fmtbuf.clear();
fmt << "%7" << (Radix == octal ? PRIo64 : PRIu64) << " "
<< "%7" PRIx64 " ";
- outs() << format(fmt.str().c_str(),
- total,
- total);
+ outs() << format(fmt.str().c_str(), total, total);
+ }
+}
+
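// Note: getRadixFmt() is defined earlier in this patch; from the switch
// deleted above it can be reconstructed, in sketch form, as:
//
//   static const char *getRadixFmt() {
//     switch (Radix) {
//     case octal:       return PRIo64;
//     case decimal:     return PRIu64;
//     case hexadecimal: return PRIx64;
//     }
//     return nullptr;
//   }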
+/// @brief Checks if the @p o ObjectFile is a Mach-O file and, if a list of
+/// architecture flags was specified, verifies that this Mach-O file is one
+/// of those architectures or that all architectures were specified. If the
+/// check fails, an error is printed and this routine returns false;
+/// otherwise it returns true.
+static bool checkMachOAndArchFlags(ObjectFile *o, StringRef file) {
+ if (isa<MachOObjectFile>(o) && !ArchAll && ArchFlags.size() != 0) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ bool ArchFound = false;
+ MachO::mach_header H;
+ MachO::mach_header_64 H_64;
+ Triple T;
+ if (MachO->is64Bit()) {
+ H_64 = MachO->MachOObjectFile::getHeader64();
+ T = MachOObjectFile::getArch(H_64.cputype, H_64.cpusubtype);
+ } else {
+ H = MachO->MachOObjectFile::getHeader();
+ T = MachOObjectFile::getArch(H.cputype, H.cpusubtype);
+ }
+    for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+      if (ArchFlags[i] == T.getArchName()) {
+        ArchFound = true;
+        break;
+      }
+    }
+    if (!ArchFound) {
+      errs() << ToolName << ": file: " << file
+             << " does not contain architecture: " << T.getArchName() << ".\n";
+ return false;
+ }
}
+ return true;
}
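// The flag scan above can also be written with <algorithm>; a sketch of an
// equivalent form (not what this patch itself does):
//
//   bool ArchFound =
//       std::any_of(ArchFlags.begin(), ArchFlags.end(),
//                   [&](const std::string &Name) {
//                     return Name == T.getArchName();
//                   });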
/// @brief Print the section sizes for @p file. If @p file is an archive, print
@@ -228,14 +446,15 @@ static void PrintFileSectionSizes(StringRef file) {
if (file != "-") {
bool exists;
if (sys::fs::exists(file, exists) || !exists) {
- errs() << ToolName << ": '" << file << "': " << "No such file\n";
+ errs() << ToolName << ": '" << file << "': "
+ << "No such file\n";
return;
}
}
// Attempt to open the binary.
ErrorOr<Binary *> BinaryOrErr = createBinary(file);
- if (error_code EC = BinaryOrErr.getError()) {
+ if (std::error_code EC = BinaryOrErr.getError()) {
errs() << ToolName << ": " << file << ": " << EC.message() << ".\n";
return;
}
@@ -244,29 +463,250 @@ static void PrintFileSectionSizes(StringRef file) {
if (Archive *a = dyn_cast<Archive>(binary.get())) {
// This is an archive. Iterate over each member and display its sizes.
for (object::Archive::child_iterator i = a->child_begin(),
- e = a->child_end(); i != e; ++i) {
- std::unique_ptr<Binary> child;
- if (error_code ec = i->getAsBinary(child)) {
- errs() << ToolName << ": " << file << ": " << ec.message() << ".\n";
+ e = a->child_end();
+ i != e; ++i) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = i->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
+ errs() << ToolName << ": " << file << ": " << EC.message() << ".\n";
continue;
}
- if (ObjectFile *o = dyn_cast<ObjectFile>(child.get())) {
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (!checkMachOAndArchFlags(o, file))
+ return;
if (OutputFormat == sysv)
- outs() << o->getFileName() << " (ex " << a->getFileName()
- << "):\n";
+ outs() << o->getFileName() << " (ex " << a->getFileName() << "):\n";
+ else if (MachO && OutputFormat == darwin)
+ outs() << a->getFileName() << "(" << o->getFileName() << "):\n";
PrintObjectSectionSizes(o);
- if (OutputFormat == berkeley)
- outs() << o->getFileName() << " (ex " << a->getFileName() << ")\n";
+ if (OutputFormat == berkeley) {
+ if (MachO)
+ outs() << a->getFileName() << "(" << o->getFileName() << ")\n";
+ else
+ outs() << o->getFileName() << " (ex " << a->getFileName() << ")\n";
+ }
+ }
+ }
+ } else if (MachOUniversalBinary *UB =
+ dyn_cast<MachOUniversalBinary>(binary.get())) {
+ // If we have a list of architecture flags specified dump only those.
+ if (!ArchAll && ArchFlags.size() != 0) {
+ // Look for a slice in the universal binary that matches each ArchFlag.
+ bool ArchFound;
+ for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+ ArchFound = false;
+ for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+ E = UB->end_objects();
+ I != E; ++I) {
+ if (ArchFlags[i] == I->getArchTypeName()) {
+ ArchFound = true;
+ ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
+ std::unique_ptr<Archive> UA;
+ if (UO) {
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " :\n";
+ else if (MachO && OutputFormat == darwin) {
+ if (moreThanOneFile || ArchFlags.size() > 1)
+ outs() << o->getFileName() << " (for architecture "
+                         << I->getArchTypeName() << "):\n";
+ }
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (!MachO || moreThanOneFile || ArchFlags.size() > 1)
+ outs() << o->getFileName() << " (for architecture "
+ << I->getArchTypeName() << ")";
+ outs() << "\n";
+ }
+ }
+ } else if (!I->getAsArchive(UA)) {
+ // This is an archive. Iterate over each member and display its
+ // sizes.
+ for (object::Archive::child_iterator i = UA->child_begin(),
+ e = UA->child_end();
+ i != e; ++i) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = i->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
+ errs() << ToolName << ": " << file << ": " << EC.message()
+ << ".\n";
+ continue;
+ }
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << "):\n";
+ else if (MachO && OutputFormat == darwin)
+ outs() << UA->getFileName() << "(" << o->getFileName()
+ << ")"
+ << " (for architecture " << I->getArchTypeName()
+ << "):\n";
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (MachO) {
+ outs() << UA->getFileName() << "(" << o->getFileName()
+ << ")";
+ if (ArchFlags.size() > 1)
+ outs() << " (for architecture " << I->getArchTypeName()
+ << ")";
+ outs() << "\n";
+ } else
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << ")\n";
+ }
+ }
+ }
+ }
+ }
+ }
+ if (!ArchFound) {
+ errs() << ToolName << ": file: " << file
+               << " does not contain architecture: " << ArchFlags[i] << ".\n";
+ return;
+ }
+ }
+ return;
+ }
+    // No architecture flags were specified, so if this contains a slice that
+    // matches the host architecture, dump only that.
+ if (!ArchAll) {
+ StringRef HostArchName = MachOObjectFile::getHostArch().getArchName();
+ for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+ E = UB->end_objects();
+ I != E; ++I) {
+ if (HostArchName == I->getArchTypeName()) {
+ ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
+ std::unique_ptr<Archive> UA;
+ if (UO) {
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " :\n";
+ else if (MachO && OutputFormat == darwin) {
+ if (moreThanOneFile)
+ outs() << o->getFileName() << " (for architecture "
+ << I->getArchTypeName() << "):\n";
+ }
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (!MachO || moreThanOneFile)
+ outs() << o->getFileName() << " (for architecture "
+ << I->getArchTypeName() << ")";
+ outs() << "\n";
+ }
+ }
+ } else if (!I->getAsArchive(UA)) {
+ // This is an archive. Iterate over each member and display its
+ // sizes.
+ for (object::Archive::child_iterator i = UA->child_begin(),
+ e = UA->child_end();
+ i != e; ++i) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = i->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
+ errs() << ToolName << ": " << file << ": " << EC.message()
+ << ".\n";
+ continue;
+ }
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << "):\n";
+ else if (MachO && OutputFormat == darwin)
+ outs() << UA->getFileName() << "(" << o->getFileName() << ")"
+ << " (for architecture " << I->getArchTypeName()
+ << "):\n";
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (MachO)
+ outs() << UA->getFileName() << "(" << o->getFileName()
+ << ")\n";
+ else
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << ")\n";
+ }
+ }
+ }
+ }
+ return;
+ }
+ }
+ }
+    // Either all architectures have been specified, or none have been
+    // specified and this does not contain the host architecture, so dump all
+    // the slices.
+ bool moreThanOneArch = UB->getNumberOfObjects() > 1;
+ for (MachOUniversalBinary::object_iterator I = UB->begin_objects(),
+ E = UB->end_objects();
+ I != E; ++I) {
+ ErrorOr<std::unique_ptr<ObjectFile>> UO = I->getAsObjectFile();
+ std::unique_ptr<Archive> UA;
+ if (UO) {
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*UO.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " :\n";
+ else if (MachO && OutputFormat == darwin) {
+          if (moreThanOneFile || moreThanOneArch)
+            outs() << o->getFileName() << " (for architecture "
+                   << I->getArchTypeName() << "):\n";
+ }
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (!MachO || moreThanOneFile || moreThanOneArch)
+ outs() << o->getFileName() << " (for architecture "
+ << I->getArchTypeName() << ")";
+ outs() << "\n";
+ }
+ }
+ } else if (!I->getAsArchive(UA)) {
+ // This is an archive. Iterate over each member and display its sizes.
+ for (object::Archive::child_iterator i = UA->child_begin(),
+ e = UA->child_end();
+ i != e; ++i) {
+ ErrorOr<std::unique_ptr<Binary>> ChildOrErr = i->getAsBinary();
+ if (std::error_code EC = ChildOrErr.getError()) {
+ errs() << ToolName << ": " << file << ": " << EC.message() << ".\n";
+ continue;
+ }
+ if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get())) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (OutputFormat == sysv)
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << "):\n";
+ else if (MachO && OutputFormat == darwin)
+ outs() << UA->getFileName() << "(" << o->getFileName() << ")"
+ << " (for architecture " << I->getArchTypeName() << "):\n";
+ PrintObjectSectionSizes(o);
+ if (OutputFormat == berkeley) {
+ if (MachO)
+ outs() << UA->getFileName() << "(" << o->getFileName() << ")"
+ << " (for architecture " << I->getArchTypeName()
+ << ")\n";
+ else
+ outs() << o->getFileName() << " (ex " << UA->getFileName()
+ << ")\n";
+ }
+ }
+ }
}
}
} else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get())) {
+ if (!checkMachOAndArchFlags(o, file))
+ return;
if (OutputFormat == sysv)
outs() << o->getFileName() << " :\n";
PrintObjectSectionSizes(o);
- if (OutputFormat == berkeley)
- outs() << o->getFileName() << "\n";
+ if (OutputFormat == berkeley) {
+ MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o);
+ if (!MachO || moreThanOneFile)
+ outs() << o->getFileName();
+ outs() << "\n";
+ }
} else {
- errs() << ToolName << ": " << file << ": " << "Unrecognized file type.\n";
+ errs() << ToolName << ": " << file << ": "
+ << "Unrecognized file type.\n";
}
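// The universal-binary handling above reduces to a three-way policy; a
// condensed sketch (helper names here are hypothetical, not from the patch):
//
//   if (!ArchAll && !ArchFlags.empty())
//     dumpMatchingSlices(UB, ArchFlags);  // only the slices named by -arch
//   else if (!ArchAll && hasHostSlice(UB))
//     dumpHostSlice(UB);                  // only the host architecture
//   else
//     dumpAllSlices(UB);                  // -arch all, or no host match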
// System V adds an extra newline at the end of each file.
if (OutputFormat == sysv)
@@ -278,7 +718,7 @@ int main(int argc, char **argv) {
sys::PrintStackTraceOnErrorSignal();
PrettyStackTraceProgram X(argc, argv);
- llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
cl::ParseCommandLineOptions(argc, argv, "llvm object size dumper\n");
ToolName = argv[0];
@@ -287,14 +727,23 @@ int main(int argc, char **argv) {
if (RadixShort.getNumOccurrences())
Radix = RadixShort;
+ for (unsigned i = 0; i < ArchFlags.size(); ++i) {
+ if (ArchFlags[i] == "all") {
+ ArchAll = true;
+ } else {
+ Triple T = MachOObjectFile::getArch(ArchFlags[i]);
+ if (T.getArch() == Triple::UnknownArch) {
+        errs() << ToolName << ": for the -arch option: Unknown architecture "
+               << "named '" << ArchFlags[i] << "'\n";
+ return 1;
+ }
+ }
+ }
+
if (InputFilenames.size() == 0)
InputFilenames.push_back("a.out");
- if (OutputFormat == berkeley)
- outs() << " text data bss "
- << (Radix == octal ? "oct" : "dec")
- << " hex filename\n";
-
+ moreThanOneFile = InputFilenames.size() > 1;
std::for_each(InputFilenames.begin(), InputFilenames.end(),
PrintFileSectionSizes);
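// A pattern that recurs throughout this patch: interfaces that used to fill
// an out-parameter and return error_code now return ErrorOr<T>. A minimal
// call-site sketch, using only calls that appear elsewhere in this patch:
//
//   ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
//       MemoryBuffer::getFileOrSTDIN(Path);
//   if (std::error_code EC = BufOrErr.getError()) {
//     errs() << "error: " << EC.message() << "\n";
//     return;
//   }
//   std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());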
diff --git a/tools/llvm-symbolizer/LLVMSymbolize.cpp b/tools/llvm-symbolizer/LLVMSymbolize.cpp
index 3e71111..c1d39ef 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ b/tools/llvm-symbolizer/LLVMSymbolize.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -28,7 +29,7 @@
namespace llvm {
namespace symbolize {
-static bool error(error_code ec) {
+static bool error(std::error_code ec) {
if (!ec)
return false;
errs() << "LLVMSymbolizer: error reading file: " << ec.message() << ".\n";
@@ -219,10 +220,11 @@ static std::string getDarwinDWARFResourceForPath(const std::string &Path) {
}
static bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
- std::unique_ptr<MemoryBuffer> MB;
- if (MemoryBuffer::getFileOrSTDIN(Path, MB))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+ MemoryBuffer::getFileOrSTDIN(Path);
+ if (!MB)
return false;
- return !zlib::isAvailable() || CRCHash == zlib::crc32(MB->getBuffer());
+ return !zlib::isAvailable() || CRCHash == zlib::crc32(MB.get()->getBuffer());
}
static bool findDebugBinary(const std::string &OrigPath,
@@ -310,7 +312,7 @@ LLVMSymbolizer::getOrCreateBinary(const std::string &Path) {
const std::string &ResourcePath =
getDarwinDWARFResourceForPath(Path);
BinaryOrErr = createBinary(ResourcePath);
- error_code EC = BinaryOrErr.getError();
+ std::error_code EC = BinaryOrErr.getError();
if (EC != errc::no_such_file_or_directory && !error(EC)) {
DbgBin = BinaryOrErr.get();
ParsedBinariesAndObjects.push_back(std::unique_ptr<Binary>(DbgBin));
@@ -348,10 +350,11 @@ LLVMSymbolizer::getObjectFileFromBinary(Binary *Bin, const std::string &ArchName
std::make_pair(UB, ArchName));
if (I != ObjectFileForArch.end())
return I->second;
- std::unique_ptr<ObjectFile> ParsedObj;
- if (!UB->getObjectForArch(Triple(ArchName).getArch(), ParsedObj)) {
- Res = ParsedObj.get();
- ParsedBinariesAndObjects.push_back(std::move(ParsedObj));
+ ErrorOr<std::unique_ptr<ObjectFile>> ParsedObj =
+ UB->getObjectForArch(Triple(ArchName).getArch());
+ if (ParsedObj) {
+ Res = ParsedObj.get().get();
+ ParsedBinariesAndObjects.push_back(std::move(ParsedObj.get()));
}
ObjectFileForArch[std::make_pair(UB, ArchName)] = Res;
} else if (Bin->isObject()) {
diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt
index 542053b..71391b7 100644
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt
@@ -16,11 +16,7 @@ set(SOURCES
set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/lto.exports)
-if(NOT CYGWIN AND LLVM_ENABLE_PIC)
- set(ENABLE_SHARED SHARED)
-endif()
-
-add_llvm_library(LTO ${ENABLE_SHARED} STATIC ${SOURCES})
+add_llvm_library(LTO SHARED ${SOURCES})
install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/lto.h
DESTINATION include/llvm-c)
diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index 64abf5c..b401f9a 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp
@@ -13,11 +13,11 @@
//===----------------------------------------------------------------------===//
#include "llvm-c/lto.h"
-#include "llvm-c/Core.h"
-#include "llvm-c/Target.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/LTO/LTOCodeGenerator.h"
#include "llvm/LTO/LTOModule.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
// extra command-line flags needed for LTOCodeGenerator
static cl::opt<bool>
@@ -46,12 +46,12 @@ static bool parsedOptions = false;
// Initialize the configured targets if they have not been initialized.
static void lto_initialize() {
if (!initialized) {
- LLVMInitializeAllTargetInfos();
- LLVMInitializeAllTargets();
- LLVMInitializeAllTargetMCs();
- LLVMInitializeAllAsmParsers();
- LLVMInitializeAllAsmPrinters();
- LLVMInitializeAllDisassemblers();
+ InitializeAllTargetInfos();
+ InitializeAllTargets();
+ InitializeAllTargetMCs();
+ InitializeAllAsmParsers();
+ InitializeAllAsmPrinters();
+ InitializeAllDisassemblers();
initialized = true;
}
}
@@ -88,7 +88,10 @@ bool lto_module_is_object_file(const char* path) {
bool lto_module_is_object_file_for_target(const char* path,
const char* target_triplet_prefix) {
- return LTOModule::isBitcodeFileForTarget(path, target_triplet_prefix);
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer = MemoryBuffer::getFile(path);
+ if (!Buffer)
+ return false;
+ return LTOModule::isBitcodeForTarget(Buffer->get(), target_triplet_prefix);
}
bool lto_module_is_object_file_in_memory(const void* mem, size_t length) {
@@ -99,20 +102,23 @@ bool
lto_module_is_object_file_in_memory_for_target(const void* mem,
size_t length,
const char* target_triplet_prefix) {
- return LTOModule::isBitcodeFileForTarget(mem, length, target_triplet_prefix);
+ std::unique_ptr<MemoryBuffer> buffer(LTOModule::makeBuffer(mem, length));
+ if (!buffer)
+ return false;
+ return LTOModule::isBitcodeForTarget(buffer.get(), target_triplet_prefix);
}
lto_module_t lto_module_create(const char* path) {
lto_initialize();
llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
- return wrap(LTOModule::makeLTOModule(path, Options, sLastErrorString));
+ return wrap(LTOModule::createFromFile(path, Options, sLastErrorString));
}
lto_module_t lto_module_create_from_fd(int fd, const char *path, size_t size) {
lto_initialize();
llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
return wrap(
- LTOModule::makeLTOModule(fd, path, size, Options, sLastErrorString));
+ LTOModule::createFromOpenFile(fd, path, size, Options, sLastErrorString));
}
lto_module_t lto_module_create_from_fd_at_offset(int fd, const char *path,
@@ -121,14 +127,14 @@ lto_module_t lto_module_create_from_fd_at_offset(int fd, const char *path,
off_t offset) {
lto_initialize();
llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
- return wrap(LTOModule::makeLTOModule(fd, path, map_size, offset, Options,
- sLastErrorString));
+ return wrap(LTOModule::createFromOpenFileSlice(fd, path, map_size, offset,
+ Options, sLastErrorString));
}
lto_module_t lto_module_create_from_memory(const void* mem, size_t length) {
lto_initialize();
llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
- return wrap(LTOModule::makeLTOModule(mem, length, Options, sLastErrorString));
+ return wrap(LTOModule::createFromBuffer(mem, length, Options, sLastErrorString));
}
lto_module_t lto_module_create_from_memory_with_path(const void* mem,
@@ -137,13 +143,13 @@ lto_module_t lto_module_create_from_memory_with_path(const void* mem,
lto_initialize();
llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
return wrap(
- LTOModule::makeLTOModule(mem, length, Options, sLastErrorString, path));
+ LTOModule::createFromBuffer(mem, length, Options, sLastErrorString, path));
}
void lto_module_dispose(lto_module_t mod) { delete unwrap(mod); }
const char* lto_module_get_target_triple(lto_module_t mod) {
- return unwrap(mod)->getTargetTriple();
+ return unwrap(mod)->getTargetTriple().c_str();
}
void lto_module_set_target_triple(lto_module_t mod, const char *triple) {
diff --git a/tools/macho-dump/Android.mk b/tools/macho-dump/Android.mk
index 9699d4a..001f293 100644
--- a/tools/macho-dump/Android.mk
+++ b/tools/macho-dump/Android.mk
@@ -12,6 +12,8 @@ macho_dump_SRC_FILES := \
macho_dump_STATIC_LIBRARIES := \
libLLVMObject \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMBitReader \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp
index 886487b..7600979 100644
--- a/tools/macho-dump/macho-dump.cpp
+++ b/tools/macho-dump/macho-dump.cpp
@@ -20,7 +20,7 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
using namespace llvm::object;
@@ -328,6 +328,17 @@ DumpVersionMin(const MachOObjectFile &Obj,
return 0;
}
+static int
+DumpDylibID(const MachOObjectFile &Obj,
+ const MachOObjectFile::LoadCommandInfo &LCI) {
+ MachO::dylib_command DLLC = Obj.getDylibIDLoadCommand(LCI);
+ outs() << " ('install_name', '" << LCI.Ptr + DLLC.dylib.name << "')\n"
+         << " ('timestamp', " << DLLC.dylib.timestamp << ")\n"
+         << " ('cur_version', " << DLLC.dylib.current_version << ")\n"
+         << " ('compat_version', " << DLLC.dylib.compatibility_version << ")\n";
+ return 0;
+}
+
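// Illustrative output of DumpDylibID (the install name is a made-up example;
// the field layout is exactly what the code above prints):
//
//  ('install_name', '/usr/lib/libexample.dylib')
//  ('timestamp', 2)
//  ('cur_version', 65536)
//  ('compat_version', 65536)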
static int DumpLoadCommand(const MachOObjectFile &Obj,
MachOObjectFile::LoadCommandInfo &LCI) {
switch (LCI.C.cmd) {
@@ -350,6 +361,8 @@ static int DumpLoadCommand(const MachOObjectFile &Obj,
case MachO::LC_VERSION_MIN_IPHONEOS:
case MachO::LC_VERSION_MIN_MACOSX:
return DumpVersionMin(Obj, LCI);
+ case MachO::LC_ID_DYLIB:
+ return DumpDylibID(Obj, LCI);
default:
Warning("unknown load command: " + Twine(LCI.C.cmd));
return 0;
@@ -391,7 +404,7 @@ int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv, "llvm Mach-O dumping tool\n");
ErrorOr<Binary *> BinaryOrErr = createBinary(InputFile);
- if (error_code EC = BinaryOrErr.getError())
+ if (std::error_code EC = BinaryOrErr.getError())
return Error("unable to read input: '" + EC.message() + "'");
std::unique_ptr<Binary> Binary(BinaryOrErr.get());
diff --git a/tools/obj2yaml/Android.mk b/tools/obj2yaml/Android.mk
index 8c8fdab..2994622 100644
--- a/tools/obj2yaml/Android.mk
+++ b/tools/obj2yaml/Android.mk
@@ -15,6 +15,8 @@ obj2yaml_SRC_FILES := \
obj2yaml_STATIC_LIBRARIES := \
libLLVMObject \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMBitReader \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/obj2yaml/Error.cpp b/tools/obj2yaml/Error.cpp
index 7be468d..0074128 100644
--- a/tools/obj2yaml/Error.cpp
+++ b/tools/obj2yaml/Error.cpp
@@ -13,18 +13,17 @@
using namespace llvm;
namespace {
-class _obj2yaml_error_category : public error_category {
+class _obj2yaml_error_category : public std::error_category {
public:
- const char *name() const override;
+ const char *name() const LLVM_NOEXCEPT override;
std::string message(int ev) const override;
- error_condition default_error_condition(int ev) const override;
};
} // namespace
const char *_obj2yaml_error_category::name() const { return "obj2yaml"; }
std::string _obj2yaml_error_category::message(int ev) const {
- switch (ev) {
+ switch (static_cast<obj2yaml_error>(ev)) {
case obj2yaml_error::success:
return "Success";
case obj2yaml_error::file_not_found:
@@ -33,21 +32,13 @@ std::string _obj2yaml_error_category::message(int ev) const {
return "Unrecognized file type.";
case obj2yaml_error::unsupported_obj_file_format:
return "Unsupported object file format.";
- default:
- llvm_unreachable("An enumerator of obj2yaml_error does not have a message "
- "defined.");
}
-}
-
-error_condition
-_obj2yaml_error_category::default_error_condition(int ev) const {
- if (ev == obj2yaml_error::success)
- return errc::success;
- return errc::invalid_argument;
+ llvm_unreachable("An enumerator of obj2yaml_error does not have a message "
+ "defined.");
}
namespace llvm {
-const error_category &obj2yaml_category() {
+const std::error_category &obj2yaml_category() {
static _obj2yaml_error_category o;
return o;
}
diff --git a/tools/obj2yaml/Error.h b/tools/obj2yaml/Error.h
index a326664..4657f0d 100644
--- a/tools/obj2yaml/Error.h
+++ b/tools/obj2yaml/Error.h
@@ -10,33 +10,26 @@
#ifndef LLVM_TOOLS_ERROR_H
#define LLVM_TOOLS_ERROR_H
-#include "llvm/Support/system_error.h"
+#include <system_error>
namespace llvm {
+const std::error_category &obj2yaml_category();
-const error_category &obj2yaml_category();
-
-struct obj2yaml_error {
- enum _ {
- success = 0,
- file_not_found,
- unrecognized_file_format,
- unsupported_obj_file_format
- };
- _ v_;
-
- obj2yaml_error(_ v) : v_(v) {}
- explicit obj2yaml_error(int v) : v_(_(v)) {}
- operator int() const {return v_;}
+enum class obj2yaml_error {
+ success = 0,
+ file_not_found,
+ unrecognized_file_format,
+ unsupported_obj_file_format
};
-inline error_code make_error_code(obj2yaml_error e) {
- return error_code(static_cast<int>(e), obj2yaml_category());
+inline std::error_code make_error_code(obj2yaml_error e) {
+ return std::error_code(static_cast<int>(e), obj2yaml_category());
}
-template <> struct is_error_code_enum<obj2yaml_error> : std::true_type { };
-template <> struct is_error_code_enum<obj2yaml_error::_> : std::true_type { };
-
} // namespace llvm
+namespace std {
+template <> struct is_error_code_enum<llvm::obj2yaml_error> : std::true_type {};
+}
+
#endif
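// The enum-to-error_code wiring above is the standard C++11 pattern. A
// self-contained sketch of the same pattern outside LLVM (illustrative
// names, plain standard library only):

#include <iostream>
#include <string>
#include <system_error>

enum class demo_error { success = 0, bad_input };

namespace std {
template <> struct is_error_code_enum<demo_error> : std::true_type {};
}

namespace {
class demo_error_category : public std::error_category {
public:
  const char *name() const noexcept override { return "demo"; }
  std::string message(int ev) const override {
    switch (static_cast<demo_error>(ev)) {
    case demo_error::success:
      return "Success";
    case demo_error::bad_input:
      return "Bad input";
    }
    return "Unknown error";
  }
};
}

std::error_code make_error_code(demo_error e) {
  static demo_error_category cat; // category objects must be long-lived
  return std::error_code(static_cast<int>(e), cat);
}

int main() {
  std::error_code EC = demo_error::bad_input; // converts via make_error_code
  std::cout << EC.category().name() << ": " << EC.message() << "\n";
  return EC ? 1 : 0;
}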
diff --git a/tools/obj2yaml/coff2yaml.cpp b/tools/obj2yaml/coff2yaml.cpp
index 42b09d3..fed4533 100644
--- a/tools/obj2yaml/coff2yaml.cpp
+++ b/tools/obj2yaml/coff2yaml.cpp
@@ -31,7 +31,7 @@ public:
}
-static void check(error_code ec) {
+static void check(std::error_code ec) {
if (ec)
report_fatal_error(ec.message());
}
@@ -61,7 +61,7 @@ void COFFDumper::dumpSections(unsigned NumSections) {
ArrayRef<uint8_t> sectionData;
Obj.getSectionContents(Sect, sectionData);
- Sec.SectionData = object::yaml::BinaryRef(sectionData);
+ Sec.SectionData = yaml::BinaryRef(sectionData);
std::vector<COFFYAML::Relocation> Relocations;
for (const auto &Reloc : Section.relocations()) {
@@ -210,7 +210,7 @@ COFFYAML::Object &COFFDumper::getYAMLObj() {
return YAMLObj;
}
-error_code coff2yaml(raw_ostream &Out, const object::COFFObjectFile &Obj) {
+std::error_code coff2yaml(raw_ostream &Out, const object::COFFObjectFile &Obj) {
COFFDumper Dumper(Obj);
yaml::Output Yout(Out);
diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp
index 7642921..8b53ee7 100644
--- a/tools/obj2yaml/elf2yaml.cpp
+++ b/tools/obj2yaml/elf2yaml.cpp
@@ -26,11 +26,13 @@ class ELFDumper {
const object::ELFFile<ELFT> &Obj;
- error_code dumpSymbol(Elf_Sym_Iter Sym, ELFYAML::Symbol &S);
- error_code dumpCommonSection(const Elf_Shdr *Shdr, ELFYAML::Section &S);
+ std::error_code dumpSymbol(Elf_Sym_Iter Sym, ELFYAML::Symbol &S);
+ std::error_code dumpCommonSection(const Elf_Shdr *Shdr, ELFYAML::Section &S);
+ std::error_code dumpCommonRelocationSection(const Elf_Shdr *Shdr,
+ ELFYAML::RelocationSection &S);
template <class RelT>
- error_code dumpRelocation(const Elf_Shdr *Shdr, const RelT *Rel,
- ELFYAML::Relocation &R);
+ std::error_code dumpRelocation(const Elf_Shdr *Shdr, const RelT *Rel,
+ ELFYAML::Relocation &R);
ErrorOr<ELFYAML::RelocationSection *> dumpRelSection(const Elf_Shdr *Shdr);
ErrorOr<ELFYAML::RelocationSection *> dumpRelaSection(const Elf_Shdr *Shdr);
@@ -72,21 +74,22 @@ ErrorOr<ELFYAML::Object *> ELFDumper<ELFT>::dump() {
break;
case ELF::SHT_RELA: {
ErrorOr<ELFYAML::RelocationSection *> S = dumpRelaSection(&Sec);
- if (error_code EC = S.getError())
+ if (std::error_code EC = S.getError())
return EC;
Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(S.get()));
break;
}
case ELF::SHT_REL: {
ErrorOr<ELFYAML::RelocationSection *> S = dumpRelSection(&Sec);
- if (error_code EC = S.getError())
+ if (std::error_code EC = S.getError())
return EC;
Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(S.get()));
break;
}
+ // FIXME: Support SHT_GROUP section format.
default: {
ErrorOr<ELFYAML::RawContentSection *> S = dumpContentSection(&Sec);
- if (error_code EC = S.getError())
+ if (std::error_code EC = S.getError())
return EC;
Y->Sections.push_back(std::unique_ptr<ELFYAML::Section>(S.get()));
}
@@ -102,7 +105,7 @@ ErrorOr<ELFYAML::Object *> ELFDumper<ELFT>::dump() {
}
ELFYAML::Symbol S;
- if (error_code EC = ELFDumper<ELFT>::dumpSymbol(SI, S))
+ if (std::error_code EC = ELFDumper<ELFT>::dumpSymbol(SI, S))
return EC;
switch (SI->getBinding())
@@ -125,13 +128,15 @@ ErrorOr<ELFYAML::Object *> ELFDumper<ELFT>::dump() {
}
template <class ELFT>
-error_code ELFDumper<ELFT>::dumpSymbol(Elf_Sym_Iter Sym, ELFYAML::Symbol &S) {
+std::error_code ELFDumper<ELFT>::dumpSymbol(Elf_Sym_Iter Sym,
+ ELFYAML::Symbol &S) {
S.Type = Sym->getType();
S.Value = Sym->st_value;
S.Size = Sym->st_size;
+ S.Visibility = Sym->st_other & 0x3;
ErrorOr<StringRef> NameOrErr = Obj.getSymbolName(Sym);
- if (error_code EC = NameOrErr.getError())
+ if (std::error_code EC = NameOrErr.getError())
return EC;
S.Name = NameOrErr.get();
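// The new Visibility field above captures the low two bits of st_other,
// which the ELF specification defines as the symbol's visibility:
//
//   STV_DEFAULT   = 0
//   STV_INTERNAL  = 1
//   STV_HIDDEN    = 2
//   STV_PROTECTED = 3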
@@ -140,7 +145,7 @@ error_code ELFDumper<ELFT>::dumpSymbol(Elf_Sym_Iter Sym, ELFYAML::Symbol &S) {
return obj2yaml_error::success;
NameOrErr = Obj.getSectionName(Shdr);
- if (error_code EC = NameOrErr.getError())
+ if (std::error_code EC = NameOrErr.getError())
return EC;
S.Section = NameOrErr.get();
@@ -149,9 +154,9 @@ error_code ELFDumper<ELFT>::dumpSymbol(Elf_Sym_Iter Sym, ELFYAML::Symbol &S) {
template <class ELFT>
template <class RelT>
-error_code ELFDumper<ELFT>::dumpRelocation(const Elf_Shdr *Shdr,
- const RelT *Rel,
- ELFYAML::Relocation &R) {
+std::error_code ELFDumper<ELFT>::dumpRelocation(const Elf_Shdr *Shdr,
+ const RelT *Rel,
+ ELFYAML::Relocation &R) {
R.Type = Rel->getType(Obj.isMips64EL());
R.Offset = Rel->r_offset;
R.Addend = 0;
@@ -162,7 +167,7 @@ error_code ELFDumper<ELFT>::dumpRelocation(const Elf_Shdr *Shdr,
ErrorOr<StringRef> NameOrErr =
Obj.getSymbolName(NamePair.first, NamePair.second);
- if (error_code EC = NameOrErr.getError())
+ if (std::error_code EC = NameOrErr.getError())
return EC;
R.Symbol = NameOrErr.get();
@@ -170,34 +175,44 @@ error_code ELFDumper<ELFT>::dumpRelocation(const Elf_Shdr *Shdr,
}
template <class ELFT>
-error_code ELFDumper<ELFT>::dumpCommonSection(const Elf_Shdr *Shdr,
- ELFYAML::Section &S) {
+std::error_code ELFDumper<ELFT>::dumpCommonSection(const Elf_Shdr *Shdr,
+ ELFYAML::Section &S) {
S.Type = Shdr->sh_type;
S.Flags = Shdr->sh_flags;
S.Address = Shdr->sh_addr;
S.AddressAlign = Shdr->sh_addralign;
ErrorOr<StringRef> NameOrErr = Obj.getSectionName(Shdr);
- if (error_code EC = NameOrErr.getError())
+ if (std::error_code EC = NameOrErr.getError())
return EC;
S.Name = NameOrErr.get();
if (Shdr->sh_link != ELF::SHN_UNDEF) {
if (const Elf_Shdr *LinkSection = Obj.getSection(Shdr->sh_link)) {
NameOrErr = Obj.getSectionName(LinkSection);
- if (error_code EC = NameOrErr.getError())
+ if (std::error_code EC = NameOrErr.getError())
return EC;
S.Link = NameOrErr.get();
}
}
- if (Shdr->sh_info != ELF::SHN_UNDEF) {
- if (const Elf_Shdr *InfoSection = Obj.getSection(Shdr->sh_info)) {
- NameOrErr = Obj.getSectionName(InfoSection);
- if (error_code EC = NameOrErr.getError())
- return EC;
- S.Info = NameOrErr.get();
- }
+
+ return obj2yaml_error::success;
+}
+
+template <class ELFT>
+std::error_code
+ELFDumper<ELFT>::dumpCommonRelocationSection(const Elf_Shdr *Shdr,
+ ELFYAML::RelocationSection &S) {
+ if (std::error_code EC = dumpCommonSection(Shdr, S))
+ return EC;
+
+ if (const Elf_Shdr *InfoSection = Obj.getSection(Shdr->sh_info)) {
+ ErrorOr<StringRef> NameOrErr = Obj.getSectionName(InfoSection);
+ if (std::error_code EC = NameOrErr.getError())
+ return EC;
+ S.Info = NameOrErr.get();
}
+
return obj2yaml_error::success;
}
@@ -207,13 +222,13 @@ ELFDumper<ELFT>::dumpRelSection(const Elf_Shdr *Shdr) {
assert(Shdr->sh_type == ELF::SHT_REL && "Section type is not SHT_REL");
auto S = make_unique<ELFYAML::RelocationSection>();
- if (error_code EC = dumpCommonSection(Shdr, *S))
+ if (std::error_code EC = dumpCommonRelocationSection(Shdr, *S))
return EC;
for (auto RI = Obj.begin_rel(Shdr), RE = Obj.end_rel(Shdr); RI != RE;
++RI) {
ELFYAML::Relocation R;
- if (error_code EC = dumpRelocation(Shdr, &*RI, R))
+ if (std::error_code EC = dumpRelocation(Shdr, &*RI, R))
return EC;
S->Relocations.push_back(R);
}
@@ -227,13 +242,13 @@ ELFDumper<ELFT>::dumpRelaSection(const Elf_Shdr *Shdr) {
assert(Shdr->sh_type == ELF::SHT_RELA && "Section type is not SHT_RELA");
auto S = make_unique<ELFYAML::RelocationSection>();
- if (error_code EC = dumpCommonSection(Shdr, *S))
+ if (std::error_code EC = dumpCommonRelocationSection(Shdr, *S))
return EC;
for (auto RI = Obj.begin_rela(Shdr), RE = Obj.end_rela(Shdr); RI != RE;
++RI) {
ELFYAML::Relocation R;
- if (error_code EC = dumpRelocation(Shdr, &*RI, R))
+ if (std::error_code EC = dumpRelocation(Shdr, &*RI, R))
return EC;
R.Addend = RI->r_addend;
S->Relocations.push_back(R);
@@ -247,23 +262,24 @@ ErrorOr<ELFYAML::RawContentSection *>
ELFDumper<ELFT>::dumpContentSection(const Elf_Shdr *Shdr) {
auto S = make_unique<ELFYAML::RawContentSection>();
- if (error_code EC = dumpCommonSection(Shdr, *S))
+ if (std::error_code EC = dumpCommonSection(Shdr, *S))
return EC;
ErrorOr<ArrayRef<uint8_t>> ContentOrErr = Obj.getSectionContents(Shdr);
- if (error_code EC = ContentOrErr.getError())
+ if (std::error_code EC = ContentOrErr.getError())
return EC;
- S->Content = object::yaml::BinaryRef(ContentOrErr.get());
+ S->Content = yaml::BinaryRef(ContentOrErr.get());
S->Size = S->Content.binary_size();
return S.release();
}
template <class ELFT>
-static error_code elf2yaml(raw_ostream &Out, const object::ELFFile<ELFT> &Obj) {
+static std::error_code elf2yaml(raw_ostream &Out,
+ const object::ELFFile<ELFT> &Obj) {
ELFDumper<ELFT> Dumper(Obj);
ErrorOr<ELFYAML::Object *> YAMLOrErr = Dumper.dump();
- if (error_code EC = YAMLOrErr.getError())
+ if (std::error_code EC = YAMLOrErr.getError())
return EC;
std::unique_ptr<ELFYAML::Object> YAML(YAMLOrErr.get());
@@ -273,7 +289,7 @@ static error_code elf2yaml(raw_ostream &Out, const object::ELFFile<ELFT> &Obj) {
return object::object_error::success;
}
-error_code elf2yaml(raw_ostream &Out, const object::ObjectFile &Obj) {
+std::error_code elf2yaml(raw_ostream &Out, const object::ObjectFile &Obj) {
if (const auto *ELFObj = dyn_cast<object::ELF32LEObjectFile>(&Obj))
return elf2yaml(Out, *ELFObj->getELFFile());
diff --git a/tools/obj2yaml/obj2yaml.cpp b/tools/obj2yaml/obj2yaml.cpp
index 7fe034d..944314a 100644
--- a/tools/obj2yaml/obj2yaml.cpp
+++ b/tools/obj2yaml/obj2yaml.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
using namespace llvm::object;
-static error_code dumpObject(const ObjectFile &Obj) {
+static std::error_code dumpObject(const ObjectFile &Obj) {
if (Obj.isCOFF())
return coff2yaml(outs(), cast<COFFObjectFile>(Obj));
if (Obj.isELF())
@@ -28,12 +28,12 @@ static error_code dumpObject(const ObjectFile &Obj) {
return obj2yaml_error::unsupported_obj_file_format;
}
-static error_code dumpInput(StringRef File) {
+static std::error_code dumpInput(StringRef File) {
if (File != "-" && !sys::fs::exists(File))
return obj2yaml_error::file_not_found;
ErrorOr<Binary *> BinaryOrErr = createBinary(File);
- if (error_code EC = BinaryOrErr.getError())
+ if (std::error_code EC = BinaryOrErr.getError())
return EC;
std::unique_ptr<Binary> Binary(BinaryOrErr.get());
@@ -53,7 +53,7 @@ int main(int argc, char *argv[]) {
PrettyStackTraceProgram X(argc, argv);
llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
- if (error_code EC = dumpInput(InputFilename)) {
+ if (std::error_code EC = dumpInput(InputFilename)) {
errs() << "Error: '" << EC.message() << "'\n";
return 1;
}
diff --git a/tools/obj2yaml/obj2yaml.h b/tools/obj2yaml/obj2yaml.h
index 73c58fa..6d81110 100644
--- a/tools/obj2yaml/obj2yaml.h
+++ b/tools/obj2yaml/obj2yaml.h
@@ -15,11 +15,11 @@
#include "llvm/Object/COFF.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
-llvm::error_code coff2yaml(llvm::raw_ostream &Out,
- const llvm::object::COFFObjectFile &Obj);
-llvm::error_code elf2yaml(llvm::raw_ostream &Out,
- const llvm::object::ObjectFile &Obj);
+std::error_code coff2yaml(llvm::raw_ostream &Out,
+ const llvm::object::COFFObjectFile &Obj);
+std::error_code elf2yaml(llvm::raw_ostream &Out,
+ const llvm::object::ObjectFile &Obj);
#endif
diff --git a/tools/opt/Android.mk b/tools/opt/Android.mk
index 3ebb97e..6f3f48d 100644
--- a/tools/opt/Android.mk
+++ b/tools/opt/Android.mk
@@ -58,6 +58,7 @@ llvm_opt_STATIC_LIBRARIES := \
libLLVMTransformUtils \
libLLVMTarget \
libLLVMMC \
+ libLLVMMCParser \
libLLVMObject \
libLLVMCore \
libLLVMAsmParser \
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 6f0fbf6..6ba6340 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -336,6 +336,7 @@ int main(int argc, char **argv) {
InitializeAllTargets();
InitializeAllTargetMCs();
+ InitializeAllAsmPrinters();
// Initialize passes
PassRegistry &Registry = *PassRegistry::getPassRegistry();
diff --git a/tools/yaml2obj/Android.mk b/tools/yaml2obj/Android.mk
index d69075a..3242c31 100644
--- a/tools/yaml2obj/Android.mk
+++ b/tools/yaml2obj/Android.mk
@@ -14,6 +14,8 @@ yaml2obj_SRC_FILES := \
yaml2obj_STATIC_LIBRARIES := \
libLLVMObject \
+ libLLVMMC \
+ libLLVMMCParser \
libLLVMBitReader \
libLLVMCore \
libLLVMSupport \
diff --git a/tools/yaml2obj/yaml2coff.cpp b/tools/yaml2obj/yaml2coff.cpp
index a0ede24..c772db9 100644
--- a/tools/yaml2obj/yaml2coff.cpp
+++ b/tools/yaml2obj/yaml2coff.cpp
@@ -327,8 +327,7 @@ bool writeCOFF(COFFParser &CP, raw_ostream &OS) {
return true;
}
-int yaml2coff(llvm::raw_ostream &Out, llvm::MemoryBuffer *Buf) {
- yaml::Input YIn(Buf->getBuffer());
+int yaml2coff(yaml::Input &YIn, raw_ostream &Out) {
COFFYAML::Object Doc;
YIn >> Doc;
if (YIn.error()) {
diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index bb52cda..6eeecae 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp
@@ -14,9 +14,9 @@
#include "yaml2obj.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ELFYAML.h"
-#include "llvm/Object/StringTableBuilder.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/YAMLTraits.h"
@@ -304,6 +304,7 @@ void ELFState<ELFT>::addSymbols(const std::vector<ELFYAML::Symbol> &Symbols,
Symbol.st_shndx = Index;
} // else Symbol.st_shndex == SHN_UNDEF (== 0), since it was zero'd earlier.
Symbol.st_value = Sym.Value;
+ Symbol.st_other = Sym.Visibility;
Symbol.st_size = Sym.Size;
Syms.push_back(Symbol);
}
@@ -467,8 +468,7 @@ static bool isLittleEndian(const ELFYAML::Object &Doc) {
return Doc.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB);
}
-int yaml2elf(llvm::raw_ostream &Out, llvm::MemoryBuffer *Buf) {
- yaml::Input YIn(Buf->getBuffer());
+int yaml2elf(yaml::Input &YIn, raw_ostream &Out) {
ELFYAML::Object Doc;
YIn >> Doc;
if (YIn.error()) {
diff --git a/tools/yaml2obj/yaml2obj.cpp b/tools/yaml2obj/yaml2obj.cpp
index 2493b48..945fad1 100644
--- a/tools/yaml2obj/yaml2obj.cpp
+++ b/tools/yaml2obj/yaml2obj.cpp
@@ -15,15 +15,17 @@
//===----------------------------------------------------------------------===//
#include "yaml2obj.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <system_error>
using namespace llvm;
@@ -51,9 +53,27 @@ cl::opt<YAMLObjectFormat> Format(
clEnumValN(YOF_ELF, "elf", "ELF object file format"),
clEnumValEnd));
+cl::opt<unsigned>
+DocNum("docnum", cl::init(1),
+ cl::desc("Read specified document from input (default = 1)"));
+
static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
cl::value_desc("filename"));
+typedef int (*ConvertFuncPtr)(yaml::Input &YIn, raw_ostream &Out);
+
+int convertYAML(yaml::Input &YIn, raw_ostream &Out, ConvertFuncPtr Convert) {
+ unsigned CurDocNum = 0;
+ do {
+ if (++CurDocNum == DocNum)
+ return Convert(YIn, Out);
+ } while (YIn.nextDocument());
+
+ errs() << "yaml2obj: Cannot find the " << DocNum
+ << llvm::getOrdinalSuffix(DocNum) << " document\n";
+ return 1;
+}
+
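// With convertYAML in place, the input may hold several YAML documents
// separated by "---" lines, and -docnum selects which one to convert. An
// illustrative invocation (file names made up):
//
//   yaml2obj -format=elf -docnum=2 two-docs.yaml -o second.o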
int main(int argc, char **argv) {
cl::ParseCommandLineOptions(argc, argv);
sys::PrintStackTraceOnErrorSignal();
@@ -71,18 +91,24 @@ int main(int argc, char **argv) {
return 1;
}
- std::unique_ptr<MemoryBuffer> Buf;
- if (MemoryBuffer::getFileOrSTDIN(Input, Buf))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ MemoryBuffer::getFileOrSTDIN(Input);
+ if (!Buf)
return 1;
- int Res = 1;
+ ConvertFuncPtr Convert = nullptr;
if (Format == YOF_COFF)
- Res = yaml2coff(Out->os(), Buf.get());
+ Convert = yaml2coff;
else if (Format == YOF_ELF)
- Res = yaml2elf(Out->os(), Buf.get());
- else
+ Convert = yaml2elf;
+ else {
errs() << "Not yet implemented\n";
+ return 1;
+ }
+
+ yaml::Input YIn(Buf.get()->getBuffer());
+ int Res = convertYAML(YIn, Out->os(), Convert);
if (Res == 0)
Out->keep();
diff --git a/tools/yaml2obj/yaml2obj.h b/tools/yaml2obj/yaml2obj.h
index 095435c..086f641 100644
--- a/tools/yaml2obj/yaml2obj.h
+++ b/tools/yaml2obj/yaml2obj.h
@@ -13,10 +13,12 @@
#define LLVM_TOOLS_YAML2OBJ_H
namespace llvm {
- class raw_ostream;
- class MemoryBuffer;
+class raw_ostream;
+namespace yaml {
+class Input;
}
-int yaml2coff(llvm::raw_ostream &Out, llvm::MemoryBuffer *Buf);
-int yaml2elf(llvm::raw_ostream &Out, llvm::MemoryBuffer *Buf);
+}
+int yaml2coff(llvm::yaml::Input &YIn, llvm::raw_ostream &Out);
+int yaml2elf(llvm::yaml::Input &YIn, llvm::raw_ostream &Out);
#endif
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index e57c8d4..8f298cd 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -1173,11 +1173,11 @@ TEST(APFloatTest, exactInverse) {
EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(8.5070592e+37f)));
// Large float, inverse is a denormal.
- EXPECT_FALSE(APFloat(1.7014118e38f).getExactInverse(0));
+ EXPECT_FALSE(APFloat(1.7014118e38f).getExactInverse(nullptr));
// Zero
- EXPECT_FALSE(APFloat(0.0).getExactInverse(0));
+ EXPECT_FALSE(APFloat(0.0).getExactInverse(nullptr));
// Denormalized float
- EXPECT_FALSE(APFloat(1.40129846e-45f).getExactInverse(0));
+ EXPECT_FALSE(APFloat(1.40129846e-45f).getExactInverse(nullptr));
}
TEST(APFloatTest, roundToIntegral) {
@@ -1844,10 +1844,10 @@ TEST(APFloatTest, subtract) {
{ PInf, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PInf, PZero, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PInf, MZero, "inf", APFloat::opOK, APFloat::fcInfinity },
- { PInf, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PInf, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PInf, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PInf, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PInf, PNormalValue, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PInf, MNormalValue, "inf", APFloat::opOK, APFloat::fcInfinity },
@@ -1861,10 +1861,10 @@ TEST(APFloatTest, subtract) {
{ MInf, MInf, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
{ MInf, PZero, "-inf", APFloat::opOK, APFloat::fcInfinity },
{ MInf, MZero, "-inf", APFloat::opOK, APFloat::fcInfinity },
- { MInf, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MInf, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MInf, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MInf, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MInf, PNormalValue, "-inf", APFloat::opOK, APFloat::fcInfinity },
{ MInf, MNormalValue, "-inf", APFloat::opOK, APFloat::fcInfinity },
@@ -1878,10 +1878,10 @@ TEST(APFloatTest, subtract) {
{ PZero, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PZero, PZero, "0x0p+0", APFloat::opOK, APFloat::fcZero },
{ PZero, MZero, "0x0p+0", APFloat::opOK, APFloat::fcZero },
- { PZero, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PZero, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PZero, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PZero, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PZero, PNormalValue, "-0x1p+0", APFloat::opOK, APFloat::fcNormal },
{ PZero, MNormalValue, "0x1p+0", APFloat::opOK, APFloat::fcNormal },
@@ -1895,10 +1895,10 @@ TEST(APFloatTest, subtract) {
{ MZero, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ MZero, PZero, "-0x0p+0", APFloat::opOK, APFloat::fcZero },
{ MZero, MZero, "0x0p+0", APFloat::opOK, APFloat::fcZero },
- { MZero, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MZero, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MZero, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MZero, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MZero, PNormalValue, "-0x1p+0", APFloat::opOK, APFloat::fcNormal },
{ MZero, MNormalValue, "0x1p+0", APFloat::opOK, APFloat::fcNormal },
@@ -1946,10 +1946,10 @@ TEST(APFloatTest, subtract) {
{ PNormalValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PNormalValue, PZero, "0x1p+0", APFloat::opOK, APFloat::fcNormal },
{ PNormalValue, MZero, "0x1p+0", APFloat::opOK, APFloat::fcNormal },
- { PNormalValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PNormalValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PNormalValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PNormalValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PNormalValue, PNormalValue, "0x0p+0", APFloat::opOK, APFloat::fcZero },
{ PNormalValue, MNormalValue, "0x1p+1", APFloat::opOK, APFloat::fcNormal },
@@ -1963,10 +1963,10 @@ TEST(APFloatTest, subtract) {
{ MNormalValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ MNormalValue, PZero, "-0x1p+0", APFloat::opOK, APFloat::fcNormal },
{ MNormalValue, MZero, "-0x1p+0", APFloat::opOK, APFloat::fcNormal },
- { MNormalValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MNormalValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MNormalValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MNormalValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MNormalValue, PNormalValue, "-0x1p+1", APFloat::opOK, APFloat::fcNormal },
{ MNormalValue, MNormalValue, "0x0p+0", APFloat::opOK, APFloat::fcZero },
@@ -1980,10 +1980,10 @@ TEST(APFloatTest, subtract) {
{ PLargestValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PLargestValue, PZero, "0x1.fffffep+127", APFloat::opOK, APFloat::fcNormal },
{ PLargestValue, MZero, "0x1.fffffep+127", APFloat::opOK, APFloat::fcNormal },
- { PLargestValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PLargestValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PLargestValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PLargestValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PLargestValue, PNormalValue, "0x1.fffffep+127", APFloat::opInexact, APFloat::fcNormal },
{ PLargestValue, MNormalValue, "0x1.fffffep+127", APFloat::opInexact, APFloat::fcNormal },
@@ -1997,10 +1997,10 @@ TEST(APFloatTest, subtract) {
{ MLargestValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ MLargestValue, PZero, "-0x1.fffffep+127", APFloat::opOK, APFloat::fcNormal },
{ MLargestValue, MZero, "-0x1.fffffep+127", APFloat::opOK, APFloat::fcNormal },
- { MLargestValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MLargestValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MLargestValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MLargestValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MLargestValue, PNormalValue, "-0x1.fffffep+127", APFloat::opInexact, APFloat::fcNormal },
{ MLargestValue, MNormalValue, "-0x1.fffffep+127", APFloat::opInexact, APFloat::fcNormal },
@@ -2014,10 +2014,10 @@ TEST(APFloatTest, subtract) {
{ PSmallestValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PSmallestValue, PZero, "0x1p-149", APFloat::opOK, APFloat::fcNormal },
{ PSmallestValue, MZero, "0x1p-149", APFloat::opOK, APFloat::fcNormal },
- { PSmallestValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PSmallestValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PSmallestValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PSmallestValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PSmallestValue, PNormalValue, "-0x1p+0", APFloat::opInexact, APFloat::fcNormal },
{ PSmallestValue, MNormalValue, "0x1p+0", APFloat::opInexact, APFloat::fcNormal },
@@ -2031,10 +2031,10 @@ TEST(APFloatTest, subtract) {
{ MSmallestValue, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ MSmallestValue, PZero, "-0x1p-149", APFloat::opOK, APFloat::fcNormal },
{ MSmallestValue, MZero, "-0x1p-149", APFloat::opOK, APFloat::fcNormal },
- { MSmallestValue, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MSmallestValue, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MSmallestValue, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MSmallestValue, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MSmallestValue, PNormalValue, "-0x1p+0", APFloat::opInexact, APFloat::fcNormal },
{ MSmallestValue, MNormalValue, "0x1p+0", APFloat::opInexact, APFloat::fcNormal },
@@ -2048,10 +2048,10 @@ TEST(APFloatTest, subtract) {
{ PSmallestNormalized, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ PSmallestNormalized, PZero, "0x1p-126", APFloat::opOK, APFloat::fcNormal },
{ PSmallestNormalized, MZero, "0x1p-126", APFloat::opOK, APFloat::fcNormal },
- { PSmallestNormalized, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { PSmallestNormalized, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { PSmallestNormalized, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { PSmallestNormalized, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ PSmallestNormalized, PNormalValue, "-0x1p+0", APFloat::opInexact, APFloat::fcNormal },
{ PSmallestNormalized, MNormalValue, "0x1p+0", APFloat::opInexact, APFloat::fcNormal },
@@ -2065,10 +2065,10 @@ TEST(APFloatTest, subtract) {
{ MSmallestNormalized, MInf, "inf", APFloat::opOK, APFloat::fcInfinity },
{ MSmallestNormalized, PZero, "-0x1p-126", APFloat::opOK, APFloat::fcNormal },
{ MSmallestNormalized, MZero, "-0x1p-126", APFloat::opOK, APFloat::fcNormal },
- { MSmallestNormalized, QNaN, "nan", APFloat::opOK, APFloat::fcNaN },
+ { MSmallestNormalized, QNaN, "-nan", APFloat::opOK, APFloat::fcNaN },
#if 0
// See Note 1.
- { MSmallestNormalized, SNaN, "nan", APFloat::opInvalidOp, APFloat::fcNaN },
+ { MSmallestNormalized, SNaN, "-nan", APFloat::opInvalidOp, APFloat::fcNaN },
#endif
{ MSmallestNormalized, PNormalValue, "-0x1p+0", APFloat::opInexact, APFloat::fcNormal },
{ MSmallestNormalized, MNormalValue, "0x1p+0", APFloat::opInexact, APFloat::fcNormal },
diff --git a/unittests/ADT/ArrayRefTest.cpp b/unittests/ADT/ArrayRefTest.cpp
index 7133ca7..293afc6 100644
--- a/unittests/ADT/ArrayRefTest.cpp
+++ b/unittests/ADT/ArrayRefTest.cpp
@@ -29,5 +29,12 @@ TEST(ArrayRefTest, AllocatorCopy) {
EXPECT_NE(Array2.data(), Array2c.data());
}
+TEST(ArrayRefTest, DropBack) {
+ static const int TheNumbers[] = {4, 8, 15, 16, 23, 42};
+ ArrayRef<int> AR1(TheNumbers);
+ ArrayRef<int> AR2(TheNumbers, AR1.size() - 1);
+ EXPECT_TRUE(AR1.drop_back().equals(AR2));
+}
+
} // end anonymous namespace
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index 5119723..0f214f3 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt
@@ -23,7 +23,6 @@ set(ADTSources
MakeUniqueTest.cpp
MapVectorTest.cpp
OptionalTest.cpp
- OwningPtrTest.cpp
PackedVectorTest.cpp
PointerIntPairTest.cpp
PointerUnionTest.cpp
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index dd49071..75a910a 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -92,9 +92,9 @@ protected:
};
template <typename T>
-typename T::key_type *const DenseMapTest<T>::dummy_key_ptr = 0;
+typename T::key_type *const DenseMapTest<T>::dummy_key_ptr = nullptr;
template <typename T>
-typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = 0;
+typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = nullptr;
// Register these types for testing.
typedef ::testing::Types<DenseMap<uint32_t, uint32_t>,
@@ -345,7 +345,7 @@ TEST(DenseMapCustomTest, FindAsTest) {
EXPECT_EQ(3u, map.size());
// Normal lookup tests
- EXPECT_EQ(1, map.count(1));
+ EXPECT_EQ(1u, map.count(1));
EXPECT_EQ(1u, map.find(0)->second);
EXPECT_EQ(2u, map.find(1)->second);
EXPECT_EQ(3u, map.find(2)->second);
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index ada5f6d..154c589 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -24,7 +24,7 @@ TEST_F(DenseSetTest, DoubleEntrySetTest) {
set.insert(0);
set.insert(1);
// Original failure was an infinite loop in this call:
- EXPECT_EQ(0, set.count(2));
+ EXPECT_EQ(0u, set.count(2));
}
}
diff --git a/unittests/ADT/HashingTest.cpp b/unittests/ADT/HashingTest.cpp
index 60917ae..acaa83c 100644
--- a/unittests/ADT/HashingTest.cpp
+++ b/unittests/ADT/HashingTest.cpp
@@ -58,7 +58,7 @@ enum TestEnumeration {
TEST(HashingTest, HashValueBasicTest) {
int x = 42, y = 43, c = 'x';
- void *p = 0;
+ void *p = nullptr;
uint64_t i = 71;
const unsigned ci = 71;
volatile int vi = 71;
diff --git a/unittests/ADT/ImmutableMapTest.cpp b/unittests/ADT/ImmutableMapTest.cpp
index 774581c..6a99884 100644
--- a/unittests/ADT/ImmutableMapTest.cpp
+++ b/unittests/ADT/ImmutableMapTest.cpp
@@ -36,8 +36,8 @@ TEST(ImmutableMapTest, MultiElemIntMapTest) {
EXPECT_TRUE(S.isEmpty());
EXPECT_FALSE(S2.isEmpty());
- EXPECT_EQ(0, S.lookup(3));
- EXPECT_EQ(0, S.lookup(9));
+ EXPECT_EQ(nullptr, S.lookup(3));
+ EXPECT_EQ(nullptr, S.lookup(9));
EXPECT_EQ(10, *S2.lookup(3));
EXPECT_EQ(11, *S2.lookup(4));
diff --git a/unittests/ADT/OwningPtrTest.cpp b/unittests/ADT/OwningPtrTest.cpp
deleted file mode 100644
index aee955b..0000000
--- a/unittests/ADT/OwningPtrTest.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-//===- llvm/unittest/ADT/OwningPtrTest.cpp - OwningPtr unit tests ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/OwningPtr.h"
-#include "gtest/gtest.h"
-using namespace llvm;
-
-namespace {
-
-struct TrackDestructor {
- static unsigned Destructions;
- int val;
- explicit TrackDestructor(int val) : val(val) {}
- ~TrackDestructor() { ++Destructions; }
- static void ResetCounts() { Destructions = 0; }
-
-private:
- TrackDestructor(const TrackDestructor &other) LLVM_DELETED_FUNCTION;
- TrackDestructor &
- operator=(const TrackDestructor &other) LLVM_DELETED_FUNCTION;
- TrackDestructor(TrackDestructor &&other) LLVM_DELETED_FUNCTION;
- TrackDestructor &operator=(TrackDestructor &&other) LLVM_DELETED_FUNCTION;
-};
-
-unsigned TrackDestructor::Destructions = 0;
-
-// Test fixture
-class OwningPtrTest : public testing::Test {};
-
-TEST_F(OwningPtrTest, DefaultConstruction) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> O;
- EXPECT_FALSE(O);
- EXPECT_TRUE(!O);
- EXPECT_FALSE(O.get());
- EXPECT_FALSE(O.isValid());
- }
- EXPECT_EQ(0u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, PtrConstruction) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> O(new TrackDestructor(3));
- EXPECT_TRUE((bool)O);
- EXPECT_FALSE(!O);
- EXPECT_TRUE(O.get());
- EXPECT_TRUE(O.isValid());
- EXPECT_EQ(3, (*O).val);
- EXPECT_EQ(3, O->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, Reset) {
- TrackDestructor::ResetCounts();
- OwningPtr<TrackDestructor> O(new TrackDestructor(3));
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- O.reset();
- EXPECT_FALSE((bool)O);
- EXPECT_TRUE(!O);
- EXPECT_FALSE(O.get());
- EXPECT_FALSE(O.isValid());
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, Take) {
- TrackDestructor::ResetCounts();
- TrackDestructor *T = 0;
- {
- OwningPtr<TrackDestructor> O(new TrackDestructor(3));
- T = O.take();
- EXPECT_FALSE((bool)O);
- EXPECT_TRUE(!O);
- EXPECT_FALSE(O.get());
- EXPECT_FALSE(O.isValid());
- EXPECT_TRUE(T);
- EXPECT_EQ(3, T->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- delete T;
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, Release) {
- TrackDestructor::ResetCounts();
- TrackDestructor *T = 0;
- {
- OwningPtr<TrackDestructor> O(new TrackDestructor(3));
- T = O.release();
- EXPECT_FALSE((bool)O);
- EXPECT_TRUE(!O);
- EXPECT_FALSE(O.get());
- EXPECT_FALSE(O.isValid());
- EXPECT_TRUE(T);
- EXPECT_EQ(3, T->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- delete T;
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, MoveConstruction) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B = std::move(A);
- EXPECT_FALSE((bool)A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_FALSE(A.isValid());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, MoveAssignment) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B(new TrackDestructor(4));
- B = std::move(A);
- EXPECT_FALSE(A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_FALSE(A.isValid());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(1u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(2u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, Swap) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B(new TrackDestructor(4));
- B.swap(A);
- EXPECT_TRUE((bool)A);
- EXPECT_FALSE(!A);
- EXPECT_TRUE(A.get());
- EXPECT_TRUE(A.isValid());
- EXPECT_EQ(4, (*A).val);
- EXPECT_EQ(4, A->val);
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(2u, TrackDestructor::Destructions);
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B(new TrackDestructor(4));
- swap(A, B);
- EXPECT_TRUE((bool)A);
- EXPECT_FALSE(!A);
- EXPECT_TRUE(A.get());
- EXPECT_TRUE(A.isValid());
- EXPECT_EQ(4, (*A).val);
- EXPECT_EQ(4, A->val);
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(2u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, UniqueToOwningConstruction) {
- TrackDestructor::ResetCounts();
- {
- std::unique_ptr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B = std::move(A);
- EXPECT_FALSE(A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, UniqueToOwningAssignment) {
- TrackDestructor::ResetCounts();
- {
- std::unique_ptr<TrackDestructor> A(new TrackDestructor(3));
- OwningPtr<TrackDestructor> B(new TrackDestructor(4));
- B = std::move(A);
- EXPECT_FALSE(A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_TRUE(B.isValid());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(1u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(2u, TrackDestructor::Destructions);
-}
-
-TEST_F(OwningPtrTest, TakeUniqueConstruction) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- std::unique_ptr<TrackDestructor> B = A.take_unique();
- EXPECT_FALSE(A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_FALSE(A.isValid());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-
-#if LLVM_HAS_RVALUE_REFERENCE_THIS
-TEST_F(OwningPtrTest, OwningToUniqueConstruction) {
- TrackDestructor::ResetCounts();
- {
- OwningPtr<TrackDestructor> A(new TrackDestructor(3));
- std::unique_ptr<TrackDestructor> B = std::move(A);
- EXPECT_FALSE(A);
- EXPECT_TRUE(!A);
- EXPECT_FALSE(A.get());
- EXPECT_FALSE(A.isValid());
- EXPECT_TRUE((bool)B);
- EXPECT_FALSE(!B);
- EXPECT_TRUE(B.get());
- EXPECT_EQ(3, (*B).val);
- EXPECT_EQ(3, B->val);
- EXPECT_EQ(0u, TrackDestructor::Destructions);
- }
- EXPECT_EQ(1u, TrackDestructor::Destructions);
-}
-#endif
-}
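The file deleted above was the unit-test coverage for `OwningPtr`, which this series retires in favor of `std::unique_ptr`. A minimal sketch of the core destruction-count check rewritten against `std::unique_ptr` (test and type names here are illustrative, not part of the patch):

```cpp
#include <memory>
#include "gtest/gtest.h"

struct TrackDestructor {
  static unsigned Destructions;
  ~TrackDestructor() { ++Destructions; }
};
unsigned TrackDestructor::Destructions = 0;

TEST(UniquePtrSketch, DestroysOnScopeExit) {
  TrackDestructor::Destructions = 0;
  {
    std::unique_ptr<TrackDestructor> P(new TrackDestructor);
    EXPECT_TRUE((bool)P);
    EXPECT_EQ(0u, TrackDestructor::Destructions);
  }
  // Exactly one destruction when the owner leaves scope: the same
  // ownership property OwningPtrTest.cpp verified.
  EXPECT_EQ(1u, TrackDestructor::Destructions);
}
```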
diff --git a/unittests/ADT/PointerUnionTest.cpp b/unittests/ADT/PointerUnionTest.cpp
index 3bfb79c..a592784 100644
--- a/unittests/ADT/PointerUnionTest.cpp
+++ b/unittests/ADT/PointerUnionTest.cpp
@@ -66,7 +66,7 @@ TEST_F(PointerUnionTest, Is) {
TEST_F(PointerUnionTest, Get) {
EXPECT_EQ(a.get<float *>(), &f);
EXPECT_EQ(b.get<int *>(), &i);
- EXPECT_EQ(n.get<int *>(), (int *)0);
+ EXPECT_EQ(n.get<int *>(), (int *)nullptr);
}
} // end anonymous namespace
diff --git a/unittests/ADT/SCCIteratorTest.cpp b/unittests/ADT/SCCIteratorTest.cpp
index 8609732..3f1ba1c 100644
--- a/unittests/ADT/SCCIteratorTest.cpp
+++ b/unittests/ADT/SCCIteratorTest.cpp
@@ -213,7 +213,7 @@ public:
// Return a pointer to it.
return FirstNode + i;
assert(false && "Dereferencing end iterator!");
- return 0; // Avoid compiler warning.
+ return nullptr; // Avoid compiler warning.
}
};
diff --git a/unittests/ADT/SmallVectorTest.cpp b/unittests/ADT/SmallVectorTest.cpp
index 58f5591..95bf33e 100644
--- a/unittests/ADT/SmallVectorTest.cpp
+++ b/unittests/ADT/SmallVectorTest.cpp
@@ -26,8 +26,12 @@ namespace {
class Constructable {
private:
static int numConstructorCalls;
+ static int numMoveConstructorCalls;
+ static int numCopyConstructorCalls;
static int numDestructorCalls;
static int numAssignmentCalls;
+ static int numMoveAssignmentCalls;
+ static int numCopyAssignmentCalls;
bool constructed;
int value;
@@ -44,11 +48,13 @@ public:
Constructable(const Constructable & src) : constructed(true) {
value = src.value;
++numConstructorCalls;
+ ++numCopyConstructorCalls;
}
Constructable(Constructable && src) : constructed(true) {
value = src.value;
++numConstructorCalls;
+ ++numMoveConstructorCalls;
}
~Constructable() {
@@ -61,6 +67,7 @@ public:
EXPECT_TRUE(constructed);
value = src.value;
++numAssignmentCalls;
+ ++numCopyAssignmentCalls;
return *this;
}
@@ -68,6 +75,7 @@ public:
EXPECT_TRUE(constructed);
value = src.value;
++numAssignmentCalls;
+ ++numMoveAssignmentCalls;
return *this;
}
@@ -77,18 +85,42 @@ public:
static void reset() {
numConstructorCalls = 0;
+ numMoveConstructorCalls = 0;
+ numCopyConstructorCalls = 0;
numDestructorCalls = 0;
numAssignmentCalls = 0;
+ numMoveAssignmentCalls = 0;
+ numCopyAssignmentCalls = 0;
}
static int getNumConstructorCalls() {
return numConstructorCalls;
}
+ static int getNumMoveConstructorCalls() {
+ return numMoveConstructorCalls;
+ }
+
+ static int getNumCopyConstructorCalls() {
+ return numCopyConstructorCalls;
+ }
+
static int getNumDestructorCalls() {
return numDestructorCalls;
}
+ static int getNumAssignmentCalls() {
+ return numAssignmentCalls;
+ }
+
+ static int getNumMoveAssignmentCalls() {
+ return numMoveAssignmentCalls;
+ }
+
+ static int getNumCopyAssignmentCalls() {
+ return numCopyAssignmentCalls;
+ }
+
friend bool operator==(const Constructable & c0, const Constructable & c1) {
return c0.getValue() == c1.getValue();
}
@@ -100,8 +132,26 @@ public:
};
int Constructable::numConstructorCalls;
+int Constructable::numCopyConstructorCalls;
+int Constructable::numMoveConstructorCalls;
int Constructable::numDestructorCalls;
int Constructable::numAssignmentCalls;
+int Constructable::numCopyAssignmentCalls;
+int Constructable::numMoveAssignmentCalls;
+
+struct NonCopyable {
+ NonCopyable() {}
+ NonCopyable(NonCopyable &&) {}
+ NonCopyable &operator=(NonCopyable &&) { return *this; }
+private:
+ NonCopyable(const NonCopyable &) LLVM_DELETED_FUNCTION;
+ NonCopyable &operator=(const NonCopyable &) LLVM_DELETED_FUNCTION;
+};
+
+LLVM_ATTRIBUTE_USED void CompileTest() {
+ SmallVector<NonCopyable, 0> V;
+ V.resize(42);
+}
// Test fixture class
template <typename VectorT>
@@ -148,7 +198,8 @@ protected:
typedef ::testing::Types<SmallVector<Constructable, 0>,
SmallVector<Constructable, 1>,
SmallVector<Constructable, 2>,
- SmallVector<Constructable, 4>
+ SmallVector<Constructable, 4>,
+ SmallVector<Constructable, 5>
> SmallVectorTestTypes;
TYPED_TEST_CASE(SmallVectorTest, SmallVectorTestTypes);
@@ -240,13 +291,26 @@ TYPED_TEST(SmallVectorTest, ResizeGrowTest) {
this->theVector.resize(2);
- // The extra constructor/destructor calls come from the temporary object used
- // to initialize the contents of the resized array (via copy construction).
- EXPECT_EQ(3, Constructable::getNumConstructorCalls());
- EXPECT_EQ(1, Constructable::getNumDestructorCalls());
+ EXPECT_EQ(2, Constructable::getNumConstructorCalls());
+ EXPECT_EQ(0, Constructable::getNumDestructorCalls());
EXPECT_EQ(2u, this->theVector.size());
}
+TYPED_TEST(SmallVectorTest, ResizeWithElementsTest) {
+ this->theVector.resize(2);
+
+ Constructable::reset();
+
+ this->theVector.resize(4);
+
+ size_t Ctors = Constructable::getNumConstructorCalls();
+ EXPECT_TRUE(Ctors == 2 || Ctors == 4);
+ size_t MoveCtors = Constructable::getNumMoveConstructorCalls();
+ EXPECT_TRUE(MoveCtors == 0 || MoveCtors == 2);
+ size_t Dtors = Constructable::getNumDestructorCalls();
+ EXPECT_TRUE(Dtors == 0 || Dtors == 2);
+}
+
// Resize with fill value.
TYPED_TEST(SmallVectorTest, ResizeFillTest) {
SCOPED_TRACE("ResizeFillTest");
@@ -413,22 +477,67 @@ TYPED_TEST(SmallVectorTest, InsertTest) {
this->assertValuesInOrder(this->theVector, 4u, 1, 77, 2, 3);
}
+// Insert a copy of a single element.
+TYPED_TEST(SmallVectorTest, InsertCopy) {
+ SCOPED_TRACE("InsertTest");
+
+ this->makeSequence(this->theVector, 1, 3);
+ Constructable C(77);
+ typename TypeParam::iterator I =
+ this->theVector.insert(this->theVector.begin() + 1, C);
+ EXPECT_EQ(this->theVector.begin() + 1, I);
+ this->assertValuesInOrder(this->theVector, 4u, 1, 77, 2, 3);
+}
+
// Insert repeated elements.
TYPED_TEST(SmallVectorTest, InsertRepeatedTest) {
SCOPED_TRACE("InsertRepeatedTest");
- this->makeSequence(this->theVector, 10, 15);
- typename TypeParam::iterator I =
- this->theVector.insert(this->theVector.begin() + 1, 2, Constructable(16));
+ this->makeSequence(this->theVector, 1, 4);
+ Constructable::reset();
+ auto I =
+ this->theVector.insert(this->theVector.begin() + 1, 2, Constructable(16));
+ // Move construct the top two elements into newly allocated space, and
+ // optionally reallocate the whole buffer, move constructing into it.
+ // FIXME: This is inefficient, we shouldn't move things into newly allocated
+ // space, then move them up/around, there should only be 2 or 4 move
+ // constructions here.
+ EXPECT_TRUE(Constructable::getNumMoveConstructorCalls() == 2 ||
+ Constructable::getNumMoveConstructorCalls() == 6);
+ // Move assign the next element to shift it up and complete the gap.
+ EXPECT_EQ(1, Constructable::getNumMoveAssignmentCalls());
+ // Copy assign the two new elements from the parameter into the gap.
+ EXPECT_EQ(2, Constructable::getNumCopyAssignmentCalls());
+ // All without any copy construction.
+ EXPECT_EQ(0, Constructable::getNumCopyConstructorCalls());
EXPECT_EQ(this->theVector.begin() + 1, I);
- this->assertValuesInOrder(this->theVector, 8u,
- 10, 16, 16, 11, 12, 13, 14, 15);
+ this->assertValuesInOrder(this->theVector, 6u, 1, 16, 16, 2, 3, 4);
+}
- // Insert at end.
- I = this->theVector.insert(this->theVector.end(), 2, Constructable(16));
- EXPECT_EQ(this->theVector.begin() + 8, I);
- this->assertValuesInOrder(this->theVector, 10u,
- 10, 16, 16, 11, 12, 13, 14, 15, 16, 16);
+
+TYPED_TEST(SmallVectorTest, InsertRepeatedAtEndTest) {
+ SCOPED_TRACE("InsertRepeatedTest");
+
+ this->makeSequence(this->theVector, 1, 4);
+ Constructable::reset();
+ auto I = this->theVector.insert(this->theVector.end(), 2, Constructable(16));
+ // Just copy construct them into newly allocated space.
+ EXPECT_EQ(2, Constructable::getNumCopyConstructorCalls());
+ // Move everything across if reallocation is needed.
+ EXPECT_TRUE(Constructable::getNumMoveConstructorCalls() == 0 ||
+ Constructable::getNumMoveConstructorCalls() == 4);
+ // Without ever moving or copying anything else.
+ EXPECT_EQ(0, Constructable::getNumCopyAssignmentCalls());
+ EXPECT_EQ(0, Constructable::getNumMoveAssignmentCalls());
+
+ EXPECT_EQ(this->theVector.begin() + 4, I);
+ this->assertValuesInOrder(this->theVector, 6u, 1, 2, 3, 4, 16, 16);
+}
+
+TYPED_TEST(SmallVectorTest, InsertRepeatedEmptyTest) {
+ SCOPED_TRACE("InsertRepeatedTest");
+
+ this->makeSequence(this->theVector, 10, 15);
// Empty insert.
EXPECT_EQ(this->theVector.end(),
@@ -447,16 +556,53 @@ TYPED_TEST(SmallVectorTest, InsertRangeTest) {
{ Constructable(77), Constructable(77), Constructable(77) };
this->makeSequence(this->theVector, 1, 3);
- typename TypeParam::iterator I =
- this->theVector.insert(this->theVector.begin() + 1, Arr, Arr+3);
+ Constructable::reset();
+ auto I = this->theVector.insert(this->theVector.begin() + 1, Arr, Arr + 3);
+ // Move construct the top two elements into newly allocated space.
+ // Possibly move the whole sequence into new space first.
+ // FIXME: This is inefficient, we shouldn't move things into newly allocated
+ // space, then move them up/around, there should only be 2 or 3 move
+ // constructions here.
+ EXPECT_TRUE(Constructable::getNumMoveConstructorCalls() == 2 ||
+ Constructable::getNumMoveConstructorCalls() == 5);
+ // Copy assign the lower 2 new elements into existing space.
+ EXPECT_EQ(2, Constructable::getNumCopyAssignmentCalls());
+ // Copy construct the third element into newly allocated space.
+ EXPECT_EQ(1, Constructable::getNumCopyConstructorCalls());
EXPECT_EQ(this->theVector.begin() + 1, I);
this->assertValuesInOrder(this->theVector, 6u, 1, 77, 77, 77, 2, 3);
+}
+
+TYPED_TEST(SmallVectorTest, InsertRangeAtEndTest) {
+ SCOPED_TRACE("InsertRangeTest");
+
+ Constructable Arr[3] =
+ { Constructable(77), Constructable(77), Constructable(77) };
+
+ this->makeSequence(this->theVector, 1, 3);
// Insert at end.
- I = this->theVector.insert(this->theVector.end(), Arr, Arr+3);
- EXPECT_EQ(this->theVector.begin() + 6, I);
- this->assertValuesInOrder(this->theVector, 9u,
- 1, 77, 77, 77, 2, 3, 77, 77, 77);
+ Constructable::reset();
+ auto I = this->theVector.insert(this->theVector.end(), Arr, Arr+3);
+ // Copy construct the 3 elements into new space at the top.
+ EXPECT_EQ(3, Constructable::getNumCopyConstructorCalls());
+ // Don't copy/move anything else.
+ EXPECT_EQ(0, Constructable::getNumCopyAssignmentCalls());
+ // Reallocation might occur, causing all elements to be moved into the new
+ // buffer.
+ EXPECT_TRUE(Constructable::getNumMoveConstructorCalls() == 0 ||
+ Constructable::getNumMoveConstructorCalls() == 3);
+ EXPECT_EQ(0, Constructable::getNumMoveAssignmentCalls());
+ EXPECT_EQ(this->theVector.begin() + 3, I);
+ this->assertValuesInOrder(this->theVector, 6u,
+ 1, 2, 3, 77, 77, 77);
+}
+
+TYPED_TEST(SmallVectorTest, InsertEmptyRangeTest) {
+ SCOPED_TRACE("InsertRangeTest");
+
+ this->makeSequence(this->theVector, 1, 3);
// Empty insert.
EXPECT_EQ(this->theVector.end(),
@@ -531,4 +677,26 @@ TEST(SmallVectorCustomTest, NoAssignTest) {
EXPECT_EQ(42, vec.pop_back_val().x);
}
+struct MovedFrom {
+ bool hasValue;
+ MovedFrom() : hasValue(true) {
+ }
+ MovedFrom(MovedFrom&& m) : hasValue(m.hasValue) {
+ m.hasValue = false;
+ }
+ MovedFrom &operator=(MovedFrom&& m) {
+ hasValue = m.hasValue;
+ m.hasValue = false;
+ return *this;
+ }
+};
+
+TEST(SmallVectorTest, MidInsert) {
+ SmallVector<MovedFrom, 3> v;
+ v.push_back(MovedFrom());
+ v.insert(v.begin(), MovedFrom());
+ for (MovedFrom &m : v)
+ EXPECT_TRUE(m.hasValue);
+}
+
}
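The disjunctive expectations in the SmallVector hunks above (`== 2 || == 6`, `== 0 || == 4`, and so on) exist because the typed test runs over several inline capacities, so whether a given operation reallocates depends on the instantiation. A minimal sketch, assuming a move-aware C++11 build of `llvm/ADT/SmallVector.h` (`Counted` and `movesForThreePushes` are illustrative helpers):

```cpp
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

struct Counted {
  static int Moves;
  Counted() = default;
  Counted(const Counted &) = default;
  Counted(Counted &&) { ++Moves; }
  Counted &operator=(const Counted &) = default;
  Counted &operator=(Counted &&) { ++Moves; return *this; }
};
int Counted::Moves = 0;

template <unsigned N> int movesForThreePushes() {
  llvm::SmallVector<Counted, N> V;
  Counted::Moves = 0;
  for (int i = 0; i != 3; ++i)
    V.push_back(Counted()); // each push_back move constructs its argument
  return Counted::Moves;
}

int main() {
  // With inline capacity 2, the third push_back grows the buffer and move
  // constructs the existing elements; with capacity 8 it never grows, so
  // the counts differ even though the logical operations are identical.
  std::printf("N=2: %d moves, N=8: %d moves\n",
              movesForThreePushes<2>(), movesForThreePushes<8>());
}
```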
diff --git a/unittests/ADT/StringMapTest.cpp b/unittests/ADT/StringMapTest.cpp
index de18e07..028375d 100644
--- a/unittests/ADT/StringMapTest.cpp
+++ b/unittests/ADT/StringMapTest.cpp
@@ -10,6 +10,7 @@
#include "gtest/gtest.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/DataTypes.h"
+#include <tuple>
using namespace llvm;
namespace {
@@ -187,7 +188,7 @@ TEST_F(StringMapTest, IterationTest) {
TEST_F(StringMapTest, StringMapEntryTest) {
StringMap<uint32_t>::value_type* entry =
StringMap<uint32_t>::value_type::Create(
- testKeyFirst, testKeyFirst + testKeyLength, 1u);
+ StringRef(testKeyFirst, testKeyLength), 1u);
EXPECT_STREQ(testKey, entry->first().data());
EXPECT_EQ(1u, entry->second);
free(entry);
@@ -198,11 +199,48 @@ TEST_F(StringMapTest, InsertTest) {
SCOPED_TRACE("InsertTest");
testMap.insert(
StringMap<uint32_t>::value_type::Create(
- testKeyFirst, testKeyFirst + testKeyLength,
+ StringRef(testKeyFirst, testKeyLength),
testMap.getAllocator(), 1u));
assertSingleItemMap();
}
+// Test insert(pair<K, V>) method
+TEST_F(StringMapTest, InsertPairTest) {
+ bool Inserted;
+ StringMap<uint32_t>::iterator NewIt;
+ std::tie(NewIt, Inserted) =
+ testMap.insert(std::make_pair(testKeyFirst, testValue));
+ EXPECT_EQ(1u, testMap.size());
+ EXPECT_EQ(testValue, testMap[testKeyFirst]);
+ EXPECT_EQ(testKeyFirst, NewIt->first());
+ EXPECT_EQ(testValue, NewIt->second);
+ EXPECT_TRUE(Inserted);
+
+ StringMap<uint32_t>::iterator ExistingIt;
+ std::tie(ExistingIt, Inserted) =
+ testMap.insert(std::make_pair(testKeyFirst, testValue + 1));
+ EXPECT_EQ(1u, testMap.size());
+ EXPECT_EQ(testValue, testMap[testKeyFirst]);
+ EXPECT_FALSE(Inserted);
+ EXPECT_EQ(NewIt, ExistingIt);
+}
+
+// Test insert(pair<K, V>) method when rehashing occurs
+TEST_F(StringMapTest, InsertRehashingPairTest) {
+ // Check that the correct iterator is returned when the inserted element is
+ // moved to a different bucket during internal rehashing. This depends on
+ // the particular key, and the implementation of StringMap and HashString.
+ // Changes to those might result in this test not actually checking that.
+ StringMap<uint32_t> t(1);
+ EXPECT_EQ(1u, t.getNumBuckets());
+
+ StringMap<uint32_t>::iterator It =
+ t.insert(std::make_pair("abcdef", 42)).first;
+ EXPECT_EQ(2u, t.getNumBuckets());
+ EXPECT_EQ("abcdef", It->first());
+ EXPECT_EQ(42u, It->second);
+}
+
// Create a non-default constructable value
struct StringMapTestStruct {
StringMapTestStruct(int i) : i(i) {}
@@ -228,15 +266,15 @@ struct MoveOnly {
}
private:
- MoveOnly(const MoveOnly &);
- MoveOnly &operator=(const MoveOnly &);
+ MoveOnly(const MoveOnly &) LLVM_DELETED_FUNCTION;
+ MoveOnly &operator=(const MoveOnly &) LLVM_DELETED_FUNCTION;
};
TEST_F(StringMapTest, MoveOnlyKey) {
StringMap<MoveOnly> t;
t.GetOrCreateValue("Test", MoveOnly(42));
StringRef Key = "Test";
- StringMapEntry<MoveOnly>::Create(Key.begin(), Key.end(), MoveOnly(42))
+ StringMapEntry<MoveOnly>::Create(Key, MoveOnly(42))
->Destroy();
}
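The new `insert(pair<K, V>)` coverage above relies on the `(iterator, bool)` return value that the patch also starts unpacking with `std::tie` (hence the new `<tuple>` include). A short usage sketch under those assumptions (`insertPairDemo` is an illustrative name):

```cpp
#include "llvm/ADT/StringMap.h"
#include <cassert>
#include <cstdint>
#include <tuple>

void insertPairDemo() {
  llvm::StringMap<uint32_t> Map;
  llvm::StringMap<uint32_t>::iterator It;
  bool Inserted;
  // First insert succeeds and reports Inserted == true.
  std::tie(It, Inserted) = Map.insert(std::make_pair("key", 1u));
  assert(Inserted && It->second == 1u);
  // Re-inserting the same key leaves the stored value untouched.
  std::tie(It, Inserted) = Map.insert(std::make_pair("key", 2u));
  assert(!Inserted && It->second == 1u);
}
```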
diff --git a/unittests/ADT/ilistTest.cpp b/unittests/ADT/ilistTest.cpp
index 134607c..44442eb 100644
--- a/unittests/ADT/ilistTest.cpp
+++ b/unittests/ADT/ilistTest.cpp
@@ -29,8 +29,8 @@ TEST(ilistTest, Basic) {
ilist<Node> List;
List.push_back(Node(1));
EXPECT_EQ(1, List.back().Value);
- EXPECT_EQ(0, List.back().getPrevNode());
- EXPECT_EQ(0, List.back().getNextNode());
+ EXPECT_EQ(nullptr, List.back().getPrevNode());
+ EXPECT_EQ(nullptr, List.back().getNextNode());
List.push_back(Node(2));
EXPECT_EQ(2, List.back().Value);
diff --git a/unittests/Analysis/CFGTest.cpp b/unittests/Analysis/CFGTest.cpp
index 8d8c560..ac5e710 100644
--- a/unittests/Analysis/CFGTest.cpp
+++ b/unittests/Analysis/CFGTest.cpp
@@ -46,10 +46,10 @@ protected:
}
Function *F = M->getFunction("test");
- if (F == NULL)
+ if (F == nullptr)
report_fatal_error("Test must have a function named @test");
- A = B = NULL;
+ A = B = nullptr;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
if (I->hasName()) {
if (I->getName() == "A")
@@ -58,9 +58,9 @@ protected:
B = &*I;
}
}
- if (A == NULL)
+ if (A == nullptr)
report_fatal_error("@test must have an instruction %A");
- if (B == NULL)
+ if (B == nullptr)
report_fatal_error("@test must have an instruction %B");
}
@@ -74,7 +74,7 @@ protected:
static int initialize() {
PassInfo *PI = new PassInfo("isPotentiallyReachable testing pass",
- "", &ID, 0, true, true);
+ "", &ID, nullptr, true, true);
PassRegistry::getPassRegistry()->registerPass(*PI, false);
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
initializeDominatorTreeWrapperPassPass(
@@ -95,9 +95,10 @@ protected:
LoopInfo *LI = &getAnalysis<LoopInfo>();
DominatorTree *DT =
&getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- EXPECT_EQ(isPotentiallyReachable(A, B, 0, 0), ExpectedResult);
- EXPECT_EQ(isPotentiallyReachable(A, B, DT, 0), ExpectedResult);
- EXPECT_EQ(isPotentiallyReachable(A, B, 0, LI), ExpectedResult);
+ EXPECT_EQ(isPotentiallyReachable(A, B, nullptr, nullptr),
+ ExpectedResult);
+ EXPECT_EQ(isPotentiallyReachable(A, B, DT, nullptr), ExpectedResult);
+ EXPECT_EQ(isPotentiallyReachable(A, B, nullptr, LI), ExpectedResult);
EXPECT_EQ(isPotentiallyReachable(A, B, DT, LI), ExpectedResult);
return false;
}
diff --git a/unittests/Analysis/MixedTBAATest.cpp b/unittests/Analysis/MixedTBAATest.cpp
index 2cf7c73..142e047 100644
--- a/unittests/Analysis/MixedTBAATest.cpp
+++ b/unittests/Analysis/MixedTBAATest.cpp
@@ -43,7 +43,7 @@ TEST_F(MixedTBAATest, MixedTBAA) {
auto *Store1 = new StoreInst(Value, Addr, BB);
auto *Store2 = new StoreInst(Value, Addr, BB);
- ReturnInst::Create(C, 0, BB);
+ ReturnInst::Create(C, nullptr, BB);
// New TBAA metadata
{
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index 398d09e..90f6997 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -41,7 +41,7 @@ TEST_F(ScalarEvolutionsTest, SCEVUnknownRAUW) {
std::vector<Type *>(), false);
Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
- ReturnInst::Create(Context, 0, BB);
+ ReturnInst::Create(Context, nullptr, BB);
Type *Ty = Type::getInt1Ty(Context);
Constant *Init = Constant::getNullValue(Ty);
@@ -94,7 +94,7 @@ TEST_F(ScalarEvolutionsTest, SCEVMultiplyAddRecs) {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(Context), Types, false);
Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
- ReturnInst::Create(Context, 0, BB);
+ ReturnInst::Create(Context, nullptr, BB);
// Create a ScalarEvolution and "run" it so that it gets initialized.
PM.add(&SE);
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bbab2a1..65930b5 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -21,7 +21,6 @@ add_subdirectory(IR)
add_subdirectory(LineEditor)
add_subdirectory(Linker)
add_subdirectory(MC)
-add_subdirectory(Object)
add_subdirectory(Option)
add_subdirectory(Support)
add_subdirectory(Transforms)
diff --git a/unittests/ExecutionEngine/CMakeLists.txt b/unittests/ExecutionEngine/CMakeLists.txt
index 7ef509b..489eaaf 100644
--- a/unittests/ExecutionEngine/CMakeLists.txt
+++ b/unittests/ExecutionEngine/CMakeLists.txt
@@ -9,9 +9,10 @@ add_llvm_unittest(ExecutionEngineTests
ExecutionEngineTest.cpp
)
-# Include JIT/MCJIT tests only if native arch is a JIT target.
-list(FIND LLVM_TARGETS_WITH_JIT "${LLVM_NATIVE_ARCH}" have_jit)
-if (NOT have_jit EQUAL -1 )
+# Include JIT/MCJIT tests only if native arch is a built JIT target.
+list(FIND LLVM_TARGETS_TO_BUILD "${LLVM_NATIVE_ARCH}" build_idx)
+list(FIND LLVM_TARGETS_WITH_JIT "${LLVM_NATIVE_ARCH}" jit_idx)
+if (NOT build_idx LESS 0 AND NOT jit_idx LESS 0)
add_subdirectory(JIT)
add_subdirectory(MCJIT)
endif()
diff --git a/unittests/ExecutionEngine/ExecutionEngineTest.cpp b/unittests/ExecutionEngine/ExecutionEngineTest.cpp
index e6f07dc..f23745c 100644
--- a/unittests/ExecutionEngine/ExecutionEngineTest.cpp
+++ b/unittests/ExecutionEngine/ExecutionEngineTest.cpp
@@ -26,13 +26,13 @@ protected:
}
virtual void SetUp() {
- ASSERT_TRUE(Engine.get() != NULL) << "EngineBuilder returned error: '"
+ ASSERT_TRUE(Engine.get() != nullptr) << "EngineBuilder returned error: '"
<< Error << "'";
}
GlobalVariable *NewExtGlobal(Type *T, const Twine &Name) {
return new GlobalVariable(*M, T, false, // Not constant.
- GlobalValue::ExternalLinkage, NULL, Name);
+ GlobalValue::ExternalLinkage, nullptr, Name);
}
Module *const M;
@@ -49,14 +49,14 @@ TEST_F(ExecutionEngineTest, ForwardGlobalMapping) {
int32_t Mem2 = 4;
Engine->updateGlobalMapping(G1, &Mem2);
EXPECT_EQ(&Mem2, Engine->getPointerToGlobalIfAvailable(G1));
- Engine->updateGlobalMapping(G1, NULL);
- EXPECT_EQ(NULL, Engine->getPointerToGlobalIfAvailable(G1));
+ Engine->updateGlobalMapping(G1, nullptr);
+ EXPECT_EQ(nullptr, Engine->getPointerToGlobalIfAvailable(G1));
Engine->updateGlobalMapping(G1, &Mem2);
EXPECT_EQ(&Mem2, Engine->getPointerToGlobalIfAvailable(G1));
GlobalVariable *G2 =
NewExtGlobal(Type::getInt32Ty(getGlobalContext()), "Global1");
- EXPECT_EQ(NULL, Engine->getPointerToGlobalIfAvailable(G2))
+ EXPECT_EQ(nullptr, Engine->getPointerToGlobalIfAvailable(G2))
<< "The NULL return shouldn't depend on having called"
<< " updateGlobalMapping(..., NULL)";
// Check that update...() can be called before add...().
@@ -75,7 +75,7 @@ TEST_F(ExecutionEngineTest, ReverseGlobalMapping) {
EXPECT_EQ(G1, Engine->getGlobalValueAtAddress(&Mem1));
int32_t Mem2 = 4;
Engine->updateGlobalMapping(G1, &Mem2);
- EXPECT_EQ(NULL, Engine->getGlobalValueAtAddress(&Mem1));
+ EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem1));
EXPECT_EQ(G1, Engine->getGlobalValueAtAddress(&Mem2));
GlobalVariable *G2 =
@@ -83,12 +83,12 @@ TEST_F(ExecutionEngineTest, ReverseGlobalMapping) {
Engine->updateGlobalMapping(G2, &Mem1);
EXPECT_EQ(G2, Engine->getGlobalValueAtAddress(&Mem1));
EXPECT_EQ(G1, Engine->getGlobalValueAtAddress(&Mem2));
- Engine->updateGlobalMapping(G1, NULL);
+ Engine->updateGlobalMapping(G1, nullptr);
EXPECT_EQ(G2, Engine->getGlobalValueAtAddress(&Mem1))
<< "Removing one mapping doesn't affect a different one.";
- EXPECT_EQ(NULL, Engine->getGlobalValueAtAddress(&Mem2));
+ EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem2));
Engine->updateGlobalMapping(G2, &Mem2);
- EXPECT_EQ(NULL, Engine->getGlobalValueAtAddress(&Mem1));
+ EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem1));
EXPECT_EQ(G2, Engine->getGlobalValueAtAddress(&Mem2))
<< "Once a mapping is removed, we can point another GV at the"
<< " now-free address.";
@@ -104,7 +104,7 @@ TEST_F(ExecutionEngineTest, ClearModuleMappings) {
Engine->clearGlobalMappingsFromModule(M);
- EXPECT_EQ(NULL, Engine->getGlobalValueAtAddress(&Mem1));
+ EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem1));
GlobalVariable *G2 =
NewExtGlobal(Type::getInt32Ty(getGlobalContext()), "Global2");
@@ -124,7 +124,7 @@ TEST_F(ExecutionEngineTest, DestructionRemovesGlobalMapping) {
// When the GV goes away, the ExecutionEngine should remove any
// mappings that refer to it.
G1->eraseFromParent();
- EXPECT_EQ(NULL, Engine->getGlobalValueAtAddress(&Mem1));
+ EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem1));
}
}
diff --git a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
index ab30884..296838d 100644
--- a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
@@ -267,12 +267,12 @@ TEST(JITMemoryManagerTest, TestManyStubs) {
// After allocating a bunch of stubs, we should have two.
for (int I = 0; I < Iters; ++I)
- MemMgr->allocateStub(NULL, Size, 8);
+ MemMgr->allocateStub(nullptr, Size, 8);
EXPECT_EQ(2U, MemMgr->GetNumStubSlabs());
// And after much more, we should have three.
for (int I = 0; I < Iters; ++I)
- MemMgr->allocateStub(NULL, Size, 8);
+ MemMgr->allocateStub(nullptr, Size, 8);
EXPECT_EQ(3U, MemMgr->GetNumStubSlabs());
}
@@ -286,10 +286,10 @@ TEST(JITMemoryManagerTest, AllocateSection) {
uint8_t *data2 = MemMgr->allocateDataSection(256, 64, 4, StringRef(), false);
uint8_t *code3 = MemMgr->allocateCodeSection(258, 64, 5, StringRef());
- EXPECT_NE((uint8_t*)0, code1);
- EXPECT_NE((uint8_t*)0, code2);
- EXPECT_NE((uint8_t*)0, data1);
- EXPECT_NE((uint8_t*)0, data2);
+ EXPECT_NE((uint8_t*)nullptr, code1);
+ EXPECT_NE((uint8_t*)nullptr, code2);
+ EXPECT_NE((uint8_t*)nullptr, data1);
+ EXPECT_NE((uint8_t*)nullptr, data2);
// Check alignment
EXPECT_EQ((uint64_t)code1 & 0xf, 0u);
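One detail worth noting in the hunks above: the rewrite keeps the pointer casts, producing `(uint8_t*)nullptr` rather than bare `nullptr`. A plausible reason is that a typed null keeps both arguments of `EXPECT_NE`/`EXPECT_EQ` the same pointer type, so nothing depends on how a given gtest version handles `std::nullptr_t`. A small illustrative sketch (test name is hypothetical):

```cpp
#include "gtest/gtest.h"
#include <cstdint>

TEST(TypedNullSketch, CastKeepsTypesAligned) {
  uint8_t Storage[16];
  uint8_t *P = Storage;
  // Same-type comparison: both operands are uint8_t*.
  EXPECT_NE((uint8_t *)nullptr, P);
  P = nullptr;
  EXPECT_EQ((uint8_t *)nullptr, P);
}
```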
diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp
index f438286..817d207 100644
--- a/unittests/ExecutionEngine/JIT/JITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITTest.cpp
@@ -169,7 +169,7 @@ public:
bool LoadAssemblyInto(Module *M, const char *assembly) {
SMDiagnostic Error;
bool success =
- NULL != ParseAssemblyString(assembly, M, Error, M->getContext());
+ nullptr != ParseAssemblyString(assembly, M, Error, M->getContext());
std::string errMsg;
raw_string_ostream os(errMsg);
Error.print("", os);
@@ -193,7 +193,7 @@ class JITTest : public testing::Test {
.setJITMemoryManager(RJMM)
.setErrorStr(&Error)
.setTargetOptions(Options).create());
- ASSERT_TRUE(TheJIT.get() != NULL) << Error;
+ ASSERT_TRUE(TheJIT.get() != nullptr) << Error;
}
void LoadAssembly(const char *assembly) {
@@ -249,7 +249,7 @@ TEST(JIT, GlobalInFunction) {
// Since F1 was codegen'd, a pointer to G should be available.
int32_t *GPtr = (int32_t*)JIT->getPointerToGlobalIfAvailable(G);
- ASSERT_NE((int32_t*)NULL, GPtr);
+ ASSERT_NE((int32_t*)nullptr, GPtr);
EXPECT_EQ(0, *GPtr);
// F1() should increment G.
@@ -633,10 +633,10 @@ ExecutionEngine *getJITFromBitcode(
MemoryBuffer *BitcodeBuffer =
MemoryBuffer::getMemBuffer(Bitcode, "Bitcode for test");
ErrorOr<Module*> ModuleOrErr = getLazyBitcodeModule(BitcodeBuffer, Context);
- if (error_code EC = ModuleOrErr.getError()) {
+ if (std::error_code EC = ModuleOrErr.getError()) {
ADD_FAILURE() << EC.message();
delete BitcodeBuffer;
- return NULL;
+ return nullptr;
}
M = ModuleOrErr.get();
std::string errMsg;
@@ -644,11 +644,11 @@ ExecutionEngine *getJITFromBitcode(
.setEngineKind(EngineKind::JIT)
.setErrorStr(&errMsg)
.create();
- if (TheJIT == NULL) {
+ if (TheJIT == nullptr) {
ADD_FAILURE() << errMsg;
delete M;
- M = NULL;
- return NULL;
+ M = nullptr;
+ return nullptr;
}
return TheJIT;
}
diff --git a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp b/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
index 5016532..f530e0d 100644
--- a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
@@ -27,7 +27,7 @@ namespace {
bool LoadAssemblyInto(Module *M, const char *assembly) {
SMDiagnostic Error;
bool success =
- NULL != ParseAssemblyString(assembly, M, Error, M->getContext());
+ nullptr != ParseAssemblyString(assembly, M, Error, M->getContext());
std::string errMsg;
raw_string_ostream os(errMsg);
Error.print("", os);
@@ -71,13 +71,13 @@ void createModule2(LLVMContext &Context2, Module *&M2, Function *&FooF2) {
TEST(MultiJitTest, EagerMode) {
LLVMContext Context1;
- Module *M1 = 0;
- Function *FooF1 = 0;
+ Module *M1 = nullptr;
+ Function *FooF1 = nullptr;
createModule1(Context1, M1, FooF1);
LLVMContext Context2;
- Module *M2 = 0;
- Function *FooF2 = 0;
+ Module *M2 = nullptr;
+ Function *FooF2 = nullptr;
createModule2(Context2, M2, FooF2);
// Now we create the JIT in eager mode
@@ -101,13 +101,13 @@ TEST(MultiJitTest, EagerMode) {
TEST(MultiJitTest, LazyMode) {
LLVMContext Context1;
- Module *M1 = 0;
- Function *FooF1 = 0;
+ Module *M1 = nullptr;
+ Function *FooF1 = nullptr;
createModule1(Context1, M1, FooF1);
LLVMContext Context2;
- Module *M2 = 0;
- Function *FooF2 = 0;
+ Module *M2 = nullptr;
+ Function *FooF2 = nullptr;
createModule2(Context2, M2, FooF2);
// Now we create the JIT in lazy mode
@@ -135,13 +135,13 @@ extern "C" {
TEST(MultiJitTest, JitPool) {
LLVMContext Context1;
- Module *M1 = 0;
- Function *FooF1 = 0;
+ Module *M1 = nullptr;
+ Function *FooF1 = nullptr;
createModule1(Context1, M1, FooF1);
LLVMContext Context2;
- Module *M2 = 0;
- Function *FooF2 = 0;
+ Module *M2 = nullptr;
+ Function *FooF2 = nullptr;
createModule2(Context2, M2, FooF2);
// Now we create two JITs
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
index 20d3f13..d03de89 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
@@ -148,10 +148,10 @@ protected:
didCallAllocateCodeSection = false;
didAllocateCompactUnwindSection = false;
didCallYield = false;
- Module = 0;
- Function = 0;
- Engine = 0;
- Error = 0;
+ Module = nullptr;
+ Function = nullptr;
+ Engine = nullptr;
+ Error = nullptr;
}
virtual void TearDown() {
@@ -166,8 +166,8 @@ protected:
LLVMSetTarget(Module, HostTriple.c_str());
- Function = LLVMAddFunction(
- Module, "simple_function", LLVMFunctionType(LLVMInt32Type(), 0, 0, 0));
+ Function = LLVMAddFunction(Module, "simple_function",
+ LLVMFunctionType(LLVMInt32Type(), nullptr, 0, 0));
LLVMSetFunctionCallConv(Function, LLVMCCallConv);
LLVMBasicBlockRef entry = LLVMAppendBasicBlock(Function, "entry");
@@ -192,8 +192,8 @@ protected:
LLVMFunctionType(LLVMVoidType(), stackmapParamTypes, 2, 1));
LLVMSetLinkage(stackmap, LLVMExternalLinkage);
- Function = LLVMAddFunction(
- Module, "simple_function", LLVMFunctionType(LLVMInt32Type(), 0, 0, 0));
+ Function = LLVMAddFunction(Module, "simple_function",
+ LLVMFunctionType(LLVMInt32Type(), nullptr, 0, 0));
LLVMBasicBlockRef entry = LLVMAppendBasicBlock(Function, "entry");
LLVMBuilderRef builder = LLVMCreateBuilder();
@@ -221,8 +221,8 @@ protected:
LLVMSetInitializer(GlobalVar, LLVMConstInt(LLVMInt32Type(), 42, 0));
{
- Function = LLVMAddFunction(
- Module, "getGlobal", LLVMFunctionType(LLVMInt32Type(), 0, 0, 0));
+ Function = LLVMAddFunction(Module, "getGlobal",
+ LLVMFunctionType(LLVMInt32Type(), nullptr, 0, 0));
LLVMSetFunctionCallConv(Function, LLVMCCallConv);
LLVMBasicBlockRef Entry = LLVMAppendBasicBlock(Function, "entry");
@@ -443,7 +443,7 @@ TEST_F(MCJITCAPITest, yield) {
buildMCJITOptions();
buildMCJITEngine();
LLVMContextRef C = LLVMGetGlobalContext();
- LLVMContextSetYieldCallback(C, yield, NULL);
+ LLVMContextSetYieldCallback(C, yield, nullptr);
buildAndRunPasses();
union {
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
index f862999..98587f7 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
@@ -23,10 +23,10 @@ TEST(MCJITMemoryManagerTest, BasicAllocations) {
uint8_t *code2 = MemMgr->allocateCodeSection(256, 0, 3, "");
uint8_t *data2 = MemMgr->allocateDataSection(256, 0, 4, "", false);
- EXPECT_NE((uint8_t*)0, code1);
- EXPECT_NE((uint8_t*)0, code2);
- EXPECT_NE((uint8_t*)0, data1);
- EXPECT_NE((uint8_t*)0, data2);
+ EXPECT_NE((uint8_t*)nullptr, code1);
+ EXPECT_NE((uint8_t*)nullptr, code2);
+ EXPECT_NE((uint8_t*)nullptr, data1);
+ EXPECT_NE((uint8_t*)nullptr, data2);
// Initialize the data
for (unsigned i = 0; i < 256; ++i) {
@@ -56,10 +56,10 @@ TEST(MCJITMemoryManagerTest, LargeAllocations) {
uint8_t *code2 = MemMgr->allocateCodeSection(0x100000, 0, 3, "");
uint8_t *data2 = MemMgr->allocateDataSection(0x100000, 0, 4, "", false);
- EXPECT_NE((uint8_t*)0, code1);
- EXPECT_NE((uint8_t*)0, code2);
- EXPECT_NE((uint8_t*)0, data1);
- EXPECT_NE((uint8_t*)0, data2);
+ EXPECT_NE((uint8_t*)nullptr, code1);
+ EXPECT_NE((uint8_t*)nullptr, code2);
+ EXPECT_NE((uint8_t*)nullptr, data1);
+ EXPECT_NE((uint8_t*)nullptr, data2);
// Initialize the data
for (unsigned i = 0; i < 0x100000; ++i) {
@@ -98,8 +98,8 @@ TEST(MCJITMemoryManagerTest, ManyAllocations) {
data[i][j] = 2 + (i % 254);
}
- EXPECT_NE((uint8_t *)0, code[i]);
- EXPECT_NE((uint8_t *)0, data[i]);
+ EXPECT_NE((uint8_t *)nullptr, code[i]);
+ EXPECT_NE((uint8_t *)nullptr, data[i]);
}
// Verify the data (this is checking for overlaps in the addresses)
@@ -141,8 +141,8 @@ TEST(MCJITMemoryManagerTest, ManyVariedAllocations) {
data[i][j] = 2 + (i % 254);
}
- EXPECT_NE((uint8_t *)0, code[i]);
- EXPECT_NE((uint8_t *)0, data[i]);
+ EXPECT_NE((uint8_t *)nullptr, code[i]);
+ EXPECT_NE((uint8_t *)nullptr, data[i]);
uintptr_t CodeAlign = Align ? (uintptr_t)code[i] % Align : 0;
uintptr_t DataAlign = Align ? (uintptr_t)data[i] % Align : 0;
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
index 46847d3..fbbab42 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
@@ -48,7 +48,7 @@ public:
const MemoryBuffer* BufferFound = getObjectInternal(M);
ModulesLookedUp.insert(M->getModuleIdentifier());
if (!BufferFound)
- return NULL;
+ return nullptr;
// Our test cache wants to maintain ownership of its object buffers
// so we make a copy here for the execution engine.
return MemoryBuffer::getMemBufferCopy(BufferFound->getBuffer());
@@ -67,7 +67,7 @@ public:
const std::string ModuleID = M->getModuleIdentifier();
StringMap<const MemoryBuffer *>::iterator it = ObjMap.find(ModuleID);
if (it == ObjMap.end())
- return 0;
+ return nullptr;
return it->second;
}
@@ -101,13 +101,13 @@ protected:
void compileAndRun(int ExpectedRC = OriginalRC) {
// This function shouldn't be called until after SetUp.
ASSERT_TRUE(bool(TheJIT));
- ASSERT_TRUE(0 != Main);
+ ASSERT_TRUE(nullptr != Main);
// We may be using a null cache, so ensure compilation is valid.
TheJIT->finalizeObject();
void *vPtr = TheJIT->getPointerToFunction(Main);
- EXPECT_TRUE(0 != vPtr)
+ EXPECT_TRUE(nullptr != vPtr)
<< "Unable to get pointer to main() from JIT";
int (*FuncPtr)(void) = (int(*)(void))(intptr_t)vPtr;
@@ -123,7 +123,7 @@ TEST_F(MCJITObjectCacheTest, SetNullObjectCache) {
createJIT(M.release());
- TheJIT->setObjectCache(NULL);
+ TheJIT->setObjectCache(nullptr);
compileAndRun();
}
@@ -143,7 +143,7 @@ TEST_F(MCJITObjectCacheTest, VerifyBasicObjectCaching) {
// Verify that our object cache does not contain the module yet.
const MemoryBuffer *ObjBuffer = Cache->getObjectInternal(SavedModulePointer);
- EXPECT_EQ(0, ObjBuffer);
+ EXPECT_EQ(nullptr, ObjBuffer);
compileAndRun();
@@ -152,7 +152,7 @@ TEST_F(MCJITObjectCacheTest, VerifyBasicObjectCaching) {
// Verify that our object cache now contains the module.
ObjBuffer = Cache->getObjectInternal(SavedModulePointer);
- EXPECT_TRUE(0 != ObjBuffer);
+ EXPECT_TRUE(nullptr != ObjBuffer);
// Verify that the cache was only notified once.
EXPECT_FALSE(Cache->wereDuplicatesInserted());
@@ -221,7 +221,7 @@ TEST_F(MCJITObjectCacheTest, VerifyNonLoadFromCache) {
// Verify that our object cache does not contain the module yet.
const MemoryBuffer *ObjBuffer = Cache->getObjectInternal(SecondModulePointer);
- EXPECT_EQ(0, ObjBuffer);
+ EXPECT_EQ(nullptr, ObjBuffer);
// Run the function and look for the replacement return code.
compileAndRun(ReplacementRC);
@@ -231,7 +231,7 @@ TEST_F(MCJITObjectCacheTest, VerifyNonLoadFromCache) {
// Verify that our object cache now contains the module.
ObjBuffer = Cache->getObjectInternal(SecondModulePointer);
- EXPECT_TRUE(0 != ObjBuffer);
+ EXPECT_TRUE(nullptr != ObjBuffer);
// Verify that MCJIT didn't try to cache this again.
EXPECT_FALSE(Cache->wereDuplicatesInserted());
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
index a439508..c37c1d1 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
@@ -51,7 +51,7 @@ TEST_F(MCJITTest, global_variable) {
GlobalValue *Global = insertGlobalInt32(M.get(), "test_global", initialValue);
createJIT(M.release());
void *globalPtr = TheJIT->getPointerToGlobal(Global);
- EXPECT_TRUE(0 != globalPtr)
+ EXPECT_TRUE(nullptr != globalPtr)
<< "Unable to get pointer to global value from JIT";
EXPECT_EQ(initialValue, *(int32_t*)globalPtr)
diff --git a/unittests/IR/ConstantRangeTest.cpp b/unittests/IR/ConstantRangeTest.cpp
index cdf7378..fa03302 100644
--- a/unittests/IR/ConstantRangeTest.cpp
+++ b/unittests/IR/ConstantRangeTest.cpp
@@ -99,11 +99,11 @@ TEST_F(ConstantRangeTest, Equality) {
}
TEST_F(ConstantRangeTest, SingleElement) {
- EXPECT_EQ(Full.getSingleElement(), static_cast<APInt *>(NULL));
- EXPECT_EQ(Empty.getSingleElement(), static_cast<APInt *>(NULL));
+ EXPECT_EQ(Full.getSingleElement(), static_cast<APInt *>(nullptr));
+ EXPECT_EQ(Empty.getSingleElement(), static_cast<APInt *>(nullptr));
EXPECT_EQ(*One.getSingleElement(), APInt(16, 0xa));
- EXPECT_EQ(Some.getSingleElement(), static_cast<APInt *>(NULL));
- EXPECT_EQ(Wrap.getSingleElement(), static_cast<APInt *>(NULL));
+ EXPECT_EQ(Some.getSingleElement(), static_cast<APInt *>(nullptr));
+ EXPECT_EQ(Wrap.getSingleElement(), static_cast<APInt *>(nullptr));
EXPECT_FALSE(Full.isSingleElement());
EXPECT_FALSE(Empty.isSingleElement());
diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp
index c11729c..0cd8549 100644
--- a/unittests/IR/ConstantsTest.cpp
+++ b/unittests/IR/ConstantsTest.cpp
@@ -269,16 +269,6 @@ TEST(ConstantsTest, ReplaceWithConstantTest) {
"this->replaceAllUsesWith\\(expr\\(this\\)\\) is NOT valid!");
}
-TEST(ConstantsTest, ReplaceInAliasTest) {
- std::unique_ptr<Module> M(new Module("MyModule", getGlobalContext()));
-
- Type *Int32Ty = Type::getInt32Ty(getGlobalContext());
- auto *Global = cast<GlobalObject>(M->getOrInsertGlobal("dummy", Int32Ty));
- auto *GA = GlobalAlias::create(GlobalValue::ExternalLinkage, "alias", Global);
- EXPECT_DEATH(Global->replaceAllUsesWith(GA),
- "replaceAliasUseWith cannot form an alias cycle");
-}
-
#endif
#endif
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index 98c2317..ab43d1c 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -213,7 +213,7 @@ namespace llvm {
"}\n";
LLVMContext &C = getGlobalContext();
SMDiagnostic Err;
- return ParseAssemblyString(ModuleStrig, NULL, Err, C);
+ return ParseAssemblyString(ModuleStrig, nullptr, Err, C);
}
TEST(DominatorTree, Unreachable) {
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 9796e44..2108575 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -31,11 +31,11 @@ protected:
F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
BB = BasicBlock::Create(Ctx, "", F);
GV = new GlobalVariable(*M, Type::getFloatTy(Ctx), true,
- GlobalValue::ExternalLinkage, 0);
+ GlobalValue::ExternalLinkage, nullptr);
}
virtual void TearDown() {
- BB = 0;
+ BB = nullptr;
M.reset();
}
@@ -71,9 +71,9 @@ TEST_F(IRBuilderTest, Lifetime) {
IntrinsicInst *II_Start1 = dyn_cast<IntrinsicInst>(Start1);
IntrinsicInst *II_End1 = dyn_cast<IntrinsicInst>(End1);
- ASSERT_TRUE(II_Start1 != NULL);
+ ASSERT_TRUE(II_Start1 != nullptr);
EXPECT_EQ(II_Start1->getIntrinsicID(), Intrinsic::lifetime_start);
- ASSERT_TRUE(II_End1 != NULL);
+ ASSERT_TRUE(II_End1 != nullptr);
EXPECT_EQ(II_End1->getIntrinsicID(), Intrinsic::lifetime_end);
}
@@ -203,7 +203,7 @@ TEST_F(IRBuilderTest, WrapFlags) {
// Test instructions.
GlobalVariable *G = new GlobalVariable(*M, Builder.getInt32Ty(), true,
- GlobalValue::ExternalLinkage, 0);
+ GlobalValue::ExternalLinkage, nullptr);
Value *V = Builder.CreateLoad(G);
EXPECT_TRUE(
cast<BinaryOperator>(Builder.CreateNSWAdd(V, V))->hasNoSignedWrap());
diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp
index 336f5a2..7ec9b62 100644
--- a/unittests/IR/InstructionsTest.cpp
+++ b/unittests/IR/InstructionsTest.cpp
@@ -415,7 +415,7 @@ TEST(InstructionsTest, isEliminableCastPair) {
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
CastInst::IntToPtr,
Int64PtrTy, Int64Ty, Int64PtrTy,
- Int32Ty, 0, Int32Ty),
+ Int32Ty, nullptr, Int32Ty),
CastInst::BitCast);
// Source and destination have unknown sizes, but the same address space and
@@ -423,7 +423,7 @@ TEST(InstructionsTest, isEliminableCastPair) {
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
CastInst::IntToPtr,
Int64PtrTy, Int64Ty, Int64PtrTy,
- 0, 0, 0),
+ nullptr, nullptr, nullptr),
CastInst::BitCast);
// Source and destination have unknown sizes, but the same address space and
@@ -431,21 +431,21 @@ TEST(InstructionsTest, isEliminableCastPair) {
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
CastInst::IntToPtr,
Int64PtrTy, Int32Ty, Int64PtrTy,
- 0, 0, 0),
+ nullptr, nullptr, nullptr),
0U);
// Middle pointer big enough -> bitcast.
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
CastInst::PtrToInt,
Int64Ty, Int64PtrTy, Int64Ty,
- 0, Int64Ty, 0),
+ nullptr, Int64Ty, nullptr),
CastInst::BitCast);
// Middle pointer too small -> fail.
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
CastInst::PtrToInt,
Int64Ty, Int64PtrTy, Int64Ty,
- 0, Int32Ty, 0),
+ nullptr, Int32Ty, nullptr),
0U);
// Test that we don't eliminate bitcasts between different address spaces,
@@ -464,21 +464,21 @@ TEST(InstructionsTest, isEliminableCastPair) {
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
CastInst::AddrSpaceCast,
Int16Ty, Int64PtrTyAS1, Int64PtrTyAS2,
- 0, Int16SizePtr, Int64SizePtr),
+ nullptr, Int16SizePtr, Int64SizePtr),
0U);
// Cannot simplify addrspacecast, ptrtoint
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::AddrSpaceCast,
CastInst::PtrToInt,
Int64PtrTyAS1, Int64PtrTyAS2, Int16Ty,
- Int64SizePtr, Int16SizePtr, 0),
+ Int64SizePtr, Int16SizePtr, nullptr),
0U);
// Pass since the bitcast address spaces are the same
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
CastInst::BitCast,
Int16Ty, Int64PtrTyAS1, Int64PtrTyAS1,
- 0, 0, 0),
+ nullptr, nullptr, nullptr),
CastInst::IntToPtr);
}
diff --git a/unittests/IR/LegacyPassManagerTest.cpp b/unittests/IR/LegacyPassManagerTest.cpp
index df6f460..9c2a835 100644
--- a/unittests/IR/LegacyPassManagerTest.cpp
+++ b/unittests/IR/LegacyPassManagerTest.cpp
@@ -476,7 +476,7 @@ namespace llvm {
// Function: test1 (func_test1)
{
- BasicBlock* label_entry = BasicBlock::Create(getGlobalContext(), "entry",func_test1,0);
+ BasicBlock* label_entry = BasicBlock::Create(getGlobalContext(), "entry",func_test1,nullptr);
// Block entry (label_entry)
CallInst* int32_3 = CallInst::Create(func_test2, "", label_entry);
@@ -491,7 +491,7 @@ namespace llvm {
// Function: test2 (func_test2)
{
- BasicBlock* label_entry_5 = BasicBlock::Create(getGlobalContext(), "entry",func_test2,0);
+ BasicBlock* label_entry_5 = BasicBlock::Create(getGlobalContext(), "entry",func_test2,nullptr);
// Block entry (label_entry_5)
CallInst* int32_6 = CallInst::Create(func_test3, "", label_entry_5);
@@ -506,7 +506,7 @@ namespace llvm {
// Function: test3 (func_test3)
{
- BasicBlock* label_entry_8 = BasicBlock::Create(getGlobalContext(), "entry",func_test3,0);
+ BasicBlock* label_entry_8 = BasicBlock::Create(getGlobalContext(), "entry",func_test3,nullptr);
// Block entry (label_entry_8)
CallInst* int32_9 = CallInst::Create(func_test1, "", label_entry_8);
@@ -524,10 +524,10 @@ namespace llvm {
Value* int1_f = args++;
int1_f->setName("f");
- BasicBlock* label_entry_11 = BasicBlock::Create(getGlobalContext(), "entry",func_test4,0);
- BasicBlock* label_bb = BasicBlock::Create(getGlobalContext(), "bb",func_test4,0);
- BasicBlock* label_bb1 = BasicBlock::Create(getGlobalContext(), "bb1",func_test4,0);
- BasicBlock* label_return = BasicBlock::Create(getGlobalContext(), "return",func_test4,0);
+ BasicBlock* label_entry_11 = BasicBlock::Create(getGlobalContext(), "entry",func_test4,nullptr);
+ BasicBlock* label_bb = BasicBlock::Create(getGlobalContext(), "bb",func_test4,nullptr);
+ BasicBlock* label_bb1 = BasicBlock::Create(getGlobalContext(), "bb1",func_test4,nullptr);
+ BasicBlock* label_return = BasicBlock::Create(getGlobalContext(), "return",func_test4,nullptr);
// Block entry (label_entry_11)
BranchInst::Create(label_bb, label_entry_11);
diff --git a/unittests/IR/MDBuilderTest.cpp b/unittests/IR/MDBuilderTest.cpp
index c8b5a09..fc4674e 100644
--- a/unittests/IR/MDBuilderTest.cpp
+++ b/unittests/IR/MDBuilderTest.cpp
@@ -33,8 +33,8 @@ TEST_F(MDBuilderTest, createFPMath) {
MDBuilder MDHelper(Context);
MDNode *MD0 = MDHelper.createFPMath(0.0);
MDNode *MD1 = MDHelper.createFPMath(1.0);
- EXPECT_EQ(MD0, (MDNode *)0);
- EXPECT_NE(MD1, (MDNode *)0);
+ EXPECT_EQ(MD0, (MDNode *)nullptr);
+ EXPECT_NE(MD1, (MDNode *)nullptr);
EXPECT_EQ(MD1->getNumOperands(), 1U);
Value *Op = MD1->getOperand(0);
EXPECT_TRUE(isa<ConstantFP>(Op));
@@ -47,8 +47,8 @@ TEST_F(MDBuilderTest, createRangeMetadata) {
APInt A(8, 1), B(8, 2);
MDNode *R0 = MDHelper.createRange(A, A);
MDNode *R1 = MDHelper.createRange(A, B);
- EXPECT_EQ(R0, (MDNode *)0);
- EXPECT_NE(R1, (MDNode *)0);
+ EXPECT_EQ(R0, (MDNode *)nullptr);
+ EXPECT_NE(R1, (MDNode *)nullptr);
EXPECT_EQ(R1->getNumOperands(), 2U);
EXPECT_TRUE(isa<ConstantInt>(R1->getOperand(0)));
EXPECT_TRUE(isa<ConstantInt>(R1->getOperand(1)));
@@ -66,8 +66,8 @@ TEST_F(MDBuilderTest, createAnonymousTBAARoot) {
EXPECT_GE(R1->getNumOperands(), 1U);
EXPECT_EQ(R0->getOperand(0), R0);
EXPECT_EQ(R1->getOperand(0), R1);
- EXPECT_TRUE(R0->getNumOperands() == 1 || R0->getOperand(1) == 0);
- EXPECT_TRUE(R1->getNumOperands() == 1 || R1->getOperand(1) == 0);
+ EXPECT_TRUE(R0->getNumOperands() == 1 || R0->getOperand(1) == nullptr);
+ EXPECT_TRUE(R1->getNumOperands() == 1 || R1->getOperand(1) == nullptr);
}
TEST_F(MDBuilderTest, createTBAARoot) {
MDBuilder MDHelper(Context);
@@ -77,7 +77,7 @@ TEST_F(MDBuilderTest, createTBAARoot) {
EXPECT_GE(R0->getNumOperands(), 1U);
EXPECT_TRUE(isa<MDString>(R0->getOperand(0)));
EXPECT_EQ(cast<MDString>(R0->getOperand(0))->getString(), "Root");
- EXPECT_TRUE(R0->getNumOperands() == 1 || R0->getOperand(1) == 0);
+ EXPECT_TRUE(R0->getNumOperands() == 1 || R0->getOperand(1) == nullptr);
}
TEST_F(MDBuilderTest, createTBAANode) {
MDBuilder MDHelper(Context);
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 00a2783..4f7bd72 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -103,7 +103,7 @@ TEST_F(MDNodeTest, Simple) {
#endif
EXPECT_EQ(n4, n1);
EXPECT_EQ(n5, n2);
- EXPECT_EQ(n6, (Value*)0);
+ EXPECT_EQ(n6, (Value*)nullptr);
EXPECT_EQ(3u, n1->getNumOperands());
EXPECT_EQ(s1, n1->getOperand(0));
diff --git a/unittests/IR/PassManagerTest.cpp b/unittests/IR/PassManagerTest.cpp
index 310e48f..25037a7 100644
--- a/unittests/IR/PassManagerTest.cpp
+++ b/unittests/IR/PassManagerTest.cpp
@@ -168,7 +168,7 @@ struct TestInvalidationFunctionPass {
Module *parseIR(const char *IR) {
LLVMContext &C = getGlobalContext();
SMDiagnostic Err;
- return ParseAssemblyString(IR, 0, Err, C);
+ return ParseAssemblyString(IR, nullptr, Err, C);
}
class PassManagerTest : public ::testing::Test {
diff --git a/unittests/IR/PatternMatch.cpp b/unittests/IR/PatternMatch.cpp
index bebee15..f3a27b8 100644
--- a/unittests/IR/PatternMatch.cpp
+++ b/unittests/IR/PatternMatch.cpp
@@ -230,17 +230,17 @@ TEST_F(PatternMatchTest, OverflowingBinOps) {
m_NSWAdd(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNSWAdd(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(
m_NSWSub(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNSWSub(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(
m_NSWMul(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNSWMul(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(m_NSWShl(m_Value(MatchL), m_Value(MatchR)).match(
IRB.CreateShl(L, R, "", /* NUW */ false, /* NSW */ true)));
EXPECT_EQ(L, MatchL);
@@ -250,17 +250,17 @@ TEST_F(PatternMatchTest, OverflowingBinOps) {
m_NUWAdd(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNUWAdd(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(
m_NUWSub(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNUWSub(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(
m_NUWMul(m_Value(MatchL), m_Value(MatchR)).match(IRB.CreateNUWMul(L, R)));
EXPECT_EQ(L, MatchL);
EXPECT_EQ(R, MatchR);
- MatchL = MatchR = 0;
+ MatchL = MatchR = nullptr;
EXPECT_TRUE(m_NUWShl(m_Value(MatchL), m_Value(MatchR)).match(
IRB.CreateShl(L, R, "", /* NUW */ true, /* NSW */ false)));
EXPECT_EQ(L, MatchL);
diff --git a/unittests/IR/TypeBuilderTest.cpp b/unittests/IR/TypeBuilderTest.cpp
index be493cd..b7b3e45 100644
--- a/unittests/IR/TypeBuilderTest.cpp
+++ b/unittests/IR/TypeBuilderTest.cpp
@@ -234,19 +234,19 @@ TEST(TypeBuilderTest, Extensions) {
TypeBuilder<int, false>::get(getGlobalContext()),
TypeBuilder<int*, false>::get(getGlobalContext()),
TypeBuilder<void*[], false>::get(getGlobalContext()),
- (void*)0)),
+ (void*)nullptr)),
(TypeBuilder<MyType*, false>::get(getGlobalContext())));
EXPECT_EQ(PointerType::getUnqual(StructType::get(
TypeBuilder<types::i<32>, false>::get(getGlobalContext()),
TypeBuilder<types::i<32>*, false>::get(getGlobalContext()),
TypeBuilder<types::i<8>*[], false>::get(getGlobalContext()),
- (void*)0)),
+ (void*)nullptr)),
(TypeBuilder<MyPortableType*, false>::get(getGlobalContext())));
EXPECT_EQ(PointerType::getUnqual(StructType::get(
TypeBuilder<types::i<32>, false>::get(getGlobalContext()),
TypeBuilder<types::i<32>*, false>::get(getGlobalContext()),
TypeBuilder<types::i<8>*[], false>::get(getGlobalContext()),
- (void*)0)),
+ (void*)nullptr)),
(TypeBuilder<MyPortableType*, true>::get(getGlobalContext())));
}
diff --git a/unittests/IR/UserTest.cpp b/unittests/IR/UserTest.cpp
index 9c0e7b2..eb07e82 100644
--- a/unittests/IR/UserTest.cpp
+++ b/unittests/IR/UserTest.cpp
@@ -65,7 +65,7 @@ TEST(UserTest, ValueOpIteration) {
" ret void\n"
"}\n";
SMDiagnostic Err;
- Module *M = ParseAssemblyString(ModuleString, NULL, Err, C);
+ Module *M = ParseAssemblyString(ModuleString, nullptr, Err, C);
Function *F = M->getFunction("f");
BasicBlock &ExitBB = F->back();
diff --git a/unittests/IR/ValueHandleTest.cpp b/unittests/IR/ValueHandleTest.cpp
index 15a0c22..403d2bc 100644
--- a/unittests/IR/ValueHandleTest.cpp
+++ b/unittests/IR/ValueHandleTest.cpp
@@ -94,7 +94,7 @@ TEST_F(ValueHandle, WeakVH_NullOnDeletion) {
WeakVH WVH_Copy(WVH);
WeakVH WVH_Recreated(BitcastV.get());
BitcastV.reset();
- Value *null_value = NULL;
+ Value *null_value = nullptr;
EXPECT_EQ(null_value, WVH);
EXPECT_EQ(null_value, WVH_Copy);
EXPECT_EQ(null_value, WVH_Recreated);
@@ -178,10 +178,10 @@ TEST_F(ValueHandle, AssertingVH_Asserts) {
EXPECT_DEATH({BitcastV.reset();},
"An asserting value handle still pointed to this value!");
AssertingVH<Value> Copy(AVH);
- AVH = NULL;
+ AVH = nullptr;
EXPECT_DEATH({BitcastV.reset();},
"An asserting value handle still pointed to this value!");
- Copy = NULL;
+ Copy = nullptr;
BitcastV.reset();
}
@@ -263,14 +263,14 @@ TEST_F(ValueHandle, CallbackVH_CallbackOnRAUW) {
int DeletedCalls;
Value *AURWArgument;
- RecordingVH() : DeletedCalls(0), AURWArgument(NULL) {}
+ RecordingVH() : DeletedCalls(0), AURWArgument(nullptr) {}
RecordingVH(Value *V)
- : CallbackVH(V), DeletedCalls(0), AURWArgument(NULL) {}
+ : CallbackVH(V), DeletedCalls(0), AURWArgument(nullptr) {}
private:
virtual void deleted() { DeletedCalls++; CallbackVH::deleted(); }
virtual void allUsesReplacedWith(Value *new_value) {
- EXPECT_EQ(NULL, AURWArgument);
+ EXPECT_EQ(nullptr, AURWArgument);
AURWArgument = new_value;
}
};
@@ -278,7 +278,7 @@ TEST_F(ValueHandle, CallbackVH_CallbackOnRAUW) {
RecordingVH RVH;
RVH = BitcastV.get();
EXPECT_EQ(0, RVH.DeletedCalls);
- EXPECT_EQ(NULL, RVH.AURWArgument);
+ EXPECT_EQ(nullptr, RVH.AURWArgument);
BitcastV->replaceAllUsesWith(ConstantV);
EXPECT_EQ(0, RVH.DeletedCalls);
EXPECT_EQ(ConstantV, RVH.AURWArgument);
@@ -291,21 +291,21 @@ TEST_F(ValueHandle, CallbackVH_DeletionCanRAUW) {
Value *AURWArgument;
LLVMContext *Context;
- RecoveringVH() : DeletedCalls(0), AURWArgument(NULL),
+ RecoveringVH() : DeletedCalls(0), AURWArgument(nullptr),
Context(&getGlobalContext()) {}
RecoveringVH(Value *V)
- : CallbackVH(V), DeletedCalls(0), AURWArgument(NULL),
+ : CallbackVH(V), DeletedCalls(0), AURWArgument(nullptr),
Context(&getGlobalContext()) {}
private:
virtual void deleted() {
getValPtr()->replaceAllUsesWith(Constant::getNullValue(Type::getInt32Ty(getGlobalContext())));
- setValPtr(NULL);
+ setValPtr(nullptr);
}
virtual void allUsesReplacedWith(Value *new_value) {
- ASSERT_TRUE(NULL != getValPtr());
+ ASSERT_TRUE(nullptr != getValPtr());
EXPECT_EQ(1U, getValPtr()->getNumUses());
- EXPECT_EQ(NULL, AURWArgument);
+ EXPECT_EQ(nullptr, AURWArgument);
AURWArgument = new_value;
}
};
@@ -368,8 +368,8 @@ TEST_F(ValueHandle, DestroyingOtherVHOnSameValueDoesntBreakIteration) {
WeakVH ShouldBeVisited2(BitcastV.get());
BitcastV.reset();
- EXPECT_EQ(NULL, static_cast<Value*>(ShouldBeVisited1));
- EXPECT_EQ(NULL, static_cast<Value*>(ShouldBeVisited2));
+ EXPECT_EQ(nullptr, static_cast<Value*>(ShouldBeVisited1));
+ EXPECT_EQ(nullptr, static_cast<Value*>(ShouldBeVisited2));
}
}
@@ -389,8 +389,8 @@ TEST_F(ValueHandle, AssertingVHCheckedLast) {
}
virtual void deleted() {
- *ToClear[0] = 0;
- *ToClear[1] = 0;
+ *ToClear[0] = nullptr;
+ *ToClear[1] = nullptr;
CallbackVH::deleted();
}
};
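The ValueHandle tests above pin down the three handle flavors: WeakVH silently nulls itself when its Value is destroyed, AssertingVH aborts if it still points at the Value when it dies, and CallbackVH delivers the deleted() / allUsesReplacedWith() hooks. A minimal usage sketch, assuming Owner is an owning smart pointer over an Instruction, like the fixture's BitcastV:

    WeakVH Weak(Owner.get());             // nulls itself when the value dies
    AssertingVH<Value> Hard(Owner.get()); // would abort if set at deletion
    Hard = nullptr;                       // so release it first...
    Owner.reset();                        // ...then destroy the value
    assert(static_cast<Value *>(Weak) == nullptr); // WeakVH saw the deletion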
diff --git a/unittests/IR/ValueMapTest.cpp b/unittests/IR/ValueMapTest.cpp
index 6fd87b1..0b7198f 100644
--- a/unittests/IR/ValueMapTest.cpp
+++ b/unittests/IR/ValueMapTest.cpp
@@ -40,21 +40,21 @@ TYPED_TEST_CASE(ValueMapTest, KeyTypes);
TYPED_TEST(ValueMapTest, Null) {
ValueMap<TypeParam*, int> VM1;
- VM1[NULL] = 7;
- EXPECT_EQ(7, VM1.lookup(NULL));
+ VM1[nullptr] = 7;
+ EXPECT_EQ(7, VM1.lookup(nullptr));
}
TYPED_TEST(ValueMapTest, FollowsValue) {
ValueMap<TypeParam*, int> VM;
VM[this->BitcastV.get()] = 7;
EXPECT_EQ(7, VM.lookup(this->BitcastV.get()));
- EXPECT_EQ(0, VM.count(this->AddV.get()));
+ EXPECT_EQ(0u, VM.count(this->AddV.get()));
this->BitcastV->replaceAllUsesWith(this->AddV.get());
EXPECT_EQ(7, VM.lookup(this->AddV.get()));
- EXPECT_EQ(0, VM.count(this->BitcastV.get()));
+ EXPECT_EQ(0u, VM.count(this->BitcastV.get()));
this->AddV.reset();
- EXPECT_EQ(0, VM.count(this->AddV.get()));
- EXPECT_EQ(0, VM.count(this->BitcastV.get()));
+ EXPECT_EQ(0u, VM.count(this->AddV.get()));
+ EXPECT_EQ(0u, VM.count(this->BitcastV.get()));
EXPECT_EQ(0U, VM.size());
}
@@ -90,7 +90,7 @@ TYPED_TEST(ValueMapTest, OperationsWork) {
EXPECT_EQ(this->AddV.get(), InsertResult1.first->first);
EXPECT_EQ(3, InsertResult1.first->second);
EXPECT_TRUE(InsertResult1.second);
- EXPECT_EQ(true, VM.count(this->AddV.get()));
+ EXPECT_EQ(1u, VM.count(this->AddV.get()));
std::pair<typename ValueMap<TypeParam*, int>::iterator, bool> InsertResult2 =
VM.insert(std::make_pair(this->AddV.get(), 5));
EXPECT_EQ(this->AddV.get(), InsertResult2.first->first);
@@ -169,7 +169,7 @@ TYPED_TEST(ValueMapTest, DefaultCollisionBehavior) {
VM[this->BitcastV.get()] = 7;
VM[this->AddV.get()] = 9;
this->BitcastV->replaceAllUsesWith(this->AddV.get());
- EXPECT_EQ(0, VM.count(this->BitcastV.get()));
+ EXPECT_EQ(0u, VM.count(this->BitcastV.get()));
EXPECT_EQ(9, VM.lookup(this->AddV.get()));
}
@@ -177,10 +177,10 @@ TYPED_TEST(ValueMapTest, ConfiguredCollisionBehavior) {
// TODO: Implement this when someone needs it.
}
-template<typename KeyT>
-struct LockMutex : ValueMapConfig<KeyT> {
+template<typename KeyT, typename MutexT>
+struct LockMutex : ValueMapConfig<KeyT, MutexT> {
struct ExtraData {
- sys::Mutex *M;
+ MutexT *M;
bool *CalledRAUW;
bool *CalledDeleted;
};
@@ -192,15 +192,15 @@ struct LockMutex : ValueMapConfig<KeyT> {
*Data.CalledDeleted = true;
EXPECT_FALSE(Data.M->tryacquire()) << "Mutex should already be locked.";
}
- static sys::Mutex *getMutex(const ExtraData &Data) { return Data.M; }
+ static MutexT *getMutex(const ExtraData &Data) { return Data.M; }
};
#if LLVM_ENABLE_THREADS
TYPED_TEST(ValueMapTest, LocksMutex) {
sys::Mutex M(false); // Not recursive.
bool CalledRAUW = false, CalledDeleted = false;
- typename LockMutex<TypeParam*>::ExtraData Data =
- {&M, &CalledRAUW, &CalledDeleted};
- ValueMap<TypeParam*, int, LockMutex<TypeParam*> > VM(Data);
+ typedef LockMutex<TypeParam*, sys::Mutex> ConfigType;
+ typename ConfigType::ExtraData Data = {&M, &CalledRAUW, &CalledDeleted};
+ ValueMap<TypeParam*, int, ConfigType> VM(Data);
VM[this->BitcastV.get()] = 7;
this->BitcastV->replaceAllUsesWith(this->AddV.get());
this->AddV.reset();
@@ -218,7 +218,7 @@ TYPED_TEST(ValueMapTest, NoFollowRAUW) {
ValueMap<TypeParam*, int, NoFollow<TypeParam*> > VM;
VM[this->BitcastV.get()] = 7;
EXPECT_EQ(7, VM.lookup(this->BitcastV.get()));
- EXPECT_EQ(0, VM.count(this->AddV.get()));
+ EXPECT_EQ(0u, VM.count(this->AddV.get()));
this->BitcastV->replaceAllUsesWith(this->AddV.get());
EXPECT_EQ(7, VM.lookup(this->BitcastV.get()));
EXPECT_EQ(0, VM.lookup(this->AddV.get()));
@@ -284,11 +284,11 @@ TYPED_TEST(ValueMapTest, SurvivesModificationByConfig) {
// Now the ModifyingConfig can modify the Map inside a callback.
VM[this->BitcastV.get()] = 7;
this->BitcastV->replaceAllUsesWith(this->AddV.get());
- EXPECT_FALSE(VM.count(this->BitcastV.get()));
- EXPECT_FALSE(VM.count(this->AddV.get()));
+ EXPECT_EQ(0u, VM.count(this->BitcastV.get()));
+ EXPECT_EQ(0u, VM.count(this->AddV.get()));
VM[this->AddV.get()] = 7;
this->AddV.reset();
- EXPECT_FALSE(VM.count(this->AddV.get()));
+ EXPECT_EQ(0u, VM.count(this->AddV.get()));
}
}
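The LockMutex rewrite above threads the mutex type through ValueMapConfig, so a configuration can supply any lock type instead of hard-coding sys::Mutex. A configuration is a set of static hooks plus an ExtraData payload; a sketch of a counting config under the patched ValueMapConfig<KeyT, MutexT> shape, assuming the usual onRAUW/onDelete hook names from llvm/IR/ValueMap.h:

    template <typename KeyT, typename MutexT = sys::Mutex>
    struct CountingConfig : ValueMapConfig<KeyT, MutexT> {
      struct ExtraData { unsigned *RAUWs, *Deletions; };
      template <typename ExtraDataT>
      static void onRAUW(const ExtraDataT &D, KeyT, KeyT) { ++*D.RAUWs; }
      template <typename ExtraDataT>
      static void onDelete(const ExtraDataT &D, KeyT) { ++*D.Deletions; }
    };

    // Usage, mirroring the test:
    //   CountingConfig<Value*>::ExtraData Data = {&NumRAUWs, &NumDeletes};
    //   ValueMap<Value*, int, CountingConfig<Value*> > VM(Data);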
diff --git a/unittests/IR/ValueTest.cpp b/unittests/IR/ValueTest.cpp
index d92bc82..61e44a9 100644
--- a/unittests/IR/ValueTest.cpp
+++ b/unittests/IR/ValueTest.cpp
@@ -34,7 +34,7 @@ TEST(ValueTest, UsedInBasicBlock) {
" ret void\n"
"}\n";
SMDiagnostic Err;
- Module *M = ParseAssemblyString(ModuleString, NULL, Err, C);
+ Module *M = ParseAssemblyString(ModuleString, nullptr, Err, C);
Function *F = M->getFunction("f");
@@ -56,7 +56,7 @@ TEST(GlobalTest, CreateAddressSpace) {
GlobalValue::ExternalLinkage,
Constant::getAllOnesValue(Int32Ty),
"dummy",
- 0,
+ nullptr,
GlobalVariable::NotThreadLocal,
1);
@@ -74,7 +74,7 @@ TEST(GlobalTest, CreateAddressSpace) {
GlobalValue::ExternalLinkage,
Constant::getAllOnesValue(Int32Ty),
"dummy_cast",
- 0,
+ nullptr,
GlobalVariable::NotThreadLocal,
1);
diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp
index 252bdd3..71e3168 100644
--- a/unittests/IR/VerifierTest.cpp
+++ b/unittests/IR/VerifierTest.cpp
@@ -44,23 +44,6 @@ TEST(VerifierTest, Branch_i1) {
EXPECT_TRUE(verifyFunction(*F));
}
-TEST(VerifierTest, AliasUnnamedAddr) {
- LLVMContext &C = getGlobalContext();
- Module M("M", C);
- Type *Ty = Type::getInt8Ty(C);
- Constant *Init = Constant::getNullValue(Ty);
- GlobalVariable *Aliasee = new GlobalVariable(M, Ty, true,
- GlobalValue::ExternalLinkage,
- Init, "foo");
- auto *GA = GlobalAlias::create(GlobalValue::ExternalLinkage, "bar", Aliasee);
- GA->setUnnamedAddr(true);
- std::string Error;
- raw_string_ostream ErrorOS(Error);
- EXPECT_TRUE(verifyModule(M, &ErrorOS));
- EXPECT_TRUE(
- StringRef(ErrorOS.str()).startswith("Alias cannot have unnamed_addr"));
-}
-
TEST(VerifierTest, InvalidRetAttribute) {
LLVMContext &C = getGlobalContext();
Module M("M", C);
diff --git a/unittests/IR/WaymarkTest.cpp b/unittests/IR/WaymarkTest.cpp
index 9a9b4a2..8e3cd45 100644
--- a/unittests/IR/WaymarkTest.cpp
+++ b/unittests/IR/WaymarkTest.cpp
@@ -31,7 +31,7 @@ TEST(WaymarkTest, NativeArray) {
FunctionType *FT = FunctionType::get(Type::getVoidTy(getGlobalContext()), true);
Function *F = Function::Create(FT, GlobalValue::ExternalLinkage);
const CallInst *A = CallInst::Create(F, makeArrayRef(values));
- ASSERT_NE(A, (const CallInst*)NULL);
+ ASSERT_NE(A, (const CallInst*)nullptr);
ASSERT_EQ(1U + 22, A->getNumOperands());
const Use *U = &A->getOperandUse(0);
const Use *Ue = &A->getOperandUse(22);
diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp
index 1d5db36..4ccced1 100644
--- a/unittests/Linker/LinkModulesTest.cpp
+++ b/unittests/Linker/LinkModulesTest.cpp
@@ -36,7 +36,7 @@ protected:
ArrayType *AT = ArrayType::get(Type::getInt8PtrTy(Ctx), 3);
GV = new GlobalVariable(*M.get(), AT, false /*=isConstant*/,
- GlobalValue::InternalLinkage, 0, "switch.bas");
+ GlobalValue::InternalLinkage, nullptr,"switch.bas");
// Global Initializer
std::vector<Constant *> Init;
@@ -88,7 +88,7 @@ TEST_F(LinkModuleTest, BlockAddress) {
Builder.CreateRet(ConstantPointerNull::get(Type::getInt8PtrTy(Ctx)));
Module *LinkedModule = new Module("MyModuleLinked", Ctx);
- Linker::LinkModules(LinkedModule, M.get(), Linker::PreserveSource, 0);
+ Linker::LinkModules(LinkedModule, M.get(), Linker::PreserveSource, nullptr);
// Delete the original module.
M.reset();
@@ -138,16 +138,16 @@ TEST_F(LinkModuleTest, EmptyModule) {
GlobalVariable *GV =
new GlobalVariable(*InternalM, STy, false /*=isConstant*/,
- GlobalValue::InternalLinkage, 0, "g");
+ GlobalValue::InternalLinkage, nullptr, "g");
GV->setInitializer(ConstantStruct::get(STy, F));
Module *EmptyM = new Module("EmptyModule1", Ctx);
- Linker::LinkModules(EmptyM, InternalM, Linker::PreserveSource, 0);
+ Linker::LinkModules(EmptyM, InternalM, Linker::PreserveSource, nullptr);
delete EmptyM;
EmptyM = new Module("EmptyModule2", Ctx);
- Linker::LinkModules(InternalM, EmptyM, Linker::PreserveSource, 0);
+ Linker::LinkModules(InternalM, EmptyM, Linker::PreserveSource, nullptr);
delete EmptyM;
delete InternalM;
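In the LinkModules calls above, the final argument is an optional out-parameter for the error message, so nullptr simply discards the diagnostic. A sketch that captures it instead, assuming the same std::string* signature the test compiles against and two in-scope Module pointers Dst and Src:

    std::string Message;
    if (Linker::LinkModules(Dst, Src, Linker::PreserveSource, &Message))
      errs() << "link failed: " << Message << "\n"; // true means failure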
diff --git a/unittests/MC/CMakeLists.txt b/unittests/MC/CMakeLists.txt
index 0e4782c..e2beab2 100644
--- a/unittests/MC/CMakeLists.txt
+++ b/unittests/MC/CMakeLists.txt
@@ -1,11 +1,9 @@
set(LLVM_LINK_COMPONENTS
- MC
- )
-
-set(MCSources
- MCAtomTest.cpp
+ MCAnalysis
)
add_llvm_unittest(MCTests
- ${MCSources}
+ MCAtomTest.cpp
+ StringTableBuilderTest.cpp
+ YAMLTest.cpp
)
diff --git a/unittests/MC/MCAtomTest.cpp b/unittests/MC/MCAtomTest.cpp
index 17b056c..16228b5 100644
--- a/unittests/MC/MCAtomTest.cpp
+++ b/unittests/MC/MCAtomTest.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAtom.h"
-#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCAnalysis/MCAtom.h"
+#include "llvm/MC/MCAnalysis/MCModule.h"
#include "gtest/gtest.h"
namespace llvm {
diff --git a/unittests/MC/Makefile b/unittests/MC/Makefile
index 4c25697..07a608e 100644
--- a/unittests/MC/Makefile
+++ b/unittests/MC/Makefile
@@ -9,7 +9,7 @@
LEVEL = ../..
TESTNAME = MC
-LINK_COMPONENTS := MC
+LINK_COMPONENTS := MCAnalysis
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Object/StringTableBuilderTest.cpp b/unittests/MC/StringTableBuilderTest.cpp
index 130eb4a..d30dc62 100644
--- a/unittests/Object/StringTableBuilderTest.cpp
+++ b/unittests/MC/StringTableBuilderTest.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/StringTableBuilder.h"
#include "gtest/gtest.h"
-#include "llvm/Object/StringTableBuilder.h"
#include <string>
using namespace llvm;
diff --git a/unittests/Object/YAMLTest.cpp b/unittests/MC/YAMLTest.cpp
index 1eb1113..09709ad 100644
--- a/unittests/Object/YAMLTest.cpp
+++ b/unittests/MC/YAMLTest.cpp
@@ -7,14 +7,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/YAML.h"
+#include "llvm/MC/YAML.h"
#include "llvm/Support/YAMLTraits.h"
#include "gtest/gtest.h"
using namespace llvm;
struct BinaryHolder {
- object::yaml::BinaryRef Binary;
+ yaml::BinaryRef Binary;
};
namespace llvm {
diff --git a/unittests/Makefile b/unittests/Makefile
index 37f6540..603e7d5 100644
--- a/unittests/Makefile
+++ b/unittests/Makefile
@@ -10,7 +10,7 @@
LEVEL = ..
PARALLEL_DIRS = ADT Analysis Bitcode CodeGen DebugInfo ExecutionEngine IR \
- LineEditor Linker MC Object Option Support Transforms
+ LineEditor Linker MC Option Support Transforms
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Object/CMakeLists.txt b/unittests/Object/CMakeLists.txt
deleted file mode 100644
index 580a894..0000000
--- a/unittests/Object/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Object
- Support
- )
-
-add_llvm_unittest(ObjectTests
- StringTableBuilderTest.cpp
- YAMLTest.cpp
- )
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 0ea9310..97c5c43 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -29,7 +29,10 @@ add_llvm_unittest(SupportTests
ProcessTest.cpp
ProgramTest.cpp
RegexTest.cpp
+ ScaledNumberTest.cpp
SourceMgrTest.cpp
+ SpecialCaseListTest.cpp
+ StringPool.cpp
SwapByteOrderTest.cpp
ThreadLocalTest.cpp
TimeValueTest.cpp
diff --git a/unittests/Support/Casting.cpp b/unittests/Support/Casting.cpp
index 228c90b..88c7d19 100644
--- a/unittests/Support/Casting.cpp
+++ b/unittests/Support/Casting.cpp
@@ -18,7 +18,7 @@ namespace llvm {
// Used to test illegal cast. If a cast doesn't match any of the "real" ones,
// it will match this one.
struct IllegalCast;
-template <typename T> IllegalCast *cast(...) { return 0; }
+template <typename T> IllegalCast *cast(...) { return nullptr; }
// set up two example classes
// with conversion facility
@@ -90,7 +90,7 @@ static_assert(std::is_same<simplify_type<foo *>::SimpleType, foo *>::value,
namespace {
-const foo *null_foo = NULL;
+const foo *null_foo = nullptr;
bar B;
extern bar &B1;
@@ -175,7 +175,7 @@ TEST(CastingTest, dyn_cast_or_null) {
const bar *B2 = &B;
} // anonymous namespace
-bar *llvm::fub() { return 0; }
+bar *llvm::fub() { return nullptr; }
namespace {
namespace inferred_upcasting {
@@ -203,7 +203,7 @@ TEST(CastingTest, UpcastIsInferred) {
Derived D;
EXPECT_TRUE(isa<Base>(D));
Base *BP = dyn_cast<Base>(&D);
- EXPECT_TRUE(BP != NULL);
+ EXPECT_TRUE(BP != nullptr);
}
diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp
index b0f1eb1..b2d71ab 100644
--- a/unittests/Support/CommandLineTest.cpp
+++ b/unittests/Support/CommandLineTest.cpp
@@ -23,7 +23,7 @@ class TempEnvVar {
TempEnvVar(const char *name, const char *value)
: name(name) {
const char *old_value = getenv(name);
- EXPECT_EQ(NULL, old_value) << old_value;
+ EXPECT_EQ(nullptr, old_value) << old_value;
#if HAVE_SETENV
setenv(name, value, true);
#else
diff --git a/unittests/Support/ConvertUTFTest.cpp b/unittests/Support/ConvertUTFTest.cpp
index 13ea75b..16c9beb 100644
--- a/unittests/Support/ConvertUTFTest.cpp
+++ b/unittests/Support/ConvertUTFTest.cpp
@@ -10,6 +10,8 @@
#include "llvm/Support/ConvertUTF.h"
#include "gtest/gtest.h"
#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -63,3 +65,1600 @@ TEST(ConvertUTFTest, HasUTF16BOM) {
HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
EXPECT_FALSE(HasBOM);
}
+
+struct ConvertUTFResultContainer {
+ ConversionResult ErrorCode;
+ std::vector<unsigned> UnicodeScalars;
+
+ ConvertUTFResultContainer(ConversionResult ErrorCode)
+ : ErrorCode(ErrorCode) {}
+
+ ConvertUTFResultContainer
+ withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
+ unsigned US2 = 0x110000, unsigned US3 = 0x110000,
+ unsigned US4 = 0x110000, unsigned US5 = 0x110000,
+ unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
+ ConvertUTFResultContainer Result(*this);
+ if (US0 != 0x110000)
+ Result.UnicodeScalars.push_back(US0);
+ if (US1 != 0x110000)
+ Result.UnicodeScalars.push_back(US1);
+ if (US2 != 0x110000)
+ Result.UnicodeScalars.push_back(US2);
+ if (US3 != 0x110000)
+ Result.UnicodeScalars.push_back(US3);
+ if (US4 != 0x110000)
+ Result.UnicodeScalars.push_back(US4);
+ if (US5 != 0x110000)
+ Result.UnicodeScalars.push_back(US5);
+ if (US6 != 0x110000)
+ Result.UnicodeScalars.push_back(US6);
+ if (US7 != 0x110000)
+ Result.UnicodeScalars.push_back(US7);
+ return Result;
+ }
+};
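+
+// Note: the default 0x110000 in withScalars() works as an "unset" marker
+// because it is one past the largest Unicode scalar value (U+10FFFF) and so
+// can never appear as a real decoded scalar.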
+
+std::pair<ConversionResult, std::vector<unsigned>>
+ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
+ const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
+
+ const UTF8 *SourceNext = SourceStart;
+ std::vector<UTF32> Decoded(S.size(), 0);
+ UTF32 *TargetStart = Decoded.data();
+
+ auto ErrorCode =
+ ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
+ Decoded.data() + Decoded.size(), lenientConversion);
+
+ Decoded.resize(TargetStart - Decoded.data());
+
+ return std::make_pair(ErrorCode, Decoded);
+}
+
+std::pair<ConversionResult, std::vector<unsigned>>
+ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
+ const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
+
+ const UTF8 *SourceNext = SourceStart;
+ std::vector<UTF32> Decoded(S.size(), 0);
+ UTF32 *TargetStart = Decoded.data();
+
+ auto ErrorCode = ConvertUTF8toUTF32Partial(
+ &SourceNext, SourceStart + S.size(), &TargetStart,
+ Decoded.data() + Decoded.size(), lenientConversion);
+
+ Decoded.resize(TargetStart - Decoded.data());
+
+ return std::make_pair(ErrorCode, Decoded);
+}
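+
+// Both helpers rely on the ConvertUTF convention that the source and target
+// cursors are advanced in place: after the call, TargetStart - Decoded.data()
+// is the number of scalars actually written, so resize() trims the tail.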
+
+::testing::AssertionResult
+CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
+ StringRef S, bool Partial = false) {
+ ConversionResult ErrorCode;
+ std::vector<unsigned> Decoded;
+ if (!Partial)
+ std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
+ else
+ std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
+ if (Expected.ErrorCode != ErrorCode)
+ return ::testing::AssertionFailure() << "Expected error code "
+ << Expected.ErrorCode << ", actual "
+ << ErrorCode;
+
+ if (Expected.UnicodeScalars != Decoded)
+ return ::testing::AssertionFailure()
+ << "Expected lenient decoded result:\n"
+ << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
+ << "Actual result:\n" << ::testing::PrintToString(Decoded);
+
+ return ::testing::AssertionSuccess();
+}
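+
+// Returning ::testing::AssertionResult rather than asserting internally lets
+// EXPECT_TRUE attach the expected/actual scalar dumps to the failing call
+// site.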
+
+TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
+
+ //
+ // 1-byte sequences
+ //
+
+ // U+0041 LATIN CAPITAL LETTER A
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
+
+ //
+ // 2-byte sequences
+ //
+
+ // U+0283 LATIN SMALL LETTER ESH
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
+ "\xca\x83"));
+
+ // U+03BA GREEK SMALL LETTER KAPPA
+ // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
+ // U+03C3 GREEK SMALL LETTER SIGMA
+ // U+03BC GREEK SMALL LETTER MU
+ // U+03B5 GREEK SMALL LETTER EPSILON
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK)
+ .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
+ "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
+
+ //
+ // 3-byte sequences
+ //
+
+ // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
+ // U+6587 CJK UNIFIED IDEOGRAPH-6587
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
+ "\xe4\xbe\x8b\xe6\x96\x87"));
+
+ // U+D55C HANGUL SYLLABLE HAN
+ // U+AE00 HANGUL SYLLABLE GEUL
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
+ "\xed\x95\x9c\xea\xb8\x80"));
+
+ // U+1112 HANGUL CHOSEONG HIEUH
+ // U+1161 HANGUL JUNGSEONG A
+ // U+11AB HANGUL JONGSEONG NIEUN
+ // U+1100 HANGUL CHOSEONG KIYEOK
+ // U+1173 HANGUL JUNGSEONG EU
+ // U+11AF HANGUL JONGSEONG RIEUL
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK)
+ .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
+ "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
+ "\xe1\x86\xaf"));
+
+ //
+ // 4-byte sequences
+ //
+
+ // U+E0100 VARIATION SELECTOR-17
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
+ "\xf3\xa0\x84\x80"));
+
+ //
+ // First possible sequence of a certain length
+ //
+
+ // U+0000 NULL
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
+ StringRef("\x00", 1)));
+
+ // U+0080 PADDING CHARACTER
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
+ "\xc2\x80"));
+
+ // U+0800 SAMARITAN LETTER ALAF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
+ "\xe0\xa0\x80"));
+
+ // U+10000 LINEAR B SYLLABLE B008 A
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
+ "\xf0\x90\x80\x80"));
+
+ // U+200000 (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x88\x80\x80\x80"));
+
+ // U+4000000 (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x84\x80\x80\x80\x80"));
+
+ //
+ // Last possible sequence of a certain length
+ //
+
+ // U+007F DELETE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
+
+ // U+07FF (unassigned)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
+ "\xdf\xbf"));
+
+ // U+FFFF (noncharacter)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
+ "\xef\xbf\xbf"));
+
+ // U+1FFFFF (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf7\xbf\xbf\xbf"));
+
+ // U+3FFFFFF (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfb\xbf\xbf\xbf\xbf"));
+
+ // U+7FFFFFFF (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfd\xbf\xbf\xbf\xbf\xbf"));
+
+ //
+ // Other boundary conditions
+ //
+
+ // U+D7FF (unassigned)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
+ "\xed\x9f\xbf"));
+
+ // U+E000 (private use)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
+ "\xee\x80\x80"));
+
+ // U+FFFD REPLACEMENT CHARACTER
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
+ "\xef\xbf\xbd"));
+
+ // U+10FFFF (noncharacter)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
+ "\xf4\x8f\xbf\xbf"));
+
+ // U+110000 (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf4\x90\x80\x80"));
+
+ //
+ // Unexpected continuation bytes
+ //
+
+ // A sequence of unexpected continuation bytes that don't follow a first
+ // byte; every byte is a maximal subpart.
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\x80\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xbf\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\x80\xbf\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\x80\xbf\x80\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\x80\xbf\x82\xbf\xaa"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xaa\xb0\xbb\xbf\xaa\xa0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
+
+ // All continuation bytes (0x80--0xbf).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+ "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+ "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
+
+ //
+ // Lonely start bytes
+ //
+
+ // Start bytes of 2-byte sequences (0xc0--0xdf).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+ "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
+ "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
+ "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
+ "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
+
+ // Start bytes of 3-byte sequences (0xe0--0xef).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
+ "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
+
+ // Start bytes of 4-byte sequences (0xf0--0xf7).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
+ 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
+
+ // Start bytes of 5-byte sequences (0xf8--0xfb).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\xf9\xfa\xfb"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
+
+ // Start bytes of 6-byte sequences (0xfc--0xfd).
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfc\xfd"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xfc\x20\xfd\x20"));
+
+ //
+ // Other bytes (0xc0--0xc1, 0xfe--0xff).
+ //
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xc0\xc1\xfe\xff"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfe\xfe\xff\xff"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfe\x80\x80\x80\x80\x80"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xff\x80\x80\x80\x80\x80"));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
+ 0xfffd, 0x0020, 0xfffd, 0x0020),
+ "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
+
+ //
+ // Sequences with one continuation byte missing
+ //
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xe0\xa0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xe0\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xe1\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xec\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xed\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xed\x9f"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xee\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xef\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf0\x90\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf0\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf1\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf3\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf4\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf4\x8f\xbf"));
+
+ // Overlong sequences with one trailing byte missing.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xc0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xc1"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xe0\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xe0\x9f"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x8f\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x80\x80\x80\x80"));
+
+ // Sequences that represent surrogates with one trailing byte missing.
+ // High surrogates
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xa0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xac"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xaf"));
+ // Low surrogates
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xb0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xb4"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xed\xbf"));
+
+ // Ill-formed 4-byte sequences.
+ // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+1100xx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf4\x90\x80"));
+ // U+13FBxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf4\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf5\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf6\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf7\x80\x80"));
+ // U+1FFBxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf7\xbf\xbf"));
+
+ // Ill-formed 5-byte sequences.
+ // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+2000xx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x88\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\xbf\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf9\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfa\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfb\x80\x80\x80"));
+ // U+3FFFFxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfb\xbf\xbf\xbf"));
+
+ // Ill-formed 6-byte sequences.
+ // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+40000xx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x84\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\xbf\xbf\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfd\x80\x80\x80\x80"));
+ // U+7FFFFFxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfd\xbf\xbf\xbf\xbf"));
+
+ //
+ // Sequences with two continuation bytes missing
+ //
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf0\x90"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf0\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf1\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf3\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf4\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
+ "\xf4\x8f"));
+
+ // Overlong sequences with two trailing bytes missing.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf0\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf0\x8f"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x80\x80\x80"));
+
+ // Sequences that represent surrogates with two trailing bytes missing.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
+
+ // Ill-formed 4-byte sequences.
+ // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+110yxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf4\x90"));
+ // U+13Fyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf4\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf5\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf6\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf7\x80"));
+ // U+1FFyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf7\xbf"));
+
+ // Ill-formed 5-byte sequences.
+ // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+200yxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x88\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf8\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xf9\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfa\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfb\x80\x80"));
+ // U+3FFFyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfb\xbf\xbf"));
+
+ // Ill-formed 6-byte sequences.
+ // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+4000yxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x84\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\xbf\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfd\x80\x80\x80"));
+ // U+7FFFFyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfd\xbf\xbf\xbf"));
+
+ //
+ // Sequences with three continuation bytes missing
+ //
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
+
+ // Broken overlong sequences.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf8\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x80\x80"));
+
+ // Ill-formed 4-byte sequences.
+ // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+14yyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
+ // U+1Cyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
+
+ // Ill-formed 5-byte sequences.
+ // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+20yyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf8\x88"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf8\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xf9\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfa\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfb\x80"));
+ // U+3FCyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfb\xbf"));
+
+ // Ill-formed 6-byte sequences.
+ // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+400yyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x84\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfc\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfd\x80\x80"));
+ // U+7FFCyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xfd\xbf\xbf"));
+
+ //
+ // Sequences with four continuation bytes missing
+ //
+
+ // Ill-formed 5-byte sequences.
+ // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+uzyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
+ // U+3zyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
+
+ // Broken overlong sequences.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfc\x80"));
+
+ // Ill-formed 6-byte sequences.
+ // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+uzzyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfc\x84"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfc\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfd\x80"));
+ // U+7Fzzyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xfd\xbf"));
+
+ //
+ // Sequences with five continuation bytes missing
+ //
+
+ // Ill-formed 6-byte sequences.
+ // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
+ // U+uzzyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
+ // U+uuzzyyxx (invalid)
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
+
+ //
+ // Consecutive sequences with trailing bytes missing
+ //
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xc0" "\xe0\x80" "\xf0\x80\x80"
+ "\xf8\x80\x80\x80"
+ "\xfc\x80\x80\x80\x80"
+ "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
+ "\xfb\xbf\xbf\xbf"
+ "\xfd\xbf\xbf\xbf\xbf"));
+
+ //
+ // Overlong UTF-8 sequences
+ //
+
+ // U+002F SOLIDUS
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
+
+ // Overlong sequences of the above.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xc0\xaf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xe0\x80\xaf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x80\x80\xaf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x80\x80\x80\xaf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x80\x80\x80\x80\xaf"));
+
+ // U+0000 NULL
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
+ StringRef("\x00", 1)));
+
+ // Overlong sequences of the above.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xc0\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xe0\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x80\x80\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x80\x80\x80\x80\x80"));
+
+ // Other overlong sequences.
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xc0\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xc1\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
+ "\xc1\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xe0\x9f\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xa0\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x8f\x80\x80"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf0\x8f\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xf8\x87\xbf\xbf\xbf"));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xfc\x83\xbf\xbf\xbf\xbf"));
+
+ //
+ // Isolated surrogates
+ //
+
+ // Unicode 6.3.0:
+ //
+ // D71. High-surrogate code point: A Unicode code point in the range
+ // U+D800 to U+DBFF.
+ //
+ // D73. Low-surrogate code point: A Unicode code point in the range
+ // U+DC00 to U+DFFF.
+
+ // Note: U+E0100 is <DB40 DD00> in UTF-16.
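+ // For reference: U+E0100 - 0x10000 = 0xD0100, so the high surrogate is
+ // 0xD800 + (0xD0100 >> 10) = 0xDB40 and the low surrogate is
+ // 0xDC00 + (0xD0100 & 0x3FF) = 0xDD00.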
+
+ // High surrogates
+
+ // U+D800
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xa0\x80"));
+
+ // U+DB40
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xac\xa0"));
+
+ // U+DBFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xaf\xbf"));
+
+ // Low surrogates
+
+ // U+DC00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xb0\x80"));
+
+ // U+DD00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xb4\x80"));
+
+ // U+DFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd),
+ "\xed\xbf\xbf"));
+
+ // Surrogate pairs
+
+ // U+D800 U+DC00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xa0\x80\xed\xb0\x80"));
+
+ // U+D800 U+DD00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xa0\x80\xed\xb4\x80"));
+
+ // U+D800 U+DFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xa0\x80\xed\xbf\xbf"));
+
+ // U+DB40 U+DC00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xac\xa0\xed\xb0\x80"));
+
+ // U+DB40 U+DD00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xac\xa0\xed\xb4\x80"));
+
+ // U+DB40 U+DFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xac\xa0\xed\xbf\xbf"));
+
+ // U+DBFF U+DC00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xaf\xbf\xed\xb0\x80"));
+
+ // U+DBFF U+DD00
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xaf\xbf\xed\xb4\x80"));
+
+ // U+DBFF U+DFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceIllegal)
+ .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
+ "\xed\xaf\xbf\xed\xbf\xbf"));
+
+ //
+ // Noncharacters
+ //
+
+ // Unicode 6.3.0:
+ //
+ // D14. Noncharacter: A code point that is permanently reserved for
+ // internal use and that should never be interchanged. Noncharacters
+ // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 10_16)
+ // and the values U+FDD0..U+FDEF.
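+ //
+ // Noncharacters are nevertheless valid Unicode scalar values, so the
+ // converter reports conversionOK and passes them through unchanged.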
+
+ // U+FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
+ "\xef\xbf\xbe"));
+
+ // U+FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
+ "\xef\xbf\xbf"));
+
+ // U+1FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
+ "\xf0\x9f\xbf\xbe"));
+
+ // U+1FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
+ "\xf0\x9f\xbf\xbf"));
+
+ // U+2FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
+ "\xf0\xaf\xbf\xbe"));
+
+ // U+2FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
+ "\xf0\xaf\xbf\xbf"));
+
+ // U+3FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
+ "\xf0\xbf\xbf\xbe"));
+
+ // U+3FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
+ "\xf0\xbf\xbf\xbf"));
+
+ // U+4FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
+ "\xf1\x8f\xbf\xbe"));
+
+ // U+4FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
+ "\xf1\x8f\xbf\xbf"));
+
+ // U+5FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
+ "\xf1\x9f\xbf\xbe"));
+
+ // U+5FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
+ "\xf1\x9f\xbf\xbf"));
+
+ // U+6FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
+ "\xf1\xaf\xbf\xbe"));
+
+ // U+6FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
+ "\xf1\xaf\xbf\xbf"));
+
+ // U+7FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
+ "\xf1\xbf\xbf\xbe"));
+
+ // U+7FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
+ "\xf1\xbf\xbf\xbf"));
+
+ // U+8FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
+ "\xf2\x8f\xbf\xbe"));
+
+ // U+8FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
+ "\xf2\x8f\xbf\xbf"));
+
+ // U+9FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
+ "\xf2\x9f\xbf\xbe"));
+
+ // U+9FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
+ "\xf2\x9f\xbf\xbf"));
+
+ // U+AFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
+ "\xf2\xaf\xbf\xbe"));
+
+ // U+AFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
+ "\xf2\xaf\xbf\xbf"));
+
+ // U+BFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
+ "\xf2\xbf\xbf\xbe"));
+
+ // U+BFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
+ "\xf2\xbf\xbf\xbf"));
+
+ // U+CFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
+ "\xf3\x8f\xbf\xbe"));
+
+ // U+CFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+      ConvertUTFResultContainer(conversionOK).withScalars(0xcffff),
+ "\xf3\x8f\xbf\xbf"));
+
+ // U+DFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
+ "\xf3\x9f\xbf\xbe"));
+
+ // U+DFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
+ "\xf3\x9f\xbf\xbf"));
+
+ // U+EFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
+ "\xf3\xaf\xbf\xbe"));
+
+ // U+EFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
+ "\xf3\xaf\xbf\xbf"));
+
+ // U+FFFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
+ "\xf3\xbf\xbf\xbe"));
+
+ // U+FFFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
+ "\xf3\xbf\xbf\xbf"));
+
+ // U+10FFFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
+ "\xf4\x8f\xbf\xbe"));
+
+ // U+10FFFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
+ "\xf4\x8f\xbf\xbf"));
+
+ // U+FDD0
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
+ "\xef\xb7\x90"));
+
+ // U+FDD1
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
+ "\xef\xb7\x91"));
+
+ // U+FDD2
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
+ "\xef\xb7\x92"));
+
+ // U+FDD3
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
+ "\xef\xb7\x93"));
+
+ // U+FDD4
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
+ "\xef\xb7\x94"));
+
+ // U+FDD5
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
+ "\xef\xb7\x95"));
+
+ // U+FDD6
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
+ "\xef\xb7\x96"));
+
+ // U+FDD7
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
+ "\xef\xb7\x97"));
+
+ // U+FDD8
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
+ "\xef\xb7\x98"));
+
+ // U+FDD9
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
+ "\xef\xb7\x99"));
+
+ // U+FDDA
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
+ "\xef\xb7\x9a"));
+
+ // U+FDDB
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
+ "\xef\xb7\x9b"));
+
+ // U+FDDC
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
+ "\xef\xb7\x9c"));
+
+ // U+FDDD
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
+ "\xef\xb7\x9d"));
+
+ // U+FDDE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
+ "\xef\xb7\x9e"));
+
+ // U+FDDF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
+ "\xef\xb7\x9f"));
+
+ // U+FDE0
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
+ "\xef\xb7\xa0"));
+
+ // U+FDE1
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
+ "\xef\xb7\xa1"));
+
+ // U+FDE2
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
+ "\xef\xb7\xa2"));
+
+ // U+FDE3
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
+ "\xef\xb7\xa3"));
+
+ // U+FDE4
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
+ "\xef\xb7\xa4"));
+
+ // U+FDE5
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
+ "\xef\xb7\xa5"));
+
+ // U+FDE6
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
+ "\xef\xb7\xa6"));
+
+ // U+FDE7
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
+ "\xef\xb7\xa7"));
+
+ // U+FDE8
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
+ "\xef\xb7\xa8"));
+
+ // U+FDE9
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
+ "\xef\xb7\xa9"));
+
+ // U+FDEA
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
+ "\xef\xb7\xaa"));
+
+ // U+FDEB
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
+ "\xef\xb7\xab"));
+
+ // U+FDEC
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
+ "\xef\xb7\xac"));
+
+ // U+FDED
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
+ "\xef\xb7\xad"));
+
+ // U+FDEE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
+ "\xef\xb7\xae"));
+
+ // U+FDEF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
+ "\xef\xb7\xaf"));
+
+ // U+FDF0
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
+ "\xef\xb7\xb0"));
+
+ // U+FDF1
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
+ "\xef\xb7\xb1"));
+
+ // U+FDF2
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
+ "\xef\xb7\xb2"));
+
+ // U+FDF3
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
+ "\xef\xb7\xb3"));
+
+ // U+FDF4
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
+ "\xef\xb7\xb4"));
+
+ // U+FDF5
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
+ "\xef\xb7\xb5"));
+
+ // U+FDF6
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
+ "\xef\xb7\xb6"));
+
+ // U+FDF7
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
+ "\xef\xb7\xb7"));
+
+ // U+FDF8
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
+ "\xef\xb7\xb8"));
+
+ // U+FDF9
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
+ "\xef\xb7\xb9"));
+
+ // U+FDFA
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
+ "\xef\xb7\xba"));
+
+ // U+FDFB
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
+ "\xef\xb7\xbb"));
+
+ // U+FDFC
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
+ "\xef\xb7\xbc"));
+
+ // U+FDFD
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
+ "\xef\xb7\xbd"));
+
+ // U+FDFE
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
+ "\xef\xb7\xbe"));
+
+ // U+FDFF
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
+ "\xef\xb7\xbf"));
+}
+
+TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
+ // U+0041 LATIN CAPITAL LETTER A
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
+ "\x41", true));
+
+ //
+ // Sequences with one continuation byte missing
+ //
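+  // In this lenient/partial mode (the trailing `true` argument), a prefix
+  // of a well-formed sequence presumably signals sourceExhausted with no
+  // scalars emitted, so a streaming caller can retry once more input
+  // arrives rather than receive replacement characters; complete scalars
+  // before the truncated tail are still produced, as the final case shows.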
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xc2", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xdf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xe0\xa0", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xe0\xbf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xe1\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xec\xbf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xed\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xed\x9f", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xee\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xef\xbf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf0\x90\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf0\xbf\xbf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf1\x80\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf3\xbf\xbf", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf4\x80\x80", true));
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted),
+ "\xf4\x8f\xbf", true));
+
+ EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
+ ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
+ "\x41\xc2", true));
+}
+
diff --git a/unittests/Support/DataExtractorTest.cpp b/unittests/Support/DataExtractorTest.cpp
index ec8bd3d..81de983 100644
--- a/unittests/Support/DataExtractorTest.cpp
+++ b/unittests/Support/DataExtractorTest.cpp
@@ -94,7 +94,7 @@ TEST(DataExtractorTest, Strings) {
EXPECT_EQ(stringData, DE.getCStr(&offset));
EXPECT_EQ(11U, offset);
- EXPECT_EQ(NULL, DE.getCStr(&offset));
+ EXPECT_EQ(nullptr, DE.getCStr(&offset));
EXPECT_EQ(11U, offset);
}
diff --git a/unittests/Support/ErrorOrTest.cpp b/unittests/Support/ErrorOrTest.cpp
index 18ce507..d76e7d6 100644
--- a/unittests/Support/ErrorOrTest.cpp
+++ b/unittests/Support/ErrorOrTest.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Errc.h"
#include "gtest/gtest.h"
#include <memory>
@@ -30,7 +31,7 @@ TEST(ErrorOr, SimpleValue) {
a = t2();
EXPECT_FALSE(a);
- EXPECT_EQ(errc::invalid_argument, a.getError());
+ EXPECT_EQ(a.getError(), errc::invalid_argument);
#ifdef EXPECT_DEBUG_DEATH
EXPECT_DEBUG_DEATH(*a, "Cannot get value when an error exists");
#endif
@@ -54,10 +55,10 @@ struct B {};
struct D : B {};
TEST(ErrorOr, Covariant) {
- ErrorOr<B*> b(ErrorOr<D*>(0));
- b = ErrorOr<D*>(0);
+ ErrorOr<B*> b(ErrorOr<D*>(nullptr));
+ b = ErrorOr<D*>(nullptr);
- ErrorOr<std::unique_ptr<B> > b1(ErrorOr<std::unique_ptr<D> >(0));
- b1 = ErrorOr<std::unique_ptr<D> >(0);
+ ErrorOr<std::unique_ptr<B> > b1(ErrorOr<std::unique_ptr<D> >(nullptr));
+ b1 = ErrorOr<std::unique_ptr<D> >(nullptr);
}
} // end anon namespace
diff --git a/unittests/Support/FileOutputBufferTest.cpp b/unittests/Support/FileOutputBufferTest.cpp
index 0801f85..b086f1e 100644
--- a/unittests/Support/FileOutputBufferTest.cpp
+++ b/unittests/Support/FileOutputBufferTest.cpp
@@ -17,12 +17,13 @@
using namespace llvm;
using namespace llvm::sys;
-#define ASSERT_NO_ERROR(x) \
- if (error_code ASSERT_NO_ERROR_ec = x) { \
- errs() << #x ": did not return errc::success.\n" \
- << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
- << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
- } else {}
+#define ASSERT_NO_ERROR(x) \
+ if (std::error_code ASSERT_NO_ERROR_ec = x) { \
+ errs() << #x ": did not return errc::success.\n" \
+ << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
+ << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
+ } else { \
+ }
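+// Note: the trailing `else { }` turns the macro into a complete if/else
+// statement, so `ASSERT_NO_ERROR(x);` parses as a single statement and a
+// following `else` in the caller cannot bind to the macro's `if`.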
namespace {
TEST(FileOutputBuffer, Test) {
@@ -46,11 +47,7 @@ TEST(FileOutputBuffer, Test) {
// Commit buffer.
ASSERT_NO_ERROR(Buffer->commit());
}
- // Verify file exists and starts with special header.
- bool MagicMatches = false;
- ASSERT_NO_ERROR(fs::has_magic(Twine(File1), Twine("AABBCCDDEEFFGGHHIIJJ"),
- MagicMatches));
- EXPECT_TRUE(MagicMatches);
+
// Verify file is correct size.
uint64_t File1Size;
ASSERT_NO_ERROR(fs::file_size(Twine(File1), File1Size));
@@ -86,11 +83,7 @@ TEST(FileOutputBuffer, Test) {
// Commit buffer, but size down to smaller size
ASSERT_NO_ERROR(Buffer->commit(5000));
}
- // Verify file exists and starts with special header.
- bool MagicMatches3 = false;
- ASSERT_NO_ERROR(fs::has_magic(Twine(File3), Twine("AABBCCDDEEFFGGHHIIJJ"),
- MagicMatches3));
- EXPECT_TRUE(MagicMatches3);
+
// Verify file is correct size.
uint64_t File3Size;
ASSERT_NO_ERROR(fs::file_size(Twine(File3), File3Size));
diff --git a/unittests/Support/LockFileManagerTest.cpp b/unittests/Support/LockFileManagerTest.cpp
index 93fa10b..885b7d6 100644
--- a/unittests/Support/LockFileManagerTest.cpp
+++ b/unittests/Support/LockFileManagerTest.cpp
@@ -19,7 +19,7 @@ namespace {
TEST(LockFileManagerTest, Basic) {
SmallString<64> TmpDir;
- error_code EC;
+ std::error_code EC;
EC = sys::fs::createUniqueDirectory("LockFileManagerTestDir", TmpDir);
ASSERT_FALSE(EC);
@@ -46,7 +46,7 @@ TEST(LockFileManagerTest, Basic) {
TEST(LockFileManagerTest, LinkLockExists) {
SmallString<64> TmpDir;
- error_code EC;
+ std::error_code EC;
EC = sys::fs::createUniqueDirectory("LockFileManagerTestDir", TmpDir);
ASSERT_FALSE(EC);
@@ -89,7 +89,7 @@ TEST(LockFileManagerTest, LinkLockExists) {
TEST(LockFileManagerTest, RelativePath) {
SmallString<64> TmpDir;
- error_code EC;
+ std::error_code EC;
EC = sys::fs::createUniqueDirectory("LockFileManagerTestDir", TmpDir);
ASSERT_FALSE(EC);
diff --git a/unittests/Support/ManagedStatic.cpp b/unittests/Support/ManagedStatic.cpp
index 1497f4e..153884b 100644
--- a/unittests/Support/ManagedStatic.cpp
+++ b/unittests/Support/ManagedStatic.cpp
@@ -25,7 +25,7 @@ namespace test1 {
llvm::ManagedStatic<int> ms;
void *helper(void*) {
*ms;
- return NULL;
+ return nullptr;
}
 // Valgrind's leak checker complains about glibc's stack allocation.
@@ -47,15 +47,13 @@ TEST(Initialize, MultipleThreads) {
void *p1 = test1::allocate_stack(a1);
void *p2 = test1::allocate_stack(a2);
- llvm_start_multithreaded();
pthread_t t1, t2;
- pthread_create(&t1, &a1, test1::helper, NULL);
- pthread_create(&t2, &a2, test1::helper, NULL);
- pthread_join(t1, NULL);
- pthread_join(t2, NULL);
+ pthread_create(&t1, &a1, test1::helper, nullptr);
+ pthread_create(&t2, &a2, test1::helper, nullptr);
+ pthread_join(t1, nullptr);
+ pthread_join(t2, nullptr);
free(p1);
free(p2);
- llvm_stop_multithreaded();
}
#endif
diff --git a/unittests/Support/MemoryBufferTest.cpp b/unittests/Support/MemoryBufferTest.cpp
index 6790d0c..93bf301 100644
--- a/unittests/Support/MemoryBufferTest.cpp
+++ b/unittests/Support/MemoryBufferTest.cpp
@@ -43,15 +43,15 @@ protected:
TEST_F(MemoryBufferTest, get) {
// Default name and null-terminator flag
OwningBuffer MB1(MemoryBuffer::getMemBuffer(data));
- EXPECT_TRUE(0 != MB1.get());
+ EXPECT_TRUE(nullptr != MB1.get());
// RequiresNullTerminator = false
OwningBuffer MB2(MemoryBuffer::getMemBuffer(data, "one", false));
- EXPECT_TRUE(0 != MB2.get());
+ EXPECT_TRUE(nullptr != MB2.get());
// RequiresNullTerminator = true
OwningBuffer MB3(MemoryBuffer::getMemBuffer(data, "two", true));
- EXPECT_TRUE(0 != MB3.get());
+ EXPECT_TRUE(nullptr != MB3.get());
// verify all 3 buffers point to the same address
EXPECT_EQ(MB1->getBufferStart(), MB2->getBufferStart());
@@ -77,11 +77,11 @@ TEST_F(MemoryBufferTest, NullTerminator4K) {
}
OF.close();
- OwningBuffer MB;
- error_code EC = MemoryBuffer::getFile(TestPath.c_str(), MB);
+ ErrorOr<OwningBuffer> MB = MemoryBuffer::getFile(TestPath.c_str());
+ std::error_code EC = MB.getError();
ASSERT_FALSE(EC);
- const char *BufData = MB->getBufferStart();
+ const char *BufData = MB.get()->getBufferStart();
EXPECT_EQ('f', BufData[4095]);
EXPECT_EQ('\0', BufData[4096]);
}
@@ -89,11 +89,11 @@ TEST_F(MemoryBufferTest, NullTerminator4K) {
TEST_F(MemoryBufferTest, copy) {
// copy with no name
OwningBuffer MBC1(MemoryBuffer::getMemBufferCopy(data));
- EXPECT_TRUE(0 != MBC1.get());
+ EXPECT_TRUE(nullptr != MBC1.get());
// copy with a name
OwningBuffer MBC2(MemoryBuffer::getMemBufferCopy(data, "copy"));
- EXPECT_TRUE(0 != MBC2.get());
+ EXPECT_TRUE(nullptr != MBC2.get());
// verify the two copies do not point to the same place
EXPECT_NE(MBC1->getBufferStart(), MBC2->getBufferStart());
@@ -102,25 +102,25 @@ TEST_F(MemoryBufferTest, copy) {
TEST_F(MemoryBufferTest, make_new) {
// 0-sized buffer
OwningBuffer Zero(MemoryBuffer::getNewUninitMemBuffer(0));
- EXPECT_TRUE(0 != Zero.get());
+ EXPECT_TRUE(nullptr != Zero.get());
// uninitialized buffer with no name
OwningBuffer One(MemoryBuffer::getNewUninitMemBuffer(321));
- EXPECT_TRUE(0 != One.get());
+ EXPECT_TRUE(nullptr != One.get());
// uninitialized buffer with name
OwningBuffer Two(MemoryBuffer::getNewUninitMemBuffer(123, "bla"));
- EXPECT_TRUE(0 != Two.get());
+ EXPECT_TRUE(nullptr != Two.get());
// 0-initialized buffer with no name
OwningBuffer Three(MemoryBuffer::getNewMemBuffer(321, data));
- EXPECT_TRUE(0 != Three.get());
+ EXPECT_TRUE(nullptr != Three.get());
for (size_t i = 0; i < 321; ++i)
     EXPECT_EQ(0, Three->getBufferStart()[i]);
// 0-initialized buffer with name
OwningBuffer Four(MemoryBuffer::getNewMemBuffer(123, "zeros"));
- EXPECT_TRUE(0 != Four.get());
+ EXPECT_TRUE(nullptr != Four.get());
for (size_t i = 0; i < 123; ++i)
     EXPECT_EQ(0, Four->getBufferStart()[i]);
}
@@ -146,14 +146,16 @@ void MemoryBufferTest::testGetOpenFileSlice(bool Reopen) {
EXPECT_FALSE(sys::fs::openFileForRead(TestPath.c_str(), TestFD));
}
- OwningBuffer Buf;
- error_code EC = MemoryBuffer::getOpenFileSlice(TestFD, TestPath.c_str(), Buf,
- 40000, // Size
- 80000 // Offset
- );
+ ErrorOr<OwningBuffer> Buf =
+ MemoryBuffer::getOpenFileSlice(TestFD, TestPath.c_str(),
+ 40000, // Size
+ 80000 // Offset
+ );
+
+ std::error_code EC = Buf.getError();
EXPECT_FALSE(EC);
- StringRef BufData = Buf->getBuffer();
+ StringRef BufData = Buf.get()->getBuffer();
EXPECT_EQ(BufData.size(), 40000U);
EXPECT_EQ(BufData[0], '0');
EXPECT_EQ(BufData[9], '9');
diff --git a/unittests/Support/MemoryTest.cpp b/unittests/Support/MemoryTest.cpp
index fae67a8..8ad90e0 100644
--- a/unittests/Support/MemoryTest.cpp
+++ b/unittests/Support/MemoryTest.cpp
@@ -57,30 +57,30 @@ protected:
};
TEST_P(MappedMemoryTest, AllocAndRelease) {
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(sizeof(int), M1.size());
EXPECT_FALSE(Memory::releaseMappedMemory(M1));
}
TEST_P(MappedMemoryTest, MultipleAllocAndRelease) {
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M2 = Memory::allocateMappedMemory(64, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M3 = Memory::allocateMappedMemory(32, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
-
- EXPECT_NE((void*)0, M1.base());
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(16U, M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(64U, M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(32U, M3.size());
EXPECT_FALSE(doesOverlap(M1, M2));
@@ -89,9 +89,9 @@ TEST_P(MappedMemoryTest, MultipleAllocAndRelease) {
EXPECT_FALSE(Memory::releaseMappedMemory(M1));
EXPECT_FALSE(Memory::releaseMappedMemory(M3));
- MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- EXPECT_NE((void*)0, M4.base());
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+ EXPECT_NE((void*)nullptr, M4.base());
EXPECT_LE(16U, M4.size());
EXPECT_FALSE(Memory::releaseMappedMemory(M4));
EXPECT_FALSE(Memory::releaseMappedMemory(M2));
@@ -103,11 +103,11 @@ TEST_P(MappedMemoryTest, BasicWrite) {
!((Flags & Memory::MF_READ) && (Flags & Memory::MF_WRITE)))
return;
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(sizeof(int), M1.size());
int *a = (int*)M1.base();
@@ -122,23 +122,26 @@ TEST_P(MappedMemoryTest, MultipleWrite) {
if (Flags &&
!((Flags & Memory::MF_READ) && (Flags & Memory::MF_WRITE)))
return;
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
EXPECT_FALSE(doesOverlap(M1, M2));
EXPECT_FALSE(doesOverlap(M2, M3));
EXPECT_FALSE(doesOverlap(M1, M3));
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(1U * sizeof(int), M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(8U * sizeof(int), M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(4U * sizeof(int), M3.size());
int *x = (int*)M1.base();
@@ -159,9 +162,10 @@ TEST_P(MappedMemoryTest, MultipleWrite) {
EXPECT_FALSE(Memory::releaseMappedMemory(M1));
EXPECT_FALSE(Memory::releaseMappedMemory(M3));
- MemoryBlock M4 = Memory::allocateMappedMemory(64 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- EXPECT_NE((void*)0, M4.base());
+ MemoryBlock M4 = Memory::allocateMappedMemory(64 * sizeof(int), nullptr,
+ Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+ EXPECT_NE((void*)nullptr, M4.base());
EXPECT_LE(64U * sizeof(int), M4.size());
x = (int*)M4.base();
*x = 4;
@@ -176,19 +180,22 @@ TEST_P(MappedMemoryTest, MultipleWrite) {
}
TEST_P(MappedMemoryTest, EnabledWrite) {
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(2 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
-
- EXPECT_NE((void*)0, M1.base());
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(2 * sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), nullptr, Flags,
+ EC);
+ EXPECT_EQ(std::error_code(), EC);
+
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(2U * sizeof(int), M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(8U * sizeof(int), M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(4U * sizeof(int), M3.size());
EXPECT_FALSE(Memory::protectMappedMemory(M1, getTestableEquivalent(Flags)));
@@ -216,11 +223,12 @@ TEST_P(MappedMemoryTest, EnabledWrite) {
EXPECT_FALSE(Memory::releaseMappedMemory(M3));
EXPECT_EQ(6, y[6]);
- MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
- EXPECT_NE((void*)0, M4.base());
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
+ EXPECT_NE((void*)nullptr, M4.base());
EXPECT_LE(16U, M4.size());
- EXPECT_EQ(error_code::success(), Memory::protectMappedMemory(M4, getTestableEquivalent(Flags)));
+ EXPECT_EQ(std::error_code(),
+ Memory::protectMappedMemory(M4, getTestableEquivalent(Flags)));
x = (int*)M4.base();
*x = 4;
EXPECT_EQ(4, *x);
@@ -229,19 +237,19 @@ TEST_P(MappedMemoryTest, EnabledWrite) {
}
TEST_P(MappedMemoryTest, SuccessiveNear) {
- error_code EC;
- MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ std::error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, nullptr, Flags, EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M2 = Memory::allocateMappedMemory(64, &M1, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M3 = Memory::allocateMappedMemory(32, &M2, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(16U, M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(64U, M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(32U, M3.size());
EXPECT_FALSE(doesOverlap(M1, M2));
@@ -254,20 +262,20 @@ TEST_P(MappedMemoryTest, SuccessiveNear) {
}
TEST_P(MappedMemoryTest, DuplicateNear) {
- error_code EC;
+ std::error_code EC;
MemoryBlock Near((void*)(3*PageSize), 16);
MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(16U, M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(64U, M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(32U, M3.size());
EXPECT_FALSE(Memory::releaseMappedMemory(M1));
@@ -276,20 +284,20 @@ TEST_P(MappedMemoryTest, DuplicateNear) {
}
TEST_P(MappedMemoryTest, ZeroNear) {
- error_code EC;
- MemoryBlock Near(0, 0);
+ std::error_code EC;
+ MemoryBlock Near(nullptr, 0);
MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(16U, M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(64U, M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(32U, M3.size());
EXPECT_FALSE(doesOverlap(M1, M2));
@@ -302,20 +310,20 @@ TEST_P(MappedMemoryTest, ZeroNear) {
}
TEST_P(MappedMemoryTest, ZeroSizeNear) {
- error_code EC;
+ std::error_code EC;
MemoryBlock Near((void*)(4*PageSize), 0);
MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(16U, M1.size());
- EXPECT_NE((void*)0, M2.base());
+ EXPECT_NE((void*)nullptr, M2.base());
EXPECT_LE(64U, M2.size());
- EXPECT_NE((void*)0, M3.base());
+ EXPECT_NE((void*)nullptr, M3.base());
EXPECT_LE(32U, M3.size());
EXPECT_FALSE(doesOverlap(M1, M2));
@@ -328,12 +336,12 @@ TEST_P(MappedMemoryTest, ZeroSizeNear) {
}
TEST_P(MappedMemoryTest, UnalignedNear) {
- error_code EC;
+ std::error_code EC;
MemoryBlock Near((void*)(2*PageSize+5), 0);
MemoryBlock M1 = Memory::allocateMappedMemory(15, &Near, Flags, EC);
- EXPECT_EQ(error_code::success(), EC);
+ EXPECT_EQ(std::error_code(), EC);
- EXPECT_NE((void*)0, M1.base());
+ EXPECT_NE((void*)nullptr, M1.base());
EXPECT_LE(sizeof(int), M1.size());
EXPECT_FALSE(Memory::releaseMappedMemory(M1));
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index b79d055..cf2e1ee 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -8,24 +8,30 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Path.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include "gtest/gtest.h"
+#ifdef LLVM_ON_WIN32
+#include <winerror.h>
+#endif
+
using namespace llvm;
using namespace llvm::sys;
-#define ASSERT_NO_ERROR(x) \
- if (error_code ASSERT_NO_ERROR_ec = x) { \
- SmallString<128> MessageStorage; \
- raw_svector_ostream Message(MessageStorage); \
- Message << #x ": did not return errc::success.\n" \
- << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
- << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
- GTEST_FATAL_FAILURE_(MessageStorage.c_str()); \
- } else {}
+#define ASSERT_NO_ERROR(x) \
+ if (std::error_code ASSERT_NO_ERROR_ec = x) { \
+ SmallString<128> MessageStorage; \
+ raw_svector_ostream Message(MessageStorage); \
+ Message << #x ": did not return errc::success.\n" \
+ << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n" \
+ << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n"; \
+ GTEST_FATAL_FAILURE_(MessageStorage.c_str()); \
+ } else { \
+ }
namespace {
@@ -352,7 +358,7 @@ TEST_F(FileSystemTest, TempFiles) {
ASSERT_EQ(fs::remove(Twine(TempPath2), false),
errc::no_such_file_or_directory);
- error_code EC = fs::status(TempPath2.c_str(), B);
+ std::error_code EC = fs::status(TempPath2.c_str(), B);
EXPECT_EQ(EC, errc::no_such_file_or_directory);
EXPECT_EQ(B.type(), fs::file_type::file_not_found);
@@ -393,7 +399,7 @@ TEST_F(FileSystemTest, TempFiles) {
"abcdefghijklmnopqrstuvwxyz3abcdefghijklmnopqrstuvwxyz2"
"abcdefghijklmnopqrstuvwxyz1abcdefghijklmnopqrstuvwxyz0";
EXPECT_EQ(fs::createUniqueFile(Twine(Path270), FileDescriptor, TempPath),
- windows_error::path_not_found);
+ errc::no_such_file_or_directory);
#endif
}
@@ -406,7 +412,7 @@ TEST_F(FileSystemTest, CreateDir) {
}
TEST_F(FileSystemTest, DirectoryIteration) {
- error_code ec;
+ std::error_code ec;
for (fs::directory_iterator i(".", ec), e; i != e; i.increment(ec))
ASSERT_NO_ERROR(ec);
@@ -535,9 +541,6 @@ TEST_F(FileSystemTest, Magic) {
StringRef magic(i->magic_str, i->magic_str_len);
file << magic;
file.close();
- bool res = false;
- ASSERT_NO_ERROR(fs::has_magic(file_pathname.c_str(), magic, res));
- EXPECT_TRUE(res);
EXPECT_EQ(i->magic, fs::identify_magic(magic));
ASSERT_NO_ERROR(fs::remove(Twine(file_pathname)));
}
@@ -555,9 +558,9 @@ TEST_F(FileSystemTest, CarriageReturn) {
File << '\n';
}
{
- std::unique_ptr<MemoryBuffer> Buf;
- MemoryBuffer::getFile(FilePathname.c_str(), Buf);
- EXPECT_EQ(Buf->getBuffer(), "\r\n");
+ auto Buf = MemoryBuffer::getFile(FilePathname.c_str());
+ EXPECT_TRUE((bool)Buf);
+ EXPECT_EQ(Buf.get()->getBuffer(), "\r\n");
}
{
@@ -566,9 +569,9 @@ TEST_F(FileSystemTest, CarriageReturn) {
File << '\n';
}
{
- std::unique_ptr<MemoryBuffer> Buf;
- MemoryBuffer::getFile(FilePathname.c_str(), Buf);
- EXPECT_EQ(Buf->getBuffer(), "\n");
+ auto Buf = MemoryBuffer::getFile(FilePathname.c_str());
+ EXPECT_TRUE((bool)Buf);
+ EXPECT_EQ(Buf.get()->getBuffer(), "\n");
}
ASSERT_NO_ERROR(fs::remove(Twine(FilePathname)));
}
@@ -581,7 +584,7 @@ TEST_F(FileSystemTest, FileMapping) {
ASSERT_NO_ERROR(
fs::createTemporaryFile("prefix", "temp", FileDescriptor, TempPath));
// Map in temp file and add some content
- error_code EC;
+ std::error_code EC;
StringRef Val("hello there");
{
fs::mapped_file_region mfr(FileDescriptor,
diff --git a/unittests/Support/ProgramTest.cpp b/unittests/Support/ProgramTest.cpp
index 800df14..4e7316f 100644
--- a/unittests/Support/ProgramTest.cpp
+++ b/unittests/Support/ProgramTest.cpp
@@ -54,7 +54,7 @@ static void CopyEnvironment(std::vector<const char *> &out) {
   // environ seems to work for Windows and most Unices.
char **envp = environ;
#endif
- while (*envp != 0) {
+ while (*envp != nullptr) {
out.push_back(*envp);
++envp;
}
@@ -76,14 +76,14 @@ TEST(ProgramTest, CreateProcessTrailingSlash) {
"--gtest_filter=ProgramTest.CreateProcessTrailingSlash",
"-program-test-string-arg1", "has\\\\ trailing\\",
"-program-test-string-arg2", "has\\\\ trailing\\",
- 0
+ nullptr
};
// Add LLVM_PROGRAM_TEST_CHILD to the environment of the child.
std::vector<const char *> envp;
CopyEnvironment(envp);
envp.push_back("LLVM_PROGRAM_TEST_CHILD=1");
- envp.push_back(0);
+ envp.push_back(nullptr);
std::string error;
bool ExecutionFailed;
@@ -93,7 +93,7 @@ TEST(ProgramTest, CreateProcessTrailingSlash) {
#else
StringRef nul("/dev/null");
#endif
- const StringRef *redirects[] = { &nul, &nul, 0 };
+ const StringRef *redirects[] = { &nul, &nul, nullptr };
int rc = ExecuteAndWait(my_exe, argv, &envp[0], redirects,
/*secondsToWait=*/ 10, /*memoryLimit=*/ 0, &error,
&ExecutionFailed);
@@ -114,19 +114,19 @@ TEST(ProgramTest, TestExecuteNoWait) {
const char *argv[] = {
Executable.c_str(),
"--gtest_filter=ProgramTest.TestExecuteNoWait",
- 0
+ nullptr
};
// Add LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT to the environment of the child.
std::vector<const char *> envp;
CopyEnvironment(envp);
envp.push_back("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT=1");
- envp.push_back(0);
+ envp.push_back(nullptr);
std::string Error;
bool ExecutionFailed;
- ProcessInfo PI1 =
- ExecuteNoWait(Executable, argv, &envp[0], 0, 0, &Error, &ExecutionFailed);
+ ProcessInfo PI1 = ExecuteNoWait(Executable, argv, &envp[0], nullptr, 0,
+ &Error, &ExecutionFailed);
ASSERT_FALSE(ExecutionFailed) << Error;
ASSERT_NE(PI1.Pid, 0) << "Invalid process id";
@@ -144,8 +144,8 @@ TEST(ProgramTest, TestExecuteNoWait) {
EXPECT_EQ(LoopCount, 1u) << "LoopCount should be 1";
- ProcessInfo PI2 =
- ExecuteNoWait(Executable, argv, &envp[0], 0, 0, &Error, &ExecutionFailed);
+ ProcessInfo PI2 = ExecuteNoWait(Executable, argv, &envp[0], nullptr, 0,
+ &Error, &ExecutionFailed);
ASSERT_FALSE(ExecutionFailed) << Error;
ASSERT_NE(PI2.Pid, 0) << "Invalid process id";
@@ -162,15 +162,45 @@ TEST(ProgramTest, TestExecuteNoWait) {
ASSERT_GT(LoopCount, 1u) << "LoopCount should be >1";
}
+TEST(ProgramTest, TestExecuteAndWaitTimeout) {
+ using namespace llvm::sys;
+
+ if (getenv("LLVM_PROGRAM_TEST_TIMEOUT")) {
+ sleep_for(/*seconds*/ 10);
+ exit(0);
+ }
+
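+  // When run as the parent, the code below re-executes this same binary
+  // with LLVM_PROGRAM_TEST_TIMEOUT set, so the child takes the branch
+  // above and sleeps well past the one-second limit handed to
+  // ExecuteAndWait, which is then expected to kill it and report the
+  // timeout as a return code of -2.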
+ std::string Executable =
+ sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
+ const char *argv[] = {
+ Executable.c_str(),
+ "--gtest_filter=ProgramTest.TestExecuteAndWaitTimeout",
+ nullptr
+ };
+
+ // Add LLVM_PROGRAM_TEST_TIMEOUT to the environment of the child.
+ std::vector<const char *> envp;
+ CopyEnvironment(envp);
+ envp.push_back("LLVM_PROGRAM_TEST_TIMEOUT=1");
+ envp.push_back(nullptr);
+
+ std::string Error;
+ bool ExecutionFailed;
+ int RetCode =
+ ExecuteAndWait(Executable, argv, &envp[0], nullptr, /*secondsToWait=*/1, 0,
+ &Error, &ExecutionFailed);
+ ASSERT_EQ(-2, RetCode);
+}
+
TEST(ProgramTest, TestExecuteNegative) {
std::string Executable = "i_dont_exist";
- const char *argv[] = { Executable.c_str(), 0 };
+ const char *argv[] = { Executable.c_str(), nullptr };
{
std::string Error;
bool ExecutionFailed;
- int RetCode =
- ExecuteAndWait(Executable, argv, 0, 0, 0, 0, &Error, &ExecutionFailed);
+ int RetCode = ExecuteAndWait(Executable, argv, nullptr, nullptr, 0, 0,
+ &Error, &ExecutionFailed);
     ASSERT_TRUE(RetCode < 0) << "On error ExecuteAndWait should return a "
                                 "negative value";
ASSERT_TRUE(ExecutionFailed);
@@ -180,8 +210,8 @@ TEST(ProgramTest, TestExecuteNegative) {
{
std::string Error;
bool ExecutionFailed;
- ProcessInfo PI =
- ExecuteNoWait(Executable, argv, 0, 0, 0, &Error, &ExecutionFailed);
+ ProcessInfo PI = ExecuteNoWait(Executable, argv, nullptr, nullptr, 0,
+ &Error, &ExecutionFailed);
ASSERT_EQ(PI.Pid, 0)
<< "On error ExecuteNoWait should return an invalid ProcessInfo";
ASSERT_TRUE(ExecutionFailed);
diff --git a/unittests/Support/ScaledNumberTest.cpp b/unittests/Support/ScaledNumberTest.cpp
new file mode 100644
index 0000000..7bbef7e
--- /dev/null
+++ b/unittests/Support/ScaledNumberTest.cpp
@@ -0,0 +1,536 @@
+//===- llvm/unittest/Support/ScaledNumberTest.cpp - ScaledPair tests ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ScaledNumber.h"
+
+#include "llvm/Support/DataTypes.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::ScaledNumbers;
+
+namespace {
+
+template <class UIntT> struct ScaledPair {
+ UIntT D;
+ int S;
+ ScaledPair(const std::pair<UIntT, int16_t> &F) : D(F.first), S(F.second) {}
+ ScaledPair(UIntT D, int S) : D(D), S(S) {}
+
+ bool operator==(const ScaledPair<UIntT> &X) const {
+ return D == X.D && S == X.S;
+ }
+};
+template <class UIntT>
+bool operator==(const std::pair<UIntT, int16_t> &L,
+ const ScaledPair<UIntT> &R) {
+ return ScaledPair<UIntT>(L) == R;
+}
+template <class UIntT>
+void PrintTo(const ScaledPair<UIntT> &F, ::std::ostream *os) {
+ *os << F.D << "*2^" << F.S;
+}
+
+typedef ScaledPair<uint32_t> SP32;
+typedef ScaledPair<uint64_t> SP64;
+
+TEST(ScaledNumberHelpersTest, getRounded) {
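+  // getRounded(D, S, Round) presumably returns (D, S) untouched when Round
+  // is false and (D + 1, S) when it is true, renormalizing on digit
+  // overflow: rounding UINT32_MAX up gives 2^32, stored as 1 << 31 with
+  // the scale bumped to 1.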
+ EXPECT_EQ(getRounded32(0, 0, false), SP32(0, 0));
+ EXPECT_EQ(getRounded32(0, 0, true), SP32(1, 0));
+ EXPECT_EQ(getRounded32(20, 21, true), SP32(21, 21));
+ EXPECT_EQ(getRounded32(UINT32_MAX, 0, false), SP32(UINT32_MAX, 0));
+  EXPECT_EQ(getRounded32(UINT32_MAX, 0, true), SP32(1u << 31, 1));
+
+ EXPECT_EQ(getRounded64(0, 0, false), SP64(0, 0));
+ EXPECT_EQ(getRounded64(0, 0, true), SP64(1, 0));
+ EXPECT_EQ(getRounded64(20, 21, true), SP64(21, 21));
+ EXPECT_EQ(getRounded64(UINT32_MAX, 0, false), SP64(UINT32_MAX, 0));
+ EXPECT_EQ(getRounded64(UINT32_MAX, 0, true), SP64(UINT64_C(1) << 32, 0));
+ EXPECT_EQ(getRounded64(UINT64_MAX, 0, false), SP64(UINT64_MAX, 0));
+ EXPECT_EQ(getRounded64(UINT64_MAX, 0, true), SP64(UINT64_C(1) << 63, 1));
+}
+
+TEST(ScaledNumberHelpersTest, getAdjusted) {
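+  // getAdjusted narrows a 64-bit digit value into the target width,
+  // apparently shifting right and adding the shift amount to the scale
+  // (rounding as it goes): getAdjusted32(UINT64_MAX) rounds up to 2^64,
+  // i.e. (1 << 31) * 2^33, while getAdjusted64 can hold any 64-bit input
+  // as-is.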
+ const uint64_t Max32In64 = UINT32_MAX;
+ EXPECT_EQ(getAdjusted32(0), SP32(0, 0));
+ EXPECT_EQ(getAdjusted32(0, 5), SP32(0, 5));
+ EXPECT_EQ(getAdjusted32(UINT32_MAX), SP32(UINT32_MAX, 0));
+ EXPECT_EQ(getAdjusted32(Max32In64 << 1), SP32(UINT32_MAX, 1));
+ EXPECT_EQ(getAdjusted32(Max32In64 << 1, 1), SP32(UINT32_MAX, 2));
+ EXPECT_EQ(getAdjusted32(Max32In64 << 31), SP32(UINT32_MAX, 31));
+ EXPECT_EQ(getAdjusted32(Max32In64 << 32), SP32(UINT32_MAX, 32));
+ EXPECT_EQ(getAdjusted32(Max32In64 + 1), SP32(1u << 31, 1));
+ EXPECT_EQ(getAdjusted32(UINT64_MAX), SP32(1u << 31, 33));
+
+ EXPECT_EQ(getAdjusted64(0), SP64(0, 0));
+ EXPECT_EQ(getAdjusted64(0, 5), SP64(0, 5));
+ EXPECT_EQ(getAdjusted64(UINT32_MAX), SP64(UINT32_MAX, 0));
+ EXPECT_EQ(getAdjusted64(Max32In64 << 1), SP64(Max32In64 << 1, 0));
+ EXPECT_EQ(getAdjusted64(Max32In64 << 1, 1), SP64(Max32In64 << 1, 1));
+ EXPECT_EQ(getAdjusted64(Max32In64 << 31), SP64(Max32In64 << 31, 0));
+ EXPECT_EQ(getAdjusted64(Max32In64 << 32), SP64(Max32In64 << 32, 0));
+ EXPECT_EQ(getAdjusted64(Max32In64 + 1), SP64(Max32In64 + 1, 0));
+ EXPECT_EQ(getAdjusted64(UINT64_MAX), SP64(UINT64_MAX, 0));
+}
+
+TEST(ScaledNumberHelpersTest, getProduct) {
+ // Zero.
+ EXPECT_EQ(SP32(0, 0), getProduct32(0, 0));
+ EXPECT_EQ(SP32(0, 0), getProduct32(0, 1));
+ EXPECT_EQ(SP32(0, 0), getProduct32(0, 33));
+
+ // Basic.
+ EXPECT_EQ(SP32(6, 0), getProduct32(2, 3));
+ EXPECT_EQ(SP32(UINT16_MAX / 3 * UINT16_MAX / 5 * 2, 0),
+ getProduct32(UINT16_MAX / 3, UINT16_MAX / 5 * 2));
+
+ // Overflow, no loss of precision.
+ // ==> 0xf00010 * 0x1001
+ // ==> 0xf00f00000 + 0x10010
+ // ==> 0xf00f10010
+ // ==> 0xf00f1001 * 2^4
+ EXPECT_EQ(SP32(0xf00f1001, 4), getProduct32(0xf00010, 0x1001));
+
+ // Overflow, loss of precision, rounds down.
+ // ==> 0xf000070 * 0x1001
+ // ==> 0xf00f000000 + 0x70070
+ // ==> 0xf00f070070
+ // ==> 0xf00f0700 * 2^8
+ EXPECT_EQ(SP32(0xf00f0700, 8), getProduct32(0xf000070, 0x1001));
+
+ // Overflow, loss of precision, rounds up.
+ // ==> 0xf000080 * 0x1001
+ // ==> 0xf00f000000 + 0x80080
+ // ==> 0xf00f080080
+ // ==> 0xf00f0801 * 2^8
+ EXPECT_EQ(SP32(0xf00f0801, 8), getProduct32(0xf000080, 0x1001));
+
+ // Reverse operand order.
+ EXPECT_EQ(SP32(0, 0), getProduct32(1, 0));
+ EXPECT_EQ(SP32(0, 0), getProduct32(33, 0));
+ EXPECT_EQ(SP32(6, 0), getProduct32(3, 2));
+ EXPECT_EQ(SP32(UINT16_MAX / 3 * UINT16_MAX / 5 * 2, 0),
+ getProduct32(UINT16_MAX / 5 * 2, UINT16_MAX / 3));
+ EXPECT_EQ(SP32(0xf00f1001, 4), getProduct32(0x1001, 0xf00010));
+ EXPECT_EQ(SP32(0xf00f0700, 8), getProduct32(0x1001, 0xf000070));
+ EXPECT_EQ(SP32(0xf00f0801, 8), getProduct32(0x1001, 0xf000080));
+
+ // Round to overflow.
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, 64),
+ getProduct64(UINT64_C(10376293541461622786),
+ UINT64_C(16397105843297379211)));
+
+ // Big number with rounding.
+ EXPECT_EQ(SP64(UINT64_C(9223372036854775810), 64),
+ getProduct64(UINT64_C(18446744073709551556),
+ UINT64_C(9223372036854775840)));
+}
+
+TEST(ScaledNumberHelpersTest, getQuotient) {
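+  // Division by zero appears to saturate to the largest representable
+  // number, (MAX, MaxScale), rather than trap. Note also that getQuotient32
+  // canonicalizes the result so the digit's high bit is set, hence the
+  // negative scales on the power-of-two cases below.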
+ // Zero.
+ EXPECT_EQ(SP32(0, 0), getQuotient32(0, 0));
+ EXPECT_EQ(SP32(0, 0), getQuotient32(0, 1));
+ EXPECT_EQ(SP32(0, 0), getQuotient32(0, 73));
+ EXPECT_EQ(SP32(UINT32_MAX, MaxScale), getQuotient32(1, 0));
+ EXPECT_EQ(SP32(UINT32_MAX, MaxScale), getQuotient32(6, 0));
+
+ // Powers of two.
+ EXPECT_EQ(SP32(1u << 31, -31), getQuotient32(1, 1));
+ EXPECT_EQ(SP32(1u << 31, -30), getQuotient32(2, 1));
+ EXPECT_EQ(SP32(1u << 31, -33), getQuotient32(4, 16));
+ EXPECT_EQ(SP32(7u << 29, -29), getQuotient32(7, 1));
+ EXPECT_EQ(SP32(7u << 29, -30), getQuotient32(7, 2));
+ EXPECT_EQ(SP32(7u << 29, -33), getQuotient32(7, 16));
+
+ // Divide evenly.
+ EXPECT_EQ(SP32(3u << 30, -30), getQuotient32(9, 3));
+ EXPECT_EQ(SP32(9u << 28, -28), getQuotient32(63, 7));
+
+ // Divide unevenly.
+ EXPECT_EQ(SP32(0xaaaaaaab, -33), getQuotient32(1, 3));
+ EXPECT_EQ(SP32(0xd5555555, -31), getQuotient32(5, 3));
+
+  // 64-bit division is hard to test, since divide64 doesn't canonicalize its
+ // output. However, this is the algorithm the implementation uses:
+ //
+ // - Shift divisor right.
+ // - If we have 1 (power of 2), return early -- not canonicalized.
+ // - Shift dividend left.
+ // - 64-bit integer divide.
+ // - If there's a remainder, continue with long division.
+ //
+ // TODO: require less knowledge about the implementation in the test.
+
+ // Zero.
+ EXPECT_EQ(SP64(0, 0), getQuotient64(0, 0));
+ EXPECT_EQ(SP64(0, 0), getQuotient64(0, 1));
+ EXPECT_EQ(SP64(0, 0), getQuotient64(0, 73));
+ EXPECT_EQ(SP64(UINT64_MAX, MaxScale), getQuotient64(1, 0));
+ EXPECT_EQ(SP64(UINT64_MAX, MaxScale), getQuotient64(6, 0));
+
+ // Powers of two.
+ EXPECT_EQ(SP64(1, 0), getQuotient64(1, 1));
+ EXPECT_EQ(SP64(2, 0), getQuotient64(2, 1));
+ EXPECT_EQ(SP64(4, -4), getQuotient64(4, 16));
+ EXPECT_EQ(SP64(7, 0), getQuotient64(7, 1));
+ EXPECT_EQ(SP64(7, -1), getQuotient64(7, 2));
+ EXPECT_EQ(SP64(7, -4), getQuotient64(7, 16));
+
+ // Divide evenly.
+ EXPECT_EQ(SP64(UINT64_C(3) << 60, -60), getQuotient64(9, 3));
+ EXPECT_EQ(SP64(UINT64_C(9) << 58, -58), getQuotient64(63, 7));
+
+ // Divide unevenly.
+ EXPECT_EQ(SP64(0xaaaaaaaaaaaaaaab, -65), getQuotient64(1, 3));
+ EXPECT_EQ(SP64(0xd555555555555555, -63), getQuotient64(5, 3));
+}
+
+TEST(ScaledNumberHelpersTest, getLg) {
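+  // getLg(D, S) is expected to return lg(D * 2^S) rounded to the nearest
+  // integer -- 7 maps to 3 (lg 7 ~ 2.81) just like 8 and 9 -- with
+  // INT32_MIN as the sentinel for the undefined lg 0. getLgFloor and
+  // getLgCeiling below round down and up instead.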
+ EXPECT_EQ(0, getLg(UINT32_C(1), 0));
+ EXPECT_EQ(1, getLg(UINT32_C(1), 1));
+ EXPECT_EQ(1, getLg(UINT32_C(2), 0));
+ EXPECT_EQ(3, getLg(UINT32_C(1), 3));
+ EXPECT_EQ(3, getLg(UINT32_C(7), 0));
+ EXPECT_EQ(3, getLg(UINT32_C(8), 0));
+ EXPECT_EQ(3, getLg(UINT32_C(9), 0));
+ EXPECT_EQ(3, getLg(UINT32_C(64), -3));
+ EXPECT_EQ(31, getLg((UINT32_MAX >> 1) + 2, 0));
+ EXPECT_EQ(32, getLg(UINT32_MAX, 0));
+ EXPECT_EQ(-1, getLg(UINT32_C(1), -1));
+ EXPECT_EQ(-1, getLg(UINT32_C(2), -2));
+ EXPECT_EQ(INT32_MIN, getLg(UINT32_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLg(UINT32_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLg(UINT32_C(0), 1));
+
+ EXPECT_EQ(0, getLg(UINT64_C(1), 0));
+ EXPECT_EQ(1, getLg(UINT64_C(1), 1));
+ EXPECT_EQ(1, getLg(UINT64_C(2), 0));
+ EXPECT_EQ(3, getLg(UINT64_C(1), 3));
+ EXPECT_EQ(3, getLg(UINT64_C(7), 0));
+ EXPECT_EQ(3, getLg(UINT64_C(8), 0));
+ EXPECT_EQ(3, getLg(UINT64_C(9), 0));
+ EXPECT_EQ(3, getLg(UINT64_C(64), -3));
+ EXPECT_EQ(63, getLg((UINT64_MAX >> 1) + 2, 0));
+ EXPECT_EQ(64, getLg(UINT64_MAX, 0));
+ EXPECT_EQ(-1, getLg(UINT64_C(1), -1));
+ EXPECT_EQ(-1, getLg(UINT64_C(2), -2));
+ EXPECT_EQ(INT32_MIN, getLg(UINT64_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLg(UINT64_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLg(UINT64_C(0), 1));
+}
+
+TEST(ScaledNumberHelpersTest, getLgFloor) {
+ EXPECT_EQ(0, getLgFloor(UINT32_C(1), 0));
+ EXPECT_EQ(1, getLgFloor(UINT32_C(1), 1));
+ EXPECT_EQ(1, getLgFloor(UINT32_C(2), 0));
+ EXPECT_EQ(2, getLgFloor(UINT32_C(7), 0));
+ EXPECT_EQ(3, getLgFloor(UINT32_C(1), 3));
+ EXPECT_EQ(3, getLgFloor(UINT32_C(8), 0));
+ EXPECT_EQ(3, getLgFloor(UINT32_C(9), 0));
+ EXPECT_EQ(3, getLgFloor(UINT32_C(64), -3));
+ EXPECT_EQ(31, getLgFloor((UINT32_MAX >> 1) + 2, 0));
+ EXPECT_EQ(31, getLgFloor(UINT32_MAX, 0));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT32_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT32_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT32_C(0), 1));
+
+ EXPECT_EQ(0, getLgFloor(UINT64_C(1), 0));
+ EXPECT_EQ(1, getLgFloor(UINT64_C(1), 1));
+ EXPECT_EQ(1, getLgFloor(UINT64_C(2), 0));
+ EXPECT_EQ(2, getLgFloor(UINT64_C(7), 0));
+ EXPECT_EQ(3, getLgFloor(UINT64_C(1), 3));
+ EXPECT_EQ(3, getLgFloor(UINT64_C(8), 0));
+ EXPECT_EQ(3, getLgFloor(UINT64_C(9), 0));
+ EXPECT_EQ(3, getLgFloor(UINT64_C(64), -3));
+ EXPECT_EQ(63, getLgFloor((UINT64_MAX >> 1) + 2, 0));
+ EXPECT_EQ(63, getLgFloor(UINT64_MAX, 0));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT64_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT64_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLgFloor(UINT64_C(0), 1));
+}
+
+TEST(ScaledNumberHelpersTest, getLgCeiling) {
+ EXPECT_EQ(0, getLgCeiling(UINT32_C(1), 0));
+ EXPECT_EQ(1, getLgCeiling(UINT32_C(1), 1));
+ EXPECT_EQ(1, getLgCeiling(UINT32_C(2), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT32_C(1), 3));
+ EXPECT_EQ(3, getLgCeiling(UINT32_C(7), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT32_C(8), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT32_C(64), -3));
+ EXPECT_EQ(4, getLgCeiling(UINT32_C(9), 0));
+ EXPECT_EQ(32, getLgCeiling(UINT32_MAX, 0));
+ EXPECT_EQ(32, getLgCeiling((UINT32_MAX >> 1) + 2, 0));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT32_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT32_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT32_C(0), 1));
+
+ EXPECT_EQ(0, getLgCeiling(UINT64_C(1), 0));
+ EXPECT_EQ(1, getLgCeiling(UINT64_C(1), 1));
+ EXPECT_EQ(1, getLgCeiling(UINT64_C(2), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT64_C(1), 3));
+ EXPECT_EQ(3, getLgCeiling(UINT64_C(7), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT64_C(8), 0));
+ EXPECT_EQ(3, getLgCeiling(UINT64_C(64), -3));
+ EXPECT_EQ(4, getLgCeiling(UINT64_C(9), 0));
+ EXPECT_EQ(64, getLgCeiling((UINT64_MAX >> 1) + 2, 0));
+ EXPECT_EQ(64, getLgCeiling(UINT64_MAX, 0));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT64_C(0), -1));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT64_C(0), 0));
+ EXPECT_EQ(INT32_MIN, getLgCeiling(UINT64_C(0), 1));
+}
+
+TEST(ScaledNumberHelpersTest, compare) {
+ EXPECT_EQ(0, compare(UINT32_C(0), 0, UINT32_C(0), 1));
+ EXPECT_EQ(0, compare(UINT32_C(0), 0, UINT32_C(0), -10));
+ EXPECT_EQ(0, compare(UINT32_C(0), 0, UINT32_C(0), 20));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(64), -3));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(32), -2));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(16), -1));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(8), 0));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(4), 1));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(2), 2));
+ EXPECT_EQ(0, compare(UINT32_C(8), 0, UINT32_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT32_C(0), 0, UINT32_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT32_C(7), 0, UINT32_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT32_C(7), 0, UINT32_C(64), -3));
+ EXPECT_EQ(1, compare(UINT32_C(9), 0, UINT32_C(1), 3));
+ EXPECT_EQ(1, compare(UINT32_C(9), 0, UINT32_C(64), -3));
+ EXPECT_EQ(1, compare(UINT32_C(9), 0, UINT32_C(0), 0));
+
+ EXPECT_EQ(0, compare(UINT64_C(0), 0, UINT64_C(0), 1));
+ EXPECT_EQ(0, compare(UINT64_C(0), 0, UINT64_C(0), -10));
+ EXPECT_EQ(0, compare(UINT64_C(0), 0, UINT64_C(0), 20));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(64), -3));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(32), -2));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(16), -1));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(8), 0));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(4), 1));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(2), 2));
+ EXPECT_EQ(0, compare(UINT64_C(8), 0, UINT64_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT64_C(0), 0, UINT64_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT64_C(7), 0, UINT64_C(1), 3));
+ EXPECT_EQ(-1, compare(UINT64_C(7), 0, UINT64_C(64), -3));
+ EXPECT_EQ(1, compare(UINT64_C(9), 0, UINT64_C(1), 3));
+ EXPECT_EQ(1, compare(UINT64_C(9), 0, UINT64_C(64), -3));
+ EXPECT_EQ(1, compare(UINT64_C(9), 0, UINT64_C(0), 0));
+ EXPECT_EQ(-1, compare(UINT64_MAX, 0, UINT64_C(1), 64));
+}
+
+TEST(ScaledNumberHelpersTest, matchScales) {
+#define MATCH_SCALES(T, LDIn, LSIn, RDIn, RSIn, LDOut, RDOut, SOut) \
+ do { \
+ T LDx = LDIn; \
+ T RDx = RDIn; \
+ T LDy = LDOut; \
+ T RDy = RDOut; \
+ int16_t LSx = LSIn; \
+ int16_t RSx = RSIn; \
+ int16_t Sy = SOut; \
+ \
+ EXPECT_EQ(SOut, matchScales(LDx, LSx, RDx, RSx)); \
+ EXPECT_EQ(LDy, LDx); \
+ EXPECT_EQ(RDy, RDx); \
+ if (LDy) \
+ EXPECT_EQ(Sy, LSx); \
+ if (RDy) \
+ EXPECT_EQ(Sy, RSx); \
+ } while (false)
+
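+  // Each row runs matchScales on copies of (LDIn, LSIn) and (RDIn, RSIn)
+  // and checks that the digits come back as LDOut/RDOut and that every
+  // nonzero side now carries the common scale SOut, which is also the
+  // value matchScales returns.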
+ MATCH_SCALES(uint32_t, 0, 0, 0, 0, 0, 0, 0);
+ MATCH_SCALES(uint32_t, 0, 50, 7, 1, 0, 7, 1);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 31, 1, 9, 0, UINT32_C(1) << 31, 4, 1);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 31, 2, 9, 0, UINT32_C(1) << 31, 2, 2);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 31, 3, 9, 0, UINT32_C(1) << 31, 1, 3);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 31, 4, 9, 0, UINT32_C(1) << 31, 0, 4);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 30, 4, 9, 0, UINT32_C(1) << 31, 1, 3);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 29, 4, 9, 0, UINT32_C(1) << 31, 2, 2);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 28, 4, 9, 0, UINT32_C(1) << 31, 4, 1);
+ MATCH_SCALES(uint32_t, UINT32_C(1) << 27, 4, 9, 0, UINT32_C(1) << 31, 9, 0);
+ MATCH_SCALES(uint32_t, 7, 1, 0, 50, 7, 0, 1);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 31, 1, 4, UINT32_C(1) << 31, 1);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 31, 2, 2, UINT32_C(1) << 31, 2);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 31, 3, 1, UINT32_C(1) << 31, 3);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 31, 4, 0, UINT32_C(1) << 31, 4);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 30, 4, 1, UINT32_C(1) << 31, 3);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 29, 4, 2, UINT32_C(1) << 31, 2);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 28, 4, 4, UINT32_C(1) << 31, 1);
+ MATCH_SCALES(uint32_t, 9, 0, UINT32_C(1) << 27, 4, 9, UINT32_C(1) << 31, 0);
+
+ MATCH_SCALES(uint64_t, 0, 0, 0, 0, 0, 0, 0);
+ MATCH_SCALES(uint64_t, 0, 100, 7, 1, 0, 7, 1);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 63, 1, 9, 0, UINT64_C(1) << 63, 4, 1);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 63, 2, 9, 0, UINT64_C(1) << 63, 2, 2);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 63, 3, 9, 0, UINT64_C(1) << 63, 1, 3);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 63, 4, 9, 0, UINT64_C(1) << 63, 0, 4);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 62, 4, 9, 0, UINT64_C(1) << 63, 1, 3);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 61, 4, 9, 0, UINT64_C(1) << 63, 2, 2);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 60, 4, 9, 0, UINT64_C(1) << 63, 4, 1);
+ MATCH_SCALES(uint64_t, UINT64_C(1) << 59, 4, 9, 0, UINT64_C(1) << 63, 9, 0);
+ MATCH_SCALES(uint64_t, 7, 1, 0, 100, 7, 0, 1);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 63, 1, 4, UINT64_C(1) << 63, 1);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 63, 2, 2, UINT64_C(1) << 63, 2);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 63, 3, 1, UINT64_C(1) << 63, 3);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 63, 4, 0, UINT64_C(1) << 63, 4);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 62, 4, 1, UINT64_C(1) << 63, 3);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 61, 4, 2, UINT64_C(1) << 63, 2);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 60, 4, 4, UINT64_C(1) << 63, 1);
+ MATCH_SCALES(uint64_t, 9, 0, UINT64_C(1) << 59, 4, 9, UINT64_C(1) << 63, 0);
+}
+
+TEST(ScaledNumberHelpersTest, getSum) {
+ // Zero.
+ EXPECT_EQ(SP32(1, 0), getSum32(0, 0, 1, 0));
+ EXPECT_EQ(SP32(8, -3), getSum32(0, 0, 8, -3));
+ EXPECT_EQ(SP32(UINT32_MAX, 0), getSum32(0, 0, UINT32_MAX, 0));
+
+ // Basic.
+ EXPECT_EQ(SP32(2, 0), getSum32(1, 0, 1, 0));
+ EXPECT_EQ(SP32(3, 0), getSum32(1, 0, 2, 0));
+ EXPECT_EQ(SP32(67, 0), getSum32(7, 0, 60, 0));
+
+ // Different scales.
+ EXPECT_EQ(SP32(3, 0), getSum32(1, 0, 1, 1));
+ EXPECT_EQ(SP32(4, 0), getSum32(2, 0, 1, 1));
+
+ // Loss of precision.
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, 1), getSum32(1, 32, 1, 0));
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, -31), getSum32(1, -32, 1, 0));
+
+ // Not quite loss of precision.
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, 1), getSum32(1, 32, 1, 1));
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, -32), getSum32(1, -32, 1, -1));
+
+ // Overflow.
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, 1), getSum32(1, 0, UINT32_MAX, 0));
+
+ // Reverse operand order.
+ EXPECT_EQ(SP32(1, 0), getSum32(1, 0, 0, 0));
+ EXPECT_EQ(SP32(8, -3), getSum32(8, -3, 0, 0));
+ EXPECT_EQ(SP32(UINT32_MAX, 0), getSum32(UINT32_MAX, 0, 0, 0));
+ EXPECT_EQ(SP32(3, 0), getSum32(2, 0, 1, 0));
+ EXPECT_EQ(SP32(67, 0), getSum32(60, 0, 7, 0));
+ EXPECT_EQ(SP32(3, 0), getSum32(1, 1, 1, 0));
+ EXPECT_EQ(SP32(4, 0), getSum32(1, 1, 2, 0));
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, 1), getSum32(1, 0, 1, 32));
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, -31), getSum32(1, 0, 1, -32));
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, 1), getSum32(1, 1, 1, 32));
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, -32), getSum32(1, -1, 1, -32));
+ EXPECT_EQ(SP32(UINT32_C(1) << 31, 1), getSum32(UINT32_MAX, 0, 1, 0));
+
+ // Zero.
+ EXPECT_EQ(SP64(1, 0), getSum64(0, 0, 1, 0));
+ EXPECT_EQ(SP64(8, -3), getSum64(0, 0, 8, -3));
+ EXPECT_EQ(SP64(UINT64_MAX, 0), getSum64(0, 0, UINT64_MAX, 0));
+
+ // Basic.
+ EXPECT_EQ(SP64(2, 0), getSum64(1, 0, 1, 0));
+ EXPECT_EQ(SP64(3, 0), getSum64(1, 0, 2, 0));
+ EXPECT_EQ(SP64(67, 0), getSum64(7, 0, 60, 0));
+
+ // Different scales.
+ EXPECT_EQ(SP64(3, 0), getSum64(1, 0, 1, 1));
+ EXPECT_EQ(SP64(4, 0), getSum64(2, 0, 1, 1));
+
+ // Loss of precision.
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, 1), getSum64(1, 64, 1, 0));
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, -63), getSum64(1, -64, 1, 0));
+
+ // Not quite loss of precision.
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, 1), getSum64(1, 64, 1, 1));
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, -64), getSum64(1, -64, 1, -1));
+
+ // Overflow.
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, 1), getSum64(1, 0, UINT64_MAX, 0));
+
+ // Reverse operand order.
+ EXPECT_EQ(SP64(1, 0), getSum64(1, 0, 0, 0));
+ EXPECT_EQ(SP64(8, -3), getSum64(8, -3, 0, 0));
+ EXPECT_EQ(SP64(UINT64_MAX, 0), getSum64(UINT64_MAX, 0, 0, 0));
+ EXPECT_EQ(SP64(3, 0), getSum64(2, 0, 1, 0));
+ EXPECT_EQ(SP64(67, 0), getSum64(60, 0, 7, 0));
+ EXPECT_EQ(SP64(3, 0), getSum64(1, 1, 1, 0));
+ EXPECT_EQ(SP64(4, 0), getSum64(1, 1, 2, 0));
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, 1), getSum64(1, 0, 1, 64));
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, -63), getSum64(1, 0, 1, -64));
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, 1), getSum64(1, 1, 1, 64));
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, -64), getSum64(1, -1, 1, -64));
+ EXPECT_EQ(SP64(UINT64_C(1) << 63, 1), getSum64(UINT64_MAX, 0, 1, 0));
+}
+
+TEST(ScaledNumberHelpersTest, getDifference) {
+ // Basic.
+ EXPECT_EQ(SP32(0, 0), getDifference32(1, 0, 1, 0));
+ EXPECT_EQ(SP32(1, 0), getDifference32(2, 0, 1, 0));
+ EXPECT_EQ(SP32(53, 0), getDifference32(60, 0, 7, 0));
+
+ // Equals "0", different scales.
+ EXPECT_EQ(SP32(0, 0), getDifference32(2, 0, 1, 1));
+
+ // Subtract "0".
+ EXPECT_EQ(SP32(1, 0), getDifference32(1, 0, 0, 0));
+ EXPECT_EQ(SP32(8, -3), getDifference32(8, -3, 0, 0));
+ EXPECT_EQ(SP32(UINT32_MAX, 0), getDifference32(UINT32_MAX, 0, 0, 0));
+
+ // Loss of precision.
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, 1),
+ getDifference32((UINT32_C(1) << 31) + 1, 1, 1, 0));
+ EXPECT_EQ(SP32((UINT32_C(1) << 31) + 1, -31),
+ getDifference32((UINT32_C(1) << 31) + 1, -31, 1, -32));
+
+ // Not quite loss of precision.
+ EXPECT_EQ(SP32(UINT32_MAX, 0), getDifference32(1, 32, 1, 0));
+ EXPECT_EQ(SP32(UINT32_MAX, -32), getDifference32(1, 0, 1, -32));
+
+ // Saturate to "0".
+ EXPECT_EQ(SP32(0, 0), getDifference32(0, 0, 1, 0));
+ EXPECT_EQ(SP32(0, 0), getDifference32(0, 0, 8, -3));
+ EXPECT_EQ(SP32(0, 0), getDifference32(0, 0, UINT32_MAX, 0));
+ EXPECT_EQ(SP32(0, 0), getDifference32(7, 0, 60, 0));
+ EXPECT_EQ(SP32(0, 0), getDifference32(1, 0, 1, 1));
+ EXPECT_EQ(SP32(0, 0), getDifference32(1, -32, 1, 0));
+ EXPECT_EQ(SP32(0, 0), getDifference32(1, -32, 1, -1));
+
+ // Regression tests for cases that failed during bringup.
+ EXPECT_EQ(SP32(UINT32_C(1) << 26, -31),
+ getDifference32(1, 0, UINT32_C(31) << 27, -32));
+
+ // Basic.
+ EXPECT_EQ(SP64(0, 0), getDifference64(1, 0, 1, 0));
+ EXPECT_EQ(SP64(1, 0), getDifference64(2, 0, 1, 0));
+ EXPECT_EQ(SP64(53, 0), getDifference64(60, 0, 7, 0));
+
+ // Equals "0", different scales.
+ EXPECT_EQ(SP64(0, 0), getDifference64(2, 0, 1, 1));
+
+ // Subtract "0".
+ EXPECT_EQ(SP64(1, 0), getDifference64(1, 0, 0, 0));
+ EXPECT_EQ(SP64(8, -3), getDifference64(8, -3, 0, 0));
+ EXPECT_EQ(SP64(UINT64_MAX, 0), getDifference64(UINT64_MAX, 0, 0, 0));
+
+ // Loss of precision.
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, 1),
+ getDifference64((UINT64_C(1) << 63) + 1, 1, 1, 0));
+ EXPECT_EQ(SP64((UINT64_C(1) << 63) + 1, -63),
+ getDifference64((UINT64_C(1) << 63) + 1, -63, 1, -64));
+
+ // Not quite loss of precision.
+ EXPECT_EQ(SP64(UINT64_MAX, 0), getDifference64(1, 64, 1, 0));
+ EXPECT_EQ(SP64(UINT64_MAX, -64), getDifference64(1, 0, 1, -64));
+
+ // Saturate to "0".
+ EXPECT_EQ(SP64(0, 0), getDifference64(0, 0, 1, 0));
+ EXPECT_EQ(SP64(0, 0), getDifference64(0, 0, 8, -3));
+ EXPECT_EQ(SP64(0, 0), getDifference64(0, 0, UINT64_MAX, 0));
+ EXPECT_EQ(SP64(0, 0), getDifference64(7, 0, 60, 0));
+ EXPECT_EQ(SP64(0, 0), getDifference64(1, 0, 1, 1));
+ EXPECT_EQ(SP64(0, 0), getDifference64(1, -64, 1, 0));
+ EXPECT_EQ(SP64(0, 0), getDifference64(1, -64, 1, -1));
+}
+
+} // end namespace
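Everything in ScaledNumberHelpersTest manipulates (digits, scale) pairs denoting Digits x 2^Scale, which is why several textually different pairs above compare equal. A minimal standalone sketch of that invariant, assuming the same ScaledNumbers helpers the test exercises:

    #include "llvm/Support/ScaledNumber.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      using namespace llvm::ScaledNumbers;
      // 8 x 2^0 == 1 x 2^3 == 64 x 2^-3, so all three pairs compare equal.
      assert(compare<uint64_t>(8, 0, 1, 3) == 0);
      assert(compare<uint64_t>(8, 0, 64, -3) == 0);
      assert(compare<uint64_t>(7, 0, 1, 3) < 0); // 7 x 2^0 < 1 x 2^3
      return 0;
    }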
diff --git a/unittests/Support/SpecialCaseListTest.cpp b/unittests/Support/SpecialCaseListTest.cpp
new file mode 100644
index 0000000..bb9c351
--- /dev/null
+++ b/unittests/Support/SpecialCaseListTest.cpp
@@ -0,0 +1,126 @@
+//===- SpecialCaseListTest.cpp - Unit tests for SpecialCaseList -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class SpecialCaseListTest : public ::testing::Test {
+protected:
+ SpecialCaseList *makeSpecialCaseList(StringRef List, std::string &Error) {
+ std::unique_ptr<MemoryBuffer> MB(MemoryBuffer::getMemBuffer(List));
+ return SpecialCaseList::create(MB.get(), Error);
+ }
+
+ SpecialCaseList *makeSpecialCaseList(StringRef List) {
+ std::string Error;
+ SpecialCaseList *SCL = makeSpecialCaseList(List, Error);
+ assert(SCL);
+ assert(Error == "");
+ return SCL;
+ }
+};
+
+TEST_F(SpecialCaseListTest, Basic) {
+ std::unique_ptr<SpecialCaseList> SCL(
+ makeSpecialCaseList("# This is a comment.\n"
+ "\n"
+ "src:hello\n"
+ "src:bye\n"
+ "src:hi=category\n"
+ "src:z*=category\n"));
+ EXPECT_TRUE(SCL->inSection("src", "hello"));
+ EXPECT_TRUE(SCL->inSection("src", "bye"));
+ EXPECT_TRUE(SCL->inSection("src", "hi", "category"));
+ EXPECT_TRUE(SCL->inSection("src", "zzzz", "category"));
+ EXPECT_FALSE(SCL->inSection("src", "hi"));
+ EXPECT_FALSE(SCL->inSection("fun", "hello"));
+ EXPECT_FALSE(SCL->inSection("src", "hello", "category"));
+}
+
+TEST_F(SpecialCaseListTest, GlobalInitCompat) {
+ std::unique_ptr<SpecialCaseList> SCL(
+ makeSpecialCaseList("global:foo=init\n"));
+ EXPECT_FALSE(SCL->inSection("global", "foo"));
+ EXPECT_FALSE(SCL->inSection("global", "bar"));
+ EXPECT_TRUE(SCL->inSection("global", "foo", "init"));
+ EXPECT_FALSE(SCL->inSection("global", "bar", "init"));
+
+ SCL.reset(makeSpecialCaseList("global-init:foo\n"));
+ EXPECT_FALSE(SCL->inSection("global", "foo"));
+ EXPECT_FALSE(SCL->inSection("global", "bar"));
+ EXPECT_TRUE(SCL->inSection("global", "foo", "init"));
+ EXPECT_FALSE(SCL->inSection("global", "bar", "init"));
+
+ SCL.reset(makeSpecialCaseList("type:t2=init\n"));
+ EXPECT_FALSE(SCL->inSection("type", "t1"));
+ EXPECT_FALSE(SCL->inSection("type", "t2"));
+ EXPECT_FALSE(SCL->inSection("type", "t1", "init"));
+ EXPECT_TRUE(SCL->inSection("type", "t2", "init"));
+
+ SCL.reset(makeSpecialCaseList("global-init-type:t2\n"));
+ EXPECT_FALSE(SCL->inSection("type", "t1"));
+ EXPECT_FALSE(SCL->inSection("type", "t2"));
+ EXPECT_FALSE(SCL->inSection("type", "t1", "init"));
+ EXPECT_TRUE(SCL->inSection("type", "t2", "init"));
+
+ SCL.reset(makeSpecialCaseList("src:hello=init\n"));
+ EXPECT_FALSE(SCL->inSection("src", "hello"));
+ EXPECT_FALSE(SCL->inSection("src", "bye"));
+ EXPECT_TRUE(SCL->inSection("src", "hello", "init"));
+ EXPECT_FALSE(SCL->inSection("src", "bye", "init"));
+
+ SCL.reset(makeSpecialCaseList("global-init-src:hello\n"));
+ EXPECT_FALSE(SCL->inSection("src", "hello"));
+ EXPECT_FALSE(SCL->inSection("src", "bye"));
+ EXPECT_TRUE(SCL->inSection("src", "hello", "init"));
+ EXPECT_FALSE(SCL->inSection("src", "bye", "init"));
+}
+
+TEST_F(SpecialCaseListTest, Substring) {
+ std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("src:hello\n"
+ "fun:foo\n"
+ "global:bar\n"));
+ EXPECT_FALSE(SCL->inSection("src", "othello"));
+ EXPECT_FALSE(SCL->inSection("fun", "tomfoolery"));
+ EXPECT_FALSE(SCL->inSection("global", "bartender"));
+
+ SCL.reset(makeSpecialCaseList("fun:*foo*\n"));
+ EXPECT_TRUE(SCL->inSection("fun", "tomfoolery"));
+ EXPECT_TRUE(SCL->inSection("fun", "foobar"));
+}
+
+TEST_F(SpecialCaseListTest, InvalidSpecialCaseList) {
+ std::string Error;
+ EXPECT_EQ(nullptr, makeSpecialCaseList("badline", Error));
+ EXPECT_EQ("Malformed line 1: 'badline'", Error);
+ EXPECT_EQ(nullptr, makeSpecialCaseList("src:bad[a-", Error));
+ EXPECT_EQ("Malformed regex in line 1: 'bad[a-': invalid character range",
+ Error);
+ EXPECT_EQ(nullptr, makeSpecialCaseList("src:a.c\n"
+ "fun:fun(a\n",
+ Error));
+ EXPECT_EQ("Malformed regex in line 2: 'fun(a': parentheses not balanced",
+ Error);
+ EXPECT_EQ(nullptr, SpecialCaseList::create("unexisting", Error));
+ EXPECT_EQ(0U, Error.find("Can't open file 'unexisting':"));
+}
+
+TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
+ std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList(""));
+ EXPECT_FALSE(SCL->inSection("foo", "bar"));
+}
+
+}
+
+
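For reference, the list format the new test exercises is line-oriented: each entry is <section>:<glob>[=<category>], with # comments and * wildcards. A sketch of the query pattern, using only the create/inSection API visible in the test above:

    std::string Err;
    std::unique_ptr<llvm::MemoryBuffer> MB(
        llvm::MemoryBuffer::getMemBuffer("fun:*foo*\nsrc:ignored.c=init\n"));
    std::unique_ptr<llvm::SpecialCaseList> SCL(
        llvm::SpecialCaseList::create(MB.get(), Err));
    bool A = SCL->inSection("fun", "tomfoolery");          // true: glob hit
    bool B = SCL->inSection("src", "ignored.c");           // false: wrong category
    bool C = SCL->inSection("src", "ignored.c", "init");   // true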
diff --git a/unittests/Support/StringPool.cpp b/unittests/Support/StringPool.cpp
new file mode 100644
index 0000000..7b7805f
--- /dev/null
+++ b/unittests/Support/StringPool.cpp
@@ -0,0 +1,31 @@
+//===- StringPool.cpp - Unit tests for StringPool -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StringPool.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(PooledStringPtrTest, OperatorEquals) {
+ StringPool pool;
+ const PooledStringPtr a = pool.intern("a");
+ const PooledStringPtr b = pool.intern("b");
+ EXPECT_FALSE(a == b);
+}
+
+TEST(PooledStringPtrTest, OperatorNotEquals) {
+ StringPool pool;
+ const PooledStringPtr a = pool.intern("a");
+ const PooledStringPtr b = pool.intern("b");
+ EXPECT_TRUE(a != b);
+}
+
+} // end namespace
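The two tests only cover distinct strings; the property that makes a pool interesting is that interning equal text twice yields handles that compare equal. A sketch using the same intern API as the tests:

    llvm::StringPool Pool;
    const llvm::PooledStringPtr A = Pool.intern("a");
    const llvm::PooledStringPtr B = Pool.intern("a");
    assert(A == B); // equal text interns to the same pooled entry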
diff --git a/unittests/Support/SwapByteOrderTest.cpp b/unittests/Support/SwapByteOrderTest.cpp
index 85ac6f3..525cfc1 100644
--- a/unittests/Support/SwapByteOrderTest.cpp
+++ b/unittests/Support/SwapByteOrderTest.cpp
@@ -20,70 +20,70 @@ namespace {
// In these first two tests all of the original_uintx values are truncated
// except for 64. We could avoid this, but there's really no point.
-TEST(SwapByteOrder, UnsignedRoundTrip) {
+TEST(getSwappedBytes, UnsignedRoundTrip) {
// The point of the bit twiddling of magic is to test with and without bits
// in every byte.
uint64_t value = 1;
for (std::size_t i = 0; i <= sizeof(value); ++i) {
uint8_t original_uint8 = static_cast<uint8_t>(value);
EXPECT_EQ(original_uint8,
- sys::SwapByteOrder(sys::SwapByteOrder(original_uint8)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_uint8)));
uint16_t original_uint16 = static_cast<uint16_t>(value);
EXPECT_EQ(original_uint16,
- sys::SwapByteOrder(sys::SwapByteOrder(original_uint16)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_uint16)));
uint32_t original_uint32 = static_cast<uint32_t>(value);
EXPECT_EQ(original_uint32,
- sys::SwapByteOrder(sys::SwapByteOrder(original_uint32)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_uint32)));
uint64_t original_uint64 = static_cast<uint64_t>(value);
EXPECT_EQ(original_uint64,
- sys::SwapByteOrder(sys::SwapByteOrder(original_uint64)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_uint64)));
value = (value << 8) | 0x55; // binary 0101 0101.
}
}
-TEST(SwapByteOrder, SignedRoundTrip) {
+TEST(getSwappedBytes, SignedRoundTrip) {
// The point of the bit twiddling of magic is to test with and without bits
// in every byte.
uint64_t value = 1;
for (std::size_t i = 0; i <= sizeof(value); ++i) {
int8_t original_int8 = static_cast<int8_t>(value);
EXPECT_EQ(original_int8,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int8)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int8)));
int16_t original_int16 = static_cast<int16_t>(value);
EXPECT_EQ(original_int16,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int16)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int16)));
int32_t original_int32 = static_cast<int32_t>(value);
EXPECT_EQ(original_int32,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int32)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int32)));
int64_t original_int64 = static_cast<int64_t>(value);
EXPECT_EQ(original_int64,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int64)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int64)));
// Test other sign.
value *= -1;
original_int8 = static_cast<int8_t>(value);
EXPECT_EQ(original_int8,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int8)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int8)));
original_int16 = static_cast<int16_t>(value);
EXPECT_EQ(original_int16,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int16)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int16)));
original_int32 = static_cast<int32_t>(value);
EXPECT_EQ(original_int32,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int32)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int32)));
original_int64 = static_cast<int64_t>(value);
EXPECT_EQ(original_int64,
- sys::SwapByteOrder(sys::SwapByteOrder(original_int64)));
+ sys::getSwappedBytes(sys::getSwappedBytes(original_int64)));
// Return to normal sign and twiddle.
value *= -1;
@@ -91,38 +91,86 @@ TEST(SwapByteOrder, SignedRoundTrip) {
}
}
-TEST(SwapByteOrder, uint8_t) {
- EXPECT_EQ(uint8_t(0x11), sys::SwapByteOrder(uint8_t(0x11)));
+TEST(getSwappedBytes, uint8_t) {
+ EXPECT_EQ(uint8_t(0x11), sys::getSwappedBytes(uint8_t(0x11)));
}
-TEST(SwapByteOrder, uint16_t) {
- EXPECT_EQ(uint16_t(0x1122), sys::SwapByteOrder(uint16_t(0x2211)));
+TEST(getSwappedBytes, uint16_t) {
+ EXPECT_EQ(uint16_t(0x1122), sys::getSwappedBytes(uint16_t(0x2211)));
}
-TEST(SwapByteOrder, uint32_t) {
- EXPECT_EQ(uint32_t(0x11223344), sys::SwapByteOrder(uint32_t(0x44332211)));
+TEST(getSwappedBytes, uint32_t) {
+ EXPECT_EQ(uint32_t(0x11223344), sys::getSwappedBytes(uint32_t(0x44332211)));
}
-TEST(SwapByteOrder, uint64_t) {
+TEST(getSwappedBytes, uint64_t) {
EXPECT_EQ(uint64_t(0x1122334455667788ULL),
- sys::SwapByteOrder(uint64_t(0x8877665544332211ULL)));
+ sys::getSwappedBytes(uint64_t(0x8877665544332211ULL)));
}
-TEST(SwapByteOrder, int8_t) {
- EXPECT_EQ(int8_t(0x11), sys::SwapByteOrder(int8_t(0x11)));
+TEST(getSwappedBytes, int8_t) {
+ EXPECT_EQ(int8_t(0x11), sys::getSwappedBytes(int8_t(0x11)));
}
-TEST(SwapByteOrder, int16_t) {
- EXPECT_EQ(int16_t(0x1122), sys::SwapByteOrder(int16_t(0x2211)));
+TEST(getSwappedBytes, int16_t) {
+ EXPECT_EQ(int16_t(0x1122), sys::getSwappedBytes(int16_t(0x2211)));
}
-TEST(SwapByteOrder, int32_t) {
- EXPECT_EQ(int32_t(0x11223344), sys::SwapByteOrder(int32_t(0x44332211)));
+TEST(getSwappedBytes, int32_t) {
+ EXPECT_EQ(int32_t(0x11223344), sys::getSwappedBytes(int32_t(0x44332211)));
}
-TEST(SwapByteOrder, int64_t) {
+TEST(getSwappedBytes, int64_t) {
EXPECT_EQ(int64_t(0x1122334455667788LL),
- sys::SwapByteOrder(int64_t(0x8877665544332211LL)));
+ sys::getSwappedBytes(int64_t(0x8877665544332211LL)));
+}
+
+TEST(swapByteOrder, uint8_t) {
+ uint8_t value = 0x11;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(uint8_t(0x11), value);
+}
+
+TEST(swapByteOrder, uint16_t) {
+ uint16_t value = 0x2211;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(uint16_t(0x1122), value);
+}
+
+TEST(swapByteOrder, uint32_t) {
+ uint32_t value = 0x44332211;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(uint32_t(0x11223344), value);
+}
+
+TEST(swapByteOrder, uint64_t) {
+ uint64_t value = 0x8877665544332211ULL;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(uint64_t(0x1122334455667788ULL), value);
+}
+
+TEST(swapByteOrder, int8_t) {
+ int8_t value = 0x11;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(int8_t(0x11), value);
+}
+
+TEST(swapByteOrder, int16_t) {
+ int16_t value = 0x2211;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(int16_t(0x1122), value);
+}
+
+TEST(swapByteOrder, int32_t) {
+ int32_t value = 0x44332211;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(int32_t(0x11223344), value);
+}
+
+TEST(swapByteOrder, int64_t) {
+ int64_t value = 0x8877665544332211LL;
+ sys::swapByteOrder(value);
+ EXPECT_EQ(int64_t(0x1122334455667788LL), value);
}
}
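The rename splits the old overloaded SwapByteOrder into a value-returning form and an in-place form, which is exactly what the new tests pin down; assuming the post-rename header:

    #include "llvm/Support/SwapByteOrder.h"

    uint32_t X = 0x44332211;
    uint32_t Y = llvm::sys::getSwappedBytes(X); // Y == 0x11223344, X untouched
    llvm::sys::swapByteOrder(X);                // X itself becomes 0x11223344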
diff --git a/unittests/Support/ThreadLocalTest.cpp b/unittests/Support/ThreadLocalTest.cpp
index dd4d706..ea751be 100644
--- a/unittests/Support/ThreadLocalTest.cpp
+++ b/unittests/Support/ThreadLocalTest.cpp
@@ -25,14 +25,14 @@ struct S {
TEST_F(ThreadLocalTest, Basics) {
ThreadLocal<const S> x;
- EXPECT_EQ(0, x.get());
+ EXPECT_EQ(nullptr, x.get());
S s;
x.set(&s);
EXPECT_EQ(&s, x.get());
x.erase();
- EXPECT_EQ(0, x.get());
+ EXPECT_EQ(nullptr, x.get());
}
}
diff --git a/unittests/Support/TimeValueTest.cpp b/unittests/Support/TimeValueTest.cpp
index 8058812..3d2b978 100644
--- a/unittests/Support/TimeValueTest.cpp
+++ b/unittests/Support/TimeValueTest.cpp
@@ -16,7 +16,7 @@ namespace {
TEST(TimeValue, time_t) {
sys::TimeValue now = sys::TimeValue::now();
- time_t now_t = time(NULL);
+ time_t now_t = time(nullptr);
EXPECT_TRUE(std::abs(static_cast<long>(now_t - now.toEpochTime())) < 2);
}
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index cf95532..8aed980 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -1204,9 +1204,9 @@ TEST(YAMLIO, TestValidatingInput) {
std::vector<MyValidation> docList;
Input yin("--- \nvalue: 3.0\n"
"--- \nvalue: -1.0\n...\n",
- NULL, suppressErrorMessages);
+ nullptr, suppressErrorMessages);
yin >> docList;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1224,10 +1224,10 @@ TEST(YAMLIO, TestColorsReadError) {
"c2: purple\n"
"c3: green\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> map;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1241,11 +1241,11 @@ TEST(YAMLIO, TestFlagsReadError) {
"f2: [ round, hollow ]\n"
"f3: []\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> map;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1260,11 +1260,11 @@ TEST(YAMLIO, TestReadBuiltInTypesUint8Error) {
"- 0\n"
"- 257\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1279,11 +1279,11 @@ TEST(YAMLIO, TestReadBuiltInTypesUint16Error) {
"- 0\n"
"- 66000\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1298,11 +1298,11 @@ TEST(YAMLIO, TestReadBuiltInTypesUint32Error) {
"- 0\n"
"- 5000000000\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1317,11 +1317,11 @@ TEST(YAMLIO, TestReadBuiltInTypesUint64Error) {
"- 0\n"
"- 19446744073709551615\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1337,11 +1337,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint8OverError) {
"- 127\n"
"- 128\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1355,11 +1355,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint8UnderError) {
"- 127\n"
"- -129\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1375,11 +1375,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint16UnderError) {
"- -32768\n"
"- -32769\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1394,11 +1394,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint16OverError) {
"- -32768\n"
"- 32768\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1414,11 +1414,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint32UnderError) {
"- -2147483648\n"
"- -2147483649\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1432,11 +1432,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint32OverError) {
"- -2147483648\n"
"- 2147483649\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1452,11 +1452,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint64UnderError) {
"- 9223372036854775807\n"
"- -9223372036854775809\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1470,11 +1470,11 @@ TEST(YAMLIO, TestReadBuiltInTypesint64OverError) {
"- 9223372036854775807\n"
"- 9223372036854775809\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1489,11 +1489,11 @@ TEST(YAMLIO, TestReadBuiltInTypesFloatError) {
"- -123.456\n"
"- 1.2.3\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1508,11 +1508,11 @@ TEST(YAMLIO, TestReadBuiltInTypesDoubleError) {
"- -123.456\n"
"- 1.2.3\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1526,11 +1526,11 @@ TEST(YAMLIO, TestReadBuiltInTypesHex8Error) {
"- 0xFE\n"
"- 0x123\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
@@ -1545,11 +1545,11 @@ TEST(YAMLIO, TestReadBuiltInTypesHex16Error) {
"- 0xFEFF\n"
"- 0x12345\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1563,11 +1563,11 @@ TEST(YAMLIO, TestReadBuiltInTypesHex32Error) {
"- 0xFEFF0000\n"
"- 0x1234556789\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
//
@@ -1581,11 +1581,11 @@ TEST(YAMLIO, TestReadBuiltInTypesHex64Error) {
"- 0xFFEEDDCCBBAA9988\n"
"- 0x12345567890ABCDEF0\n"
"...\n",
- /*Ctxt=*/NULL,
+ /*Ctxt=*/nullptr,
suppressErrorMessages);
yin >> seq;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
TEST(YAMLIO, TestMalformedMapFailsGracefully) {
@@ -1593,15 +1593,15 @@ TEST(YAMLIO, TestMalformedMapFailsGracefully) {
{
// We pass the suppressErrorMessages handler to handle the error
// message generated in the constructor of Input.
- Input yin("{foo:3, bar: 5}", /*Ctxt=*/NULL, suppressErrorMessages);
+ Input yin("{foo:3, bar: 5}", /*Ctxt=*/nullptr, suppressErrorMessages);
yin >> doc;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
{
- Input yin("---\nfoo:3\nbar: 5\n...\n", /*Ctxt=*/NULL, suppressErrorMessages);
+ Input yin("---\nfoo:3\nbar: 5\n...\n", /*Ctxt=*/nullptr, suppressErrorMessages);
yin >> doc;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
}
@@ -1673,7 +1673,7 @@ TEST(YAMLIO, TestEmptyStringFailsForMapWithRequiredFields) {
FooBar doc;
Input yin("");
yin >> doc;
- EXPECT_TRUE(yin.error());
+ EXPECT_TRUE(!!yin.error());
}
TEST(YAMLIO, TestEmptyStringSucceedsForMapWithOptionalFields) {
@@ -1685,7 +1685,7 @@ TEST(YAMLIO, TestEmptyStringSucceedsForMapWithOptionalFields) {
TEST(YAMLIO, TestEmptyStringSucceedsForSequence) {
std::vector<uint8_t> seq;
- Input yin("", /*Ctxt=*/NULL, suppressErrorMessages);
+ Input yin("", /*Ctxt=*/nullptr, suppressErrorMessages);
yin >> seq;
EXPECT_FALSE(yin.error());
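The recurring !! above is forced by the migration to std::error_code: its operator bool is explicit, and the gtest of this era routes EXPECT_TRUE's argument through a construction that only accepts an implicit bool conversion, so the predicate is negated twice to trigger a contextual conversion instead. A minimal illustration with no LLVM dependency:

    #include <cassert>
    #include <system_error>

    void demo() {
      std::error_code EC = std::make_error_code(std::errc::invalid_argument);
      assert(!!EC);       // !! forces the explicit operator bool
      std::error_code OK; // default-constructed error_code means success
      assert(!OK);
    }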
diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp
index 2b797b4..44d27d0 100644
--- a/unittests/Support/raw_ostream_test.cpp
+++ b/unittests/Support/raw_ostream_test.cpp
@@ -69,7 +69,7 @@ TEST(raw_ostreamTest, Types_Buffered) {
EXPECT_EQ("1.100000e+00", printToString(1.1));
// void*
- EXPECT_EQ("0x0", printToString((void*) 0));
+ EXPECT_EQ("0x0", printToString((void*) nullptr));
EXPECT_EQ("0xbeef", printToString((void*) 0xbeef));
EXPECT_EQ("0xdeadbeef", printToString((void*) 0xdeadbeef));
@@ -100,7 +100,7 @@ TEST(raw_ostreamTest, Types_Unbuffered) {
EXPECT_EQ("1.100000e+00", printToStringUnbuffered(1.1));
// void*
- EXPECT_EQ("0x0", printToStringUnbuffered((void*) 0));
+ EXPECT_EQ("0x0", printToStringUnbuffered((void*) nullptr));
EXPECT_EQ("0xbeef", printToStringUnbuffered((void*) 0xbeef));
EXPECT_EQ("0xdeadbeef", printToStringUnbuffered((void*) 0xdeadbeef));
diff --git a/unittests/Transforms/DebugIR/DebugIR.cpp b/unittests/Transforms/DebugIR/DebugIR.cpp
index 9b89c15..41df147 100644
--- a/unittests/Transforms/DebugIR/DebugIR.cpp
+++ b/unittests/Transforms/DebugIR/DebugIR.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/Path.h"
@@ -57,7 +58,7 @@ void insertCUDescriptor(Module *M, StringRef File, StringRef Dir,
bool removeIfExists(StringRef Path) {
// This is an approximation, on error we don't know in general if the file
// existed or not.
- llvm::error_code EC = sys::fs::remove(Path, false);
+ std::error_code EC = sys::fs::remove(Path, false);
return EC != llvm::errc::no_such_file_or_directory;
}
@@ -65,7 +66,7 @@ char * current_dir() {
#if defined(LLVM_ON_WIN32) || defined(HAVE_GETCWD)
// calling getcwd (or _getcwd() on windows) with a null buffer makes it
// allocate a sufficiently sized buffer to store the current working dir.
- return getcwd_impl(0, 0);
+ return getcwd_impl(nullptr, 0);
#else
return 0;
#endif
diff --git a/unittests/Transforms/Utils/CMakeLists.txt b/unittests/Transforms/Utils/CMakeLists.txt
index 60447bb..ffa1d49 100644
--- a/unittests/Transforms/Utils/CMakeLists.txt
+++ b/unittests/Transforms/Utils/CMakeLists.txt
@@ -9,5 +9,4 @@ add_llvm_unittest(UtilsTests
Cloning.cpp
IntegerDivision.cpp
Local.cpp
- SpecialCaseList.cpp
)
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index fb27dc1..b3a1f5b 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -32,7 +32,7 @@ namespace {
class CloneInstruction : public ::testing::Test {
protected:
virtual void SetUp() {
- V = NULL;
+ V = nullptr;
}
template <typename T>
@@ -272,7 +272,7 @@ protected:
void CreateNewFunc() {
ValueToValueMapTy VMap;
- NewFunc = CloneFunction(OldFunc, VMap, true, NULL);
+ NewFunc = CloneFunction(OldFunc, VMap, true, nullptr);
M->getFunctionList().push_back(NewFunc);
}
diff --git a/unittests/Transforms/Utils/SpecialCaseList.cpp b/unittests/Transforms/Utils/SpecialCaseList.cpp
deleted file mode 100644
index fd00687..0000000
--- a/unittests/Transforms/Utils/SpecialCaseList.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-//===- SpecialCaseList.cpp - Unit tests for SpecialCaseList ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Transforms/Utils/SpecialCaseList.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-class SpecialCaseListTest : public ::testing::Test {
-protected:
- Function *makeFunction(StringRef Name, Module &M) {
- return Function::Create(FunctionType::get(Type::getVoidTy(Ctx), false),
- GlobalValue::ExternalLinkage,
- Name,
- &M);
- }
-
- GlobalVariable *makeGlobal(StringRef Name, StringRef StructName, Module &M) {
- StructType *ST =
- StructType::create(StructName, Type::getInt32Ty(Ctx), (Type*)0);
- return new GlobalVariable(
- M, ST, false, GlobalValue::ExternalLinkage, 0, Name);
- }
-
- GlobalAlias *makeAlias(StringRef Name, GlobalObject *Aliasee) {
- return GlobalAlias::create(GlobalValue::ExternalLinkage, Name, Aliasee);
- }
-
- SpecialCaseList *makeSpecialCaseList(StringRef List, std::string &Error) {
- std::unique_ptr<MemoryBuffer> MB(MemoryBuffer::getMemBuffer(List));
- return SpecialCaseList::create(MB.get(), Error);
- }
-
- SpecialCaseList *makeSpecialCaseList(StringRef List) {
- std::string Error;
- SpecialCaseList *SCL = makeSpecialCaseList(List, Error);
- assert(SCL);
- assert(Error == "");
- return SCL;
- }
-
- LLVMContext Ctx;
-};
-
-TEST_F(SpecialCaseListTest, ModuleIsIn) {
- Module M("hello", Ctx);
- Function *F = makeFunction("foo", M);
- GlobalVariable *GV = makeGlobal("bar", "t", M);
-
- std::unique_ptr<SpecialCaseList> SCL(
- makeSpecialCaseList("# This is a comment.\n"
- "\n"
- "src:hello\n"));
- EXPECT_TRUE(SCL->isIn(M));
- EXPECT_TRUE(SCL->isIn(*F));
- EXPECT_TRUE(SCL->isIn(*GV));
-
- SCL.reset(makeSpecialCaseList("src:he*o\n"));
- EXPECT_TRUE(SCL->isIn(M));
- EXPECT_TRUE(SCL->isIn(*F));
- EXPECT_TRUE(SCL->isIn(*GV));
-
- SCL.reset(makeSpecialCaseList("src:hi\n"));
- EXPECT_FALSE(SCL->isIn(M));
- EXPECT_FALSE(SCL->isIn(*F));
- EXPECT_FALSE(SCL->isIn(*GV));
-}
-
-TEST_F(SpecialCaseListTest, FunctionIsIn) {
- Module M("hello", Ctx);
- Function *Foo = makeFunction("foo", M);
- Function *Bar = makeFunction("bar", M);
-
- std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("fun:foo\n"));
- EXPECT_TRUE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
-
- SCL.reset(makeSpecialCaseList("fun:b*\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_TRUE(SCL->isIn(*Bar));
-
- SCL.reset(makeSpecialCaseList("fun:f*\n"
- "fun:bar\n"));
- EXPECT_TRUE(SCL->isIn(*Foo));
- EXPECT_TRUE(SCL->isIn(*Bar));
-
- SCL.reset(makeSpecialCaseList("fun:foo=functional\n"));
- EXPECT_TRUE(SCL->isIn(*Foo, "functional"));
- StringRef Category;
- EXPECT_FALSE(SCL->isIn(*Bar, "functional"));
-}
-
-TEST_F(SpecialCaseListTest, GlobalIsIn) {
- Module M("hello", Ctx);
- GlobalVariable *Foo = makeGlobal("foo", "t1", M);
- GlobalVariable *Bar = makeGlobal("bar", "t2", M);
-
- std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("global:foo\n"));
- EXPECT_TRUE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_FALSE(SCL->isIn(*Foo, "init"));
- EXPECT_FALSE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("global:foo=init\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_TRUE(SCL->isIn(*Foo, "init"));
- EXPECT_FALSE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("global-init:foo\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_TRUE(SCL->isIn(*Foo, "init"));
- EXPECT_FALSE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("type:t2=init\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_FALSE(SCL->isIn(*Foo, "init"));
- EXPECT_TRUE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("global-init-type:t2\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_FALSE(SCL->isIn(*Foo, "init"));
- EXPECT_TRUE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("src:hello=init\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_TRUE(SCL->isIn(*Foo, "init"));
- EXPECT_TRUE(SCL->isIn(*Bar, "init"));
-
- SCL.reset(makeSpecialCaseList("global-init-src:hello\n"));
- EXPECT_FALSE(SCL->isIn(*Foo));
- EXPECT_FALSE(SCL->isIn(*Bar));
- EXPECT_TRUE(SCL->isIn(*Foo, "init"));
- EXPECT_TRUE(SCL->isIn(*Bar, "init"));
-}
-
-TEST_F(SpecialCaseListTest, AliasIsIn) {
- Module M("hello", Ctx);
- Function *Foo = makeFunction("foo", M);
- GlobalVariable *Bar = makeGlobal("bar", "t", M);
- GlobalAlias *FooAlias = makeAlias("fooalias", Foo);
- GlobalAlias *BarAlias = makeAlias("baralias", Bar);
-
- std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("fun:foo\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias));
- EXPECT_FALSE(SCL->isIn(*BarAlias));
-
- SCL.reset(makeSpecialCaseList("global:bar\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias));
- EXPECT_FALSE(SCL->isIn(*BarAlias));
-
- SCL.reset(makeSpecialCaseList("global:fooalias\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias));
- EXPECT_FALSE(SCL->isIn(*BarAlias));
-
- SCL.reset(makeSpecialCaseList("fun:fooalias\n"));
- EXPECT_TRUE(SCL->isIn(*FooAlias));
- EXPECT_FALSE(SCL->isIn(*BarAlias));
-
- SCL.reset(makeSpecialCaseList("global:baralias=init\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
- EXPECT_TRUE(SCL->isIn(*BarAlias, "init"));
-
- SCL.reset(makeSpecialCaseList("type:t=init\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
- EXPECT_TRUE(SCL->isIn(*BarAlias, "init"));
-
- SCL.reset(makeSpecialCaseList("fun:baralias=init\n"));
- EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
- EXPECT_FALSE(SCL->isIn(*BarAlias, "init"));
-}
-
-TEST_F(SpecialCaseListTest, Substring) {
- Module M("othello", Ctx);
- Function *F = makeFunction("tomfoolery", M);
- GlobalVariable *GV = makeGlobal("bartender", "t", M);
- GlobalAlias *GA1 = makeAlias("buffoonery", F);
- GlobalAlias *GA2 = makeAlias("foobar", GV);
-
- std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("src:hello\n"
- "fun:foo\n"
- "global:bar\n"));
- EXPECT_FALSE(SCL->isIn(M));
- EXPECT_FALSE(SCL->isIn(*F));
- EXPECT_FALSE(SCL->isIn(*GV));
- EXPECT_FALSE(SCL->isIn(*GA1));
- EXPECT_FALSE(SCL->isIn(*GA2));
-
- SCL.reset(makeSpecialCaseList("fun:*foo*\n"));
- EXPECT_TRUE(SCL->isIn(*F));
- EXPECT_TRUE(SCL->isIn(*GA1));
-}
-
-TEST_F(SpecialCaseListTest, InvalidSpecialCaseList) {
- std::string Error;
- EXPECT_EQ(0, makeSpecialCaseList("badline", Error));
- EXPECT_EQ("Malformed line 1: 'badline'", Error);
- EXPECT_EQ(0, makeSpecialCaseList("src:bad[a-", Error));
- EXPECT_EQ("Malformed regex in line 1: 'bad[a-': invalid character range",
- Error);
- EXPECT_EQ(0, makeSpecialCaseList("src:a.c\n"
- "fun:fun(a\n",
- Error));
- EXPECT_EQ("Malformed regex in line 2: 'fun(a': parentheses not balanced",
- Error);
- EXPECT_EQ(0, SpecialCaseList::create("unexisting", Error));
- EXPECT_EQ(0U, Error.find("Can't open file 'unexisting':"));
-}
-
-TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
- std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList(""));
- Module M("foo", Ctx);
- EXPECT_FALSE(SCL->isIn(M));
-}
-
-}
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index a124377..d88cf36 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -27,11 +27,11 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cctype>
#include <map>
#include <string>
+#include <system_error>
#include <vector>
using namespace llvm;
@@ -820,18 +820,18 @@ static StringRef FindFirstMatchingPrefix(StringRef &Buffer,
/// Returns true in case of an error, false otherwise.
static bool ReadCheckFile(SourceMgr &SM,
std::vector<CheckString> &CheckStrings) {
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec =
- MemoryBuffer::getFileOrSTDIN(CheckFilename, File)) {
- errs() << "Could not open check file '" << CheckFilename << "': "
- << ec.message() << '\n';
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(CheckFilename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ errs() << "Could not open check file '" << CheckFilename
+ << "': " << EC.message() << '\n';
return true;
}
// If we want to canonicalize whitespace, strip excess whitespace from the
// buffer containing the CHECK lines. Remove DOS style line endings.
- MemoryBuffer *F =
- CanonicalizeInputFile(File.release(), NoCanonicalizeWhiteSpace);
+ MemoryBuffer *F = CanonicalizeInputFile(FileOrErr.get().release(),
+ NoCanonicalizeWhiteSpace);
SM.AddNewSourceBuffer(F, SMLoc());
@@ -1043,7 +1043,7 @@ bool CheckString::CheckNext(const SourceMgr &SM, StringRef Buffer) const {
SMLoc::getFromPointer(Buffer.data())))->getBufferStart() &&
"CHECK-NEXT can't be the first check in a file");
- const char *FirstNewLine = 0;
+ const char *FirstNewLine = nullptr;
unsigned NumNewLines = CountNumNewlinesBetween(Buffer, FirstNewLine);
if (NumNewLines == 0) {
@@ -1224,13 +1224,14 @@ int main(int argc, char **argv) {
return 2;
// Open the file to check and add it to SourceMgr.
- std::unique_ptr<MemoryBuffer> File;
- if (error_code ec =
- MemoryBuffer::getFileOrSTDIN(InputFilename, File)) {
- errs() << "Could not open input file '" << InputFilename << "': "
- << ec.message() << '\n';
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(InputFilename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ errs() << "Could not open input file '" << InputFilename
+ << "': " << EC.message() << '\n';
return 2;
}
+ std::unique_ptr<MemoryBuffer> File = std::move(FileOrErr.get());
if (File->getBufferSize() == 0) {
errs() << "FileCheck error: '" << InputFilename << "' is empty.\n";
diff --git a/utils/FileUpdate/Android.mk b/utils/FileUpdate/Android.mk
deleted file mode 100644
index d2b82f2..0000000
--- a/utils/FileUpdate/Android.mk
+++ /dev/null
@@ -1,33 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../..
-
-
-#===---------------------------------------------------------------===
-# FileUpdate command line tool
-#===---------------------------------------------------------------===
-
-file_update_SRC_FILES := \
- FileUpdate.cpp
-
-file_update_STATIC_LIBRARIES := \
- libLLVMCore \
- libLLVMSupport \
-
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := FileUpdate
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE_CLASS := EXECUTABLES
-LOCAL_IS_HOST_MODULE := true
-
-LOCAL_SRC_FILES := $(file_update_SRC_FILES)
-
-LOCAL_STATIC_LIBRARIES := $(file_update_STATIC_LIBRARIES)
-
-LOCAL_LDLIBS += -lpthread -lm -ldl
-
-include $(LLVM_ROOT_PATH)/llvm.mk
-include $(LLVM_HOST_BUILD_MK)
-include $(LLVM_GEN_INTRINSICS_MK)
-include $(BUILD_HOST_EXECUTABLE)
diff --git a/utils/FileUpdate/CMakeLists.txt b/utils/FileUpdate/CMakeLists.txt
deleted file mode 100644
index ce7478f..0000000
--- a/utils/FileUpdate/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_llvm_utility(FileUpdate
- FileUpdate.cpp
- )
-
-target_link_libraries(FileUpdate LLVMSupport)
diff --git a/utils/FileUpdate/FileUpdate.cpp b/utils/FileUpdate/FileUpdate.cpp
deleted file mode 100644
index 1bf1248..0000000
--- a/utils/FileUpdate/FileUpdate.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===- FileUpdate.cpp - Conditionally update a file -----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// FileUpdate is a utility for conditionally updating a file from its input
-// based on whether the input differs from the output. It is used to avoid
-// unnecessary modifications in a build system.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/system_error.h"
-using namespace llvm;
-
-static cl::opt<bool>
-Quiet("quiet", cl::desc("Don't print unnecessary status information"),
- cl::init(false));
-
-static cl::opt<std::string>
-InputFilename("input-file", cl::desc("Input file (defaults to stdin)"),
- cl::init("-"), cl::value_desc("filename"));
-
-static cl::opt<std::string>
-OutputFilename(cl::Positional, cl::desc("<output-file>"), cl::Required);
-
-int main(int argc, char **argv) {
- sys::PrintStackTraceOnErrorSignal();
- PrettyStackTraceProgram X(argc, argv);
- cl::ParseCommandLineOptions(argc, argv);
-
- if (OutputFilename == "-") {
- errs() << argv[0] << ": error: Can't update standard output\n";
- return 1;
- }
-
- // Get the input data.
- std::unique_ptr<MemoryBuffer> In;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename, In)) {
- errs() << argv[0] << ": error: Unable to get input '"
- << InputFilename << "': " << ec.message() << '\n';
- return 1;
- }
-
- // Get the output data.
- std::unique_ptr<MemoryBuffer> Out;
- MemoryBuffer::getFile(OutputFilename.c_str(), Out);
-
- // If the output exists and the contents match, we are done.
- if (Out && In->getBufferSize() == Out->getBufferSize() &&
- memcmp(In->getBufferStart(), Out->getBufferStart(),
- Out->getBufferSize()) == 0) {
- if (!Quiet)
- errs() << argv[0] << ": Not updating '" << OutputFilename
- << "', contents match input.\n";
- return 0;
- }
-
- // Otherwise, overwrite the output.
- if (!Quiet)
- errs() << argv[0] << ": Updating '" << OutputFilename
- << "', contents changed.\n";
- std::string ErrorStr;
- tool_output_file OutStream(OutputFilename.c_str(), ErrorStr,
- sys::fs::F_None);
- if (!ErrorStr.empty()) {
- errs() << argv[0] << ": Unable to write output '"
- << OutputFilename << "': " << ErrorStr << '\n';
- return 1;
- }
-
- OutStream.os().write(In->getBufferStart(), In->getBufferSize());
-
- // Declare success.
- OutStream.keep();
-
- return 0;
-}
diff --git a/utils/FileUpdate/Makefile b/utils/FileUpdate/Makefile
deleted file mode 100644
index 1e6c0a8..0000000
--- a/utils/FileUpdate/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-##===- utils/FileUpdate/Makefile ---------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-TOOLNAME = FileUpdate
-USEDLIBS = LLVMSupport.a
-
-# This tool has no plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-# Don't install this utility
-NO_INSTALL = 1
-
-include $(LEVEL)/Makefile.common
-
diff --git a/utils/KillTheDoctor/KillTheDoctor.cpp b/utils/KillTheDoctor/KillTheDoctor.cpp
index feba2e5..111bad2 100644
--- a/utils/KillTheDoctor/KillTheDoctor.cpp
+++ b/utils/KillTheDoctor/KillTheDoctor.cpp
@@ -40,14 +40,15 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/WindowsError.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
#include "llvm/Support/type_traits.h"
#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <map>
#include <string>
+#include <system_error>
// These includes must be last.
#include <Windows.h>
@@ -169,8 +170,10 @@ namespace {
typedef ScopedHandle<FileHandle> FileScopedHandle;
}
-static error_code GetFileNameFromHandle(HANDLE FileHandle,
- std::string& Name) {
+static std::error_code windows_error(DWORD E) { return mapWindowsError(E); }
+
+static std::error_code GetFileNameFromHandle(HANDLE FileHandle,
+ std::string &Name) {
char Filename[MAX_PATH+1];
bool Success = false;
Name.clear();
@@ -210,7 +213,7 @@ static error_code GetFileNameFromHandle(HANDLE FileHandle,
return windows_error(::GetLastError());
else {
Name = Filename;
- return windows_error::success;
+ return std::error_code();
}
}
@@ -220,7 +223,8 @@ static error_code GetFileNameFromHandle(HANDLE FileHandle,
/// extension is present, try all extensions in PATHEXT.
/// @return If ec == errc::success, The absolute path to the program. Otherwise
/// the return value is undefined.
-static std::string FindProgram(const std::string &Program, error_code &ec) {
+static std::string FindProgram(const std::string &Program,
+ std::error_code &ec) {
char PathName[MAX_PATH + 1];
typedef SmallVector<StringRef, 12> pathext_t;
pathext_t pathext;
@@ -245,11 +249,11 @@ static std::string FindProgram(const std::string &Program, error_code &ec) {
ec = windows_error(::GetLastError());
else if (length > array_lengthof(PathName)) {
// This may have been the file, return with error.
- ec = windows_error::buffer_overflow;
+ ec = windows_error(ERROR_BUFFER_OVERFLOW);
break;
} else {
// We found the path! Return it.
- ec = windows_error::success;
+ ec = std::error_code();
break;
}
}
@@ -312,7 +316,7 @@ int main(int argc, char **argv) {
std::string CommandLine(ProgramToRun);
- error_code ec;
+ std::error_code ec;
ProgramToRun = FindProgram(ProgramToRun, ec);
if (ec) {
errs() << ToolName << ": Failed to find program: '" << CommandLine
@@ -356,8 +360,8 @@ int main(int argc, char **argv) {
&StartupInfo,
&ProcessInfo);
if (!success) {
- errs() << ToolName << ": Failed to run program: '" << ProgramToRun
- << "': " << error_code(windows_error(::GetLastError())).message()
+ errs() << ToolName << ": Failed to run program: '" << ProgramToRun << "': "
+ << std::error_code(windows_error(::GetLastError())).message()
<< '\n';
return -1;
}
@@ -420,9 +424,10 @@ int main(int argc, char **argv) {
success = WaitForDebugEvent(&DebugEvent, TimeLeft);
if (!success) {
- ec = windows_error(::GetLastError());
+ DWORD LastError = ::GetLastError();
+ ec = windows_error(LastError);
- if (ec == errc::timed_out) {
+ if (LastError == ERROR_SEM_TIMEOUT || LastError == WSAETIMEDOUT) {
errs() << ToolName << ": Process timed out.\n";
::TerminateProcess(ProcessInfo.hProcess, -1);
// Otherwise other stuff starts failing...
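The success sentinel changes shape in this migration: where the old llvm windows_error enum had an explicit ::success member, a std::error_code signals success by being default-constructed, and it tests false in a boolean context. A sketch, Windows-only like the tool itself, assuming WindowsError.h as included above:

    std::error_code OK;                      // default-constructed: success
    std::error_code EC = llvm::mapWindowsError(::GetLastError());
    if (EC)
      llvm::errs() << EC.message() << '\n';  // category supplies the text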
diff --git a/utils/Makefile b/utils/Makefile
index ecb30be..0426192 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -8,8 +8,8 @@
##===----------------------------------------------------------------------===##
LEVEL = ..
-PARALLEL_DIRS := FileCheck FileUpdate TableGen PerfectShuffle \
- count fpcmp llvm-lit not unittest
+PARALLEL_DIRS := FileCheck TableGen PerfectShuffle count fpcmp llvm-lit not \
+ unittest
EXTRA_DIST := check-each-file codegen-diff countloc.sh \
DSAclean.py DSAextract.py emacs findsym.pl GenLibDeps.pl \
diff --git a/utils/PerfectShuffle/PerfectShuffle.cpp b/utils/PerfectShuffle/PerfectShuffle.cpp
index d39414e..f80d885 100644
--- a/utils/PerfectShuffle/PerfectShuffle.cpp
+++ b/utils/PerfectShuffle/PerfectShuffle.cpp
@@ -219,10 +219,10 @@ static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
int main() {
// Seed the table with accesses to the LHS and RHS.
ShufTab[0x0123].Cost = 0;
- ShufTab[0x0123].Op = 0;
+ ShufTab[0x0123].Op = nullptr;
ShufTab[0x0123].Arg0 = 0x0123;
ShufTab[0x4567].Cost = 0;
- ShufTab[0x4567].Op = 0;
+ ShufTab[0x4567].Op = nullptr;
ShufTab[0x4567].Arg0 = 0x4567;
// Seed the first-level of shuffles, shuffles whose inputs are the input to
diff --git a/utils/TableGen/Android.mk b/utils/TableGen/Android.mk
index c982168..a743950 100644
--- a/utils/TableGen/Android.mk
+++ b/utils/TableGen/Android.mk
@@ -27,7 +27,6 @@ tablegen_SRC_FILES := \
OptParserEmitter.cpp \
PseudoLoweringEmitter.cpp \
RegisterInfoEmitter.cpp \
- SetTheory.cpp \
SubtargetEmitter.cpp \
TableGen.cpp \
X86DisassemblerTables.cpp \
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 3d72741..1277086 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -1722,8 +1722,8 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
CvtOS << "void " << Target.getName() << ClassName << "::\n"
<< "convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
- << " const SmallVectorImpl<MCParsedAsmOperand*"
- << "> &Operands) {\n"
+ << " const OperandVector"
+ << " &Operands) {\n"
<< " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
<< " const uint8_t *Converter = ConversionTable[Kind];\n"
<< " Inst.setOpcode(Opcode);\n"
@@ -1732,7 +1732,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
<< " default: llvm_unreachable(\"invalid conversion entry!\");\n"
<< " case CVT_Reg:\n"
<< " static_cast<" << TargetOperandClass
- << "*>(Operands[*(p + 1)])->addRegOperands(Inst, 1);\n"
+ << "&>(*Operands[*(p + 1)]).addRegOperands(Inst, 1);\n"
<< " break;\n"
<< " case CVT_Tied:\n"
<< " Inst.addOperand(Inst.getOperand(*(p + 1)));\n"
@@ -1744,7 +1744,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
OpOS << "void " << Target.getName() << ClassName << "::\n"
<< "convertToMapAndConstraints(unsigned Kind,\n";
OpOS.indent(27);
- OpOS << "const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {\n"
+ OpOS << "const OperandVector &Operands) {\n"
<< " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
<< " unsigned NumMCOperands = 0;\n"
<< " const uint8_t *Converter = ConversionTable[Kind];\n"
@@ -1849,9 +1849,8 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
// converter driver.
CvtOS << " case " << Name << ":\n"
<< " static_cast<" << TargetOperandClass
- << "*>(Operands[*(p + 1)])->"
- << Op.Class->RenderMethod << "(Inst, " << OpInfo.MINumOperands
- << ");\n"
+ << "&>(*Operands[*(p + 1)])." << Op.Class->RenderMethod
+ << "(Inst, " << OpInfo.MINumOperands << ");\n"
<< " break;\n";
// Add a handler for the operand number lookup.
@@ -2036,10 +2035,10 @@ static void emitMatchClassEnumeration(CodeGenTarget &Target,
/// emitValidateOperandClass - Emit the function to validate an operand class.
static void emitValidateOperandClass(AsmMatcherInfo &Info,
raw_ostream &OS) {
- OS << "static unsigned validateOperandClass(MCParsedAsmOperand *GOp, "
+ OS << "static unsigned validateOperandClass(MCParsedAsmOperand &GOp, "
<< "MatchClassKind Kind) {\n";
- OS << " " << Info.Target.getName() << "Operand &Operand = *("
- << Info.Target.getName() << "Operand*)GOp;\n";
+ OS << " " << Info.Target.getName() << "Operand &Operand = ("
+ << Info.Target.getName() << "Operand&)GOp;\n";
// The InvalidMatchClass is not to match any operand.
OS << " if (Kind == InvalidMatchClass)\n";
@@ -2561,7 +2560,7 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
// the found operand class.
OS << Target.getName() << ClassName << "::OperandMatchResultTy "
<< Target.getName() << ClassName << "::\n"
- << "tryCustomParseOperand(SmallVectorImpl<MCParsedAsmOperand*>"
+ << "tryCustomParseOperand(OperandVector"
<< " &Operands,\n unsigned MCK) {\n\n"
<< " switch(MCK) {\n";
@@ -2585,7 +2584,7 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
// a better error handling.
OS << Target.getName() << ClassName << "::OperandMatchResultTy "
<< Target.getName() << ClassName << "::\n"
- << "MatchOperandParserImpl(SmallVectorImpl<MCParsedAsmOperand*>"
+ << "MatchOperandParserImpl(OperandVector"
<< " &Operands,\n StringRef Mnemonic) {\n";
// Emit code to get the available features.
@@ -2695,14 +2694,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " unsigned ComputeAvailableFeatures(uint64_t FeatureBits) const;\n";
OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
- << " const SmallVectorImpl<MCParsedAsmOperand*> "
+ << " const OperandVector "
<< "&Operands);\n";
OS << " void convertToMapAndConstraints(unsigned Kind,\n ";
- OS << " const SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;\n";
+ OS << " const OperandVector &Operands) override;\n";
OS << " bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) override;\n";
OS << " unsigned MatchInstructionImpl(\n";
OS.indent(27);
- OS << "const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n"
+ OS << "const OperandVector &Operands,\n"
<< " MCInst &Inst,\n"
<< " unsigned &ErrorInfo,"
<< " bool matchingInlineAsm,\n"
@@ -2715,11 +2714,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " MatchOperand_ParseFail // operand matched but had errors\n";
OS << " };\n";
OS << " OperandMatchResultTy MatchOperandParserImpl(\n";
- OS << " SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n";
+ OS << " OperandVector &Operands,\n";
OS << " StringRef Mnemonic);\n";
OS << " OperandMatchResultTy tryCustomParseOperand(\n";
- OS << " SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n";
+ OS << " OperandVector &Operands,\n";
OS << " unsigned MCK);\n\n";
}
@@ -2909,9 +2908,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "}\n\n";
// Finally, build the match function.
- OS << "unsigned "
- << Target.getName() << ClassName << "::\n"
- << "MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*>"
+ OS << "unsigned " << Target.getName() << ClassName << "::\n"
+ << "MatchInstructionImpl(const OperandVector"
<< " &Operands,\n";
OS << " MCInst &Inst,\n"
<< "unsigned &ErrorInfo, bool matchingInlineAsm, unsigned VariantID) {\n";
@@ -2928,7 +2926,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " // Get the instruction mnemonic, which is the first token.\n";
OS << " StringRef Mnemonic = ((" << Target.getName()
- << "Operand*)Operands[0])->getToken();\n\n";
+ << "Operand&)*Operands[0]).getToken();\n\n";
if (HasMnemonicAliases) {
OS << " // Process all MnemonicAliases to remap the mnemonic.\n";
@@ -2980,7 +2978,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " if (!OperandsValid) ErrorInfo = i + 1;\n";
OS << " break;\n";
OS << " }\n";
- OS << " unsigned Diag = validateOperandClass(Operands[i+1],\n";
+ OS << " unsigned Diag = validateOperandClass(*Operands[i+1],\n";
OS.indent(43);
OS << "(MatchClassKind)it->Classes[i]);\n";
OS << " if (Diag == Match_Success)\n";
@@ -2988,7 +2986,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " // If the generic handler indicates an invalid operand\n";
OS << " // failure, check for a special case.\n";
OS << " if (Diag == Match_InvalidOperand) {\n";
- OS << " Diag = validateTargetOperandClass(Operands[i+1],\n";
+ OS << " Diag = validateTargetOperandClass(*Operands[i+1],\n";
OS.indent(43);
OS << "(MatchClassKind)it->Classes[i]);\n";
OS << " if (Diag == Match_Success)\n";
@@ -3055,7 +3053,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
if (HasDeprecation) {
OS << " std::string Info;\n";
OS << " if (MII.get(Inst.getOpcode()).getDeprecatedInfo(Inst, STI, Info)) {\n";
- OS << " SMLoc Loc = ((" << Target.getName() << "Operand*)Operands[0])->getStartLoc();\n";
+ OS << " SMLoc Loc = ((" << Target.getName()
+ << "Operand&)*Operands[0]).getStartLoc();\n";
OS << " Parser.Warning(Loc, Info, None);\n";
OS << " }\n";
}
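
The AsmMatcherEmitter hunks above follow the MC asm-parser API change that replaced SmallVectorImpl<MCParsedAsmOperand*> with OperandVector, a vector of std::unique_ptr<MCParsedAsmOperand>: once the elements own their operands, the generated accessors dereference first and downcast as a reference instead of casting a raw pointer. A minimal standalone sketch of that access pattern, with FooOperand standing in for a hypothetical target's operand class:

    #include <memory>
    #include <vector>

    // Stand-ins for the MC types; assumptions for illustration only,
    // not the real LLVM classes.
    struct MCParsedAsmOperand { virtual ~MCParsedAsmOperand() {} };
    struct FooOperand : MCParsedAsmOperand {
      const char *getToken() const { return "add"; }
    };
    using OperandVector = std::vector<std::unique_ptr<MCParsedAsmOperand>>;

    const char *firstToken(const OperandVector &Operands) {
      // Old style: ((FooOperand *)Operands[0])->getToken(), legal only
      // while the vector held raw pointers. With owning pointers,
      // dereference the element first, then cast as a reference:
      return ((const FooOperand &)*Operands[0]).getToken();
    }

The same reasoning explains the *Operands[i+1] changes: validateOperandClass and validateTargetOperandClass now receive the operand by reference rather than by pointer.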
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 2741d8f..c7fe9df 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -806,6 +806,10 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
// before it can be matched to the mnemonic.
std::map<std::string, std::vector<IAPrinter*> > IAPrinterMap;
+ // A list of MCOperandPredicates for all operands in use, and the reverse map from predicate record to index.
+ std::vector<const Record*> MCOpPredicates;
+ DenseMap<const Record*, unsigned> MCOpPredicateMap;
+
for (auto &Aliases : AliasMap) {
for (auto &Alias : Aliases.second) {
const CodeGenInstAlias *CGA = Alias.first;
@@ -832,6 +836,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
unsigned MIOpNum = 0;
for (unsigned i = 0, e = LastOpNo; i != e; ++i) {
+ std::string Op = "MI->getOperand(" + llvm::utostr(MIOpNum) + ")";
+
const CodeGenInstAlias::ResultOperand &RO = CGA->ResultOperands[i];
switch (RO.Kind) {
@@ -858,9 +864,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
if (Rec->isSubClassOf("RegisterOperand"))
Rec = Rec->getValueAsDef("RegClass");
if (Rec->isSubClassOf("RegisterClass")) {
- Cond = std::string("MI->getOperand(") + llvm::utostr(MIOpNum) +
- ").isReg()";
- IAP->addCond(Cond);
+ IAP->addCond(Op + ".isReg()");
if (!IAP->isOpMapped(ROName)) {
IAP->addOperand(ROName, MIOpNum, PrintMethodIdx);
@@ -869,26 +873,34 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
R = R->getValueAsDef("RegClass");
Cond = std::string("MRI.getRegClass(") + Target.getName() + "::" +
R->getName() + "RegClassID)"
- ".contains(MI->getOperand(" +
- llvm::utostr(MIOpNum) + ").getReg())";
- IAP->addCond(Cond);
+ ".contains(" + Op + ".getReg())";
} else {
- Cond = std::string("MI->getOperand(") +
- llvm::utostr(MIOpNum) + ").getReg() == MI->getOperand(" +
+ Cond = Op + ".getReg() == MI->getOperand(" +
llvm::utostr(IAP->getOpIndex(ROName)) + ").getReg()";
- IAP->addCond(Cond);
}
} else {
// Assume all printable operands are desired for now. This can be
// overridden in the InstAlias instantiation if necessary.
IAP->addOperand(ROName, MIOpNum, PrintMethodIdx);
- }
+ // There might be an additional predicate on the MCOperand.
+ unsigned Entry = MCOpPredicateMap[Rec];
+ if (!Entry) {
+ if (!Rec->isValueUnset("MCOperandPredicate")) {
+ MCOpPredicates.push_back(Rec);
+ Entry = MCOpPredicates.size();
+ MCOpPredicateMap[Rec] = Entry;
+ } else
+ break; // No conditions on this operand at all
+ }
+ Cond = Target.getName() + ClassName + "ValidateMCOperand(" +
+ Op + ", " + llvm::utostr(Entry) + ")";
+ }
+ // Shared by all subcases of ResultOperand::K_Record.
+ IAP->addCond(Cond);
break;
}
case CodeGenInstAlias::ResultOperand::K_Imm: {
- std::string Op = "MI->getOperand(" + llvm::utostr(MIOpNum) + ")";
-
// Just because the alias has an immediate result doesn't mean the
// MCInst will. An MCExpr could be present, for example.
IAP->addCond(Op + ".isImm()");
@@ -906,8 +918,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
break;
}
- Cond = std::string("MI->getOperand(") +
- llvm::utostr(MIOpNum) + ").getReg() == " + Target.getName() +
+ Cond = Op + ".getReg() == " + Target.getName() +
"::" + CGA->ResultOperands[i].getRegister()->getName();
IAP->addCond(Cond);
break;
@@ -980,6 +991,11 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
return;
}
+ if (MCOpPredicates.size())
+ O << "static bool " << Target.getName() << ClassName
+ << "ValidateMCOperand(\n"
+ << " const MCOperand &MCOp, unsigned PredicateIndex);\n";
+
O << HeaderO.str();
O.indent(2) << "const char *AsmString;\n";
O.indent(2) << "switch (MI->getOpcode()) {\n";
@@ -1041,6 +1057,28 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
}
O << "}\n\n";
+ if (MCOpPredicates.size()) {
+ O << "static bool " << Target.getName() << ClassName
+ << "ValidateMCOperand(\n"
+ << " const MCOperand &MCOp, unsigned PredicateIndex) {\n"
+ << " switch (PredicateIndex) {\n"
+ << " default:\n"
+ << " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n"
+ << " break;\n";
+
+ for (unsigned i = 0; i < MCOpPredicates.size(); ++i) {
+ Init *MCOpPred = MCOpPredicates[i]->getValueInit("MCOperandPredicate");
+ if (StringInit *SI = dyn_cast<StringInit>(MCOpPred)) {
+ O << " case " << i + 1 << ": {\n"
+ << SI->getValue() << "\n"
+ << " }\n";
+ } else
+ llvm_unreachable("Unexpected MCOperandPredicate field!");
+ }
+ O << " }\n"
+ << "}\n\n";
+ }
+
O << "#endif // PRINT_ALIAS_INSTR\n";
}
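
For context on the MCOpPredicates plumbing above: the emitter collects every operand record with a non-empty MCOperandPredicate, assigns each a 1-based index (index 0 doubles as "not seen yet" in MCOpPredicateMap), and pastes the predicate strings verbatim into one generated dispatch function. A hedged sketch of the emitted shape, using an invented predicate for a hypothetical Foo target (ClassName is typically "InstPrinter" here):

    // Illustrative only; the real case bodies are copied from .td records.
    static bool FooInstPrinterValidateMCOperand(const MCOperand &MCOp,
                                                unsigned PredicateIndex) {
      switch (PredicateIndex) {
      default:
        llvm_unreachable("Unknown MCOperandPredicate kind");
        break;
      case 1: {
        // Body of the first record's MCOperandPredicate field, e.g.:
        return MCOp.isImm() && MCOp.getImm() >= 0 && MCOp.getImm() <= 15;
      }
      }
    }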
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index f277608..feaa7c7 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -26,7 +26,6 @@ add_tablegen(llvm-tblgen LLVM
OptParserEmitter.cpp
PseudoLoweringEmitter.cpp
RegisterInfoEmitter.cpp
- SetTheory.cpp
SubtargetEmitter.cpp
TableGen.cpp
X86DisassemblerTables.cpp
diff --git a/utils/TableGen/CTagsEmitter.cpp b/utils/TableGen/CTagsEmitter.cpp
index 7108679..5d6d6da 100644
--- a/utils/TableGen/CTagsEmitter.cpp
+++ b/utils/TableGen/CTagsEmitter.cpp
@@ -37,8 +37,8 @@ public:
: Id(&Name), Loc(Location) {}
int operator<(const Tag &B) const { return *Id < *B.Id; }
void emit(raw_ostream &OS) const {
- int BufferID = SrcMgr.FindBufferContainingLoc(Loc);
- MemoryBuffer *CurMB = SrcMgr.getBufferInfo(BufferID).Buffer;
+ const MemoryBuffer *CurMB =
+ SrcMgr.getMemoryBuffer(SrcMgr.FindBufferContainingLoc(Loc));
const char *BufferName = CurMB->getBufferIdentifier();
std::pair<unsigned, unsigned> LineAndColumn = SrcMgr.getLineAndColumn(Loc);
OS << *Id << "\t" << BufferName << "\t" << LineAndColumn.first << "\n";
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 00bc9a5..2602bbc 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -2119,9 +2119,11 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
// If we have input named node types, propagate their types to the named
// values here.
if (InNamedTypes) {
- // FIXME: Should be error?
- assert(InNamedTypes->count(I->getKey()) &&
- "Named node in output pattern but not input pattern?");
+ if (!InNamedTypes->count(I->getKey())) {
+ error("Node '" + std::string(I->getKey()) +
+ "' in output pattern but not input pattern");
+ return true;
+ }
const SmallVectorImpl<TreePatternNode*> &InNodes =
InNamedTypes->find(I->getKey())->second;
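
The CodeGenDAGPatterns hunk above is a straight assert-to-diagnostic conversion: the condition depends on user-written .td patterns, so it must still fire in release tblgen, where assert() is compiled out. A minimal sketch of the idea, with error() standing in for TreePattern::error(), which also attaches source-location information:

    #include <cstdio>
    #include <string>

    // Hypothetical diagnostic hook for illustration.
    static void error(const std::string &Msg) {
      std::fprintf(stderr, "error: %s\n", Msg.c_str());
    }

    // Returns true on failure, mirroring the patched InferAllTypes path.
    static bool checkNamedNode(bool InInputPattern, const std::string &Name) {
      // Previously: assert(InInputPattern && "..."), which NDEBUG removes.
      if (!InInputPattern) {
        error("Node '" + Name + "' in output pattern but not input pattern");
        return true;
      }
      return false;
    }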
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 06daa97..a9ece01 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -28,6 +28,7 @@ namespace llvm {
std::string Name; // The name of the LLVM function "llvm.bswap.i32"
std::string EnumName; // The name of the enum "bswap_i32"
std::string GCCBuiltinName;// Name of the corresponding GCC builtin, or "".
+ std::string MSBuiltinName; // Name of the corresponding MS builtin, or "".
std::string TargetPrefix; // Target prefix, e.g. "ppc" for target-specific intrinsics.
/// IntrinsicSignature - This structure holds the return values and
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 30732c8..278315b 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -15,7 +15,6 @@
#ifndef CODEGEN_REGISTERS_H
#define CODEGEN_REGISTERS_H
-#include "SetTheory.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
@@ -23,6 +22,7 @@
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
#include <cstdlib>
#include <map>
#include <set>
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 65ac602..3fef8ad 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -15,11 +15,11 @@
#ifndef CODEGEN_SCHEDULE_H
#define CODEGEN_SCHEDULE_H
-#include "SetTheory.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
namespace llvm {
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index de00dc6..d1b5711 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -459,6 +459,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
if (R->getValue("GCCBuiltinName")) // Ignore a missing GCCBuiltinName field.
GCCBuiltinName = R->getValueAsString("GCCBuiltinName");
+ if (R->getValue("MSBuiltinName")) // Ignore a missing MSBuiltinName field.
+ MSBuiltinName = R->getValueAsString("MSBuiltinName");
TargetPrefix = R->getValueAsString("TargetPrefix");
Name = R->getValueAsString("LLVMName");
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 1927ad9..430ef32 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -54,6 +54,8 @@ public:
raw_ostream &OS);
void EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
raw_ostream &OS);
+ void EmitIntrinsicToMSBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS);
void EmitSuffix(raw_ostream &OS);
};
} // End anonymous namespace
@@ -96,6 +98,9 @@ void IntrinsicEmitter::run(raw_ostream &OS) {
// Emit code to translate GCC builtins into LLVM intrinsics.
EmitIntrinsicToGCCBuiltinMap(Ints, OS);
+ // Emit code to translate MS builtins into LLVM intrinsics.
+ EmitIntrinsicToMSBuiltinMap(Ints, OS);
+
EmitSuffix(OS);
}
@@ -380,7 +385,7 @@ static void ComputeFixedEncoding(const CodeGenIntrinsic &Int,
case 3: TypeSig.push_back(IIT_STRUCT3); break;
case 4: TypeSig.push_back(IIT_STRUCT4); break;
case 5: TypeSig.push_back(IIT_STRUCT5); break;
- default: assert(0 && "Unhandled case in struct");
+ default: llvm_unreachable("Unhandled case in struct");
}
for (unsigned i = 0, e = Int.IS.RetVTs.size(); i != e; ++i)
@@ -790,6 +795,55 @@ EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
OS << "#endif\n\n";
}
+void IntrinsicEmitter::
+EmitIntrinsicToMSBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
+ raw_ostream &OS) {
+ std::map<std::string, std::map<std::string, std::string>> TargetBuiltins;
+
+ for (const auto &Intrinsic : Ints) {
+ if (Intrinsic.MSBuiltinName.empty())
+ continue;
+
+ auto &Builtins = TargetBuiltins[Intrinsic.TargetPrefix];
+ if (!Builtins.insert(std::make_pair(Intrinsic.MSBuiltinName,
+ Intrinsic.EnumName)).second)
+ PrintFatalError("Intrinsic '" + Intrinsic.TheDef->getName() + "': "
+ "duplicate MS builtin name!");
+ }
+
+ OS << "// Get the LLVM intrinsic that corresponds to a MS builtin.\n"
+ "// This is used by the C front-end. The MS builtin name is passed\n"
+ "// in as a BuiltinName, and a target prefix (e.g. 'arm') is passed\n"
+ "// in as a TargetPrefix. The result is assigned to 'IntrinsicID'.\n"
+ "#ifdef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN\n";
+
+ OS << (TargetOnly ? "static " + TargetPrefix : "") << "Intrinsic::ID "
+ << (TargetOnly ? "" : "Intrinsic::")
+ << "getIntrinsicForMSBuiltin(const char *TP, const char *BN) {\n";
+ OS << " StringRef BuiltinName(BN);\n"
+ " StringRef TargetPrefix(TP);\n"
+ "\n";
+
+ for (const auto &Builtins : TargetBuiltins) {
+ OS << " ";
+ if (Builtins.first.empty())
+ OS << "/* Target Independent Builtins */ ";
+ else
+ OS << "if (TargetPrefix == \"" << Builtins.first << "\") ";
+ OS << "{\n";
+ EmitTargetBuiltins(Builtins.second, TargetPrefix, OS);
+ OS << "}";
+ }
+
+ OS << " return ";
+ if (!TargetPrefix.empty())
+ OS << "(" << TargetPrefix << "Intrinsic::ID)";
+ OS << "Intrinsic::not_intrinsic;\n";
+ OS << "}\n";
+
+ OS << "#endif\n\n";
+}
+
void llvm::EmitIntrinsics(RecordKeeper &RK, raw_ostream &OS, bool TargetOnly) {
IntrinsicEmitter(RK, TargetOnly).run(OS);
}
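
For reference, the code emitted under GET_LLVM_INTRINSIC_FOR_MS_BUILTIN has roughly the following shape in the non-TargetOnly case; the "arm" branch and its single entry are invented for illustration, since the real lookup body comes from EmitTargetBuiltins:

    // Illustrative output only; actual contents depend on the .td records.
    #ifdef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN
    Intrinsic::ID Intrinsic::getIntrinsicForMSBuiltin(const char *TP,
                                                      const char *BN) {
      StringRef BuiltinName(BN);
      StringRef TargetPrefix(TP);

      if (TargetPrefix == "arm") {
        // EmitTargetBuiltins emits a name -> enum lookup here, e.g.:
        //   if (BuiltinName == "__dmb") return Intrinsic::arm_dmb;
      }
      return Intrinsic::not_intrinsic;
    }
    #endif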
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 00c3a6f..bbd61f5 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -12,13 +12,13 @@
//===----------------------------------------------------------------------===//
#include "TableGenBackends.h" // Declares all backends.
-#include "SetTheory.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Main.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
using namespace llvm;
diff --git a/utils/emacs/tablegen-mode.el b/utils/emacs/tablegen-mode.el
index e83a34c..c0ae751 100644
--- a/utils/emacs/tablegen-mode.el
+++ b/utils/emacs/tablegen-mode.el
@@ -112,6 +112,7 @@
(set-syntax-table tablegen-mode-syntax-table)
(make-local-variable 'comment-start)
(setq comment-start "//")
+ (setq indent-tabs-mode nil)
(run-hooks 'tablegen-mode-hook)) ; Finally, this permits the user to
; customize the mode with a hook.
diff --git a/utils/lit/lit/discovery.py b/utils/lit/lit/discovery.py
index c3c0f28..876d4f3 100644
--- a/utils/lit/lit/discovery.py
+++ b/utils/lit/lit/discovery.py
@@ -200,9 +200,7 @@ def find_tests_for_inputs(lit_config, inputs):
# Expand '@...' form in inputs.
actual_inputs = []
for input in inputs:
- if os.path.exists(input) or not input.startswith('@'):
- actual_inputs.append(input)
- else:
+ if input.startswith('@'):
f = open(input[1:])
try:
for ln in f:
@@ -211,6 +209,8 @@ def find_tests_for_inputs(lit_config, inputs):
actual_inputs.append(ln)
finally:
f.close()
+ else:
+ actual_inputs.append(input)
# Load the tests from the inputs.
tests = []
diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
index 2b1010c..72a8b48 100644
--- a/utils/lit/lit/util.py
+++ b/utils/lit/lit/util.py
@@ -167,3 +167,20 @@ def executeCommand(command, cwd=None, env=None):
err = str(err)
return out, err, exitCode
+
+def usePlatformSdkOnDarwin(config, lit_config):
+ # On Darwin, support relocatable SDKs by providing Clang with a
+ # default system root path.
+ if 'darwin' in config.target_triple:
+ try:
+ cmd = subprocess.Popen(['xcrun', '--show-sdk-path'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = cmd.communicate()
+ out = out.strip()
+ res = cmd.wait()
+ except OSError:
+ res = -1
+ if res == 0 and out:
+ sdk_path = out
+ lit_config.note('using SDKROOT: %r' % sdk_path)
+ config.environment['SDKROOT'] = sdk_path
diff --git a/utils/llvm-compilers-check b/utils/llvm-compilers-check
index 3173027..4db8426 100755
--- a/utils/llvm-compilers-check
+++ b/utils/llvm-compilers-check
@@ -149,6 +149,10 @@ def add_options(parser):
help=("Do not do installs"))
parser.add_option("--keep-going", default=False, action="store_true",
help=("Keep going after failures"))
+ parser.add_option("--no-flavor-prefix", default=False, action="store_true",
+ help=("Do not append the build flavor to the install path"))
+ parser.add_option("--enable-werror", default=False, action="store_true",
+ help=("Build with -Werror"))
return
def check_options(parser, options, valid_builds):
@@ -346,7 +350,9 @@ class Builder(threading.Thread):
ssabbrev = get_short_abbrevs([ab for ab in self.source_abbrev.values()])
prefix = "[" + ssabbrev[self.source_abbrev[source]] + "-" + self.build_abbrev[build] + "]"
- self.install_prefix += "/" + self.source_abbrev[source] + "/" + build
+ if (not self.options.no_flavor_prefix):
+ self.install_prefix += "/" + self.source_abbrev[source] + "/" + build
+
build_suffix += "/" + self.source_abbrev[source] + "/" + build
self.logger = logging.getLogger(prefix)
@@ -361,16 +367,13 @@ class Builder(threading.Thread):
configure_flags = dict(
llvm=dict(debug=["--prefix=" + self.install_prefix,
- "--enable-werror",
"--enable-assertions",
"--disable-optimized",
"--with-gcc-toolchain=" + cxxroot],
release=["--prefix=" + self.install_prefix,
- "--enable-werror",
"--enable-optimized",
"--with-gcc-toolchain=" + cxxroot],
paranoid=["--prefix=" + self.install_prefix,
- "--enable-werror",
"--enable-assertions",
"--enable-expensive-checks",
"--disable-optimized",
@@ -379,6 +382,11 @@ class Builder(threading.Thread):
release=[],
paranoid=[]))
+ if (self.options.enable_werror):
+ configure_flags["llvm"]["debug"].append("--enable-werror")
+ configure_flags["llvm"]["release"].append("--enable-werror")
+ configure_flags["llvm"]["paranoid"].append("--enable-werror")
+
configure_env = dict(
llvm=dict(debug=dict(CC=self.cc,
CXX=self.cxx),
@@ -530,10 +538,6 @@ class Builder(threading.Thread):
self.logger.info("[" + prefix + "] Configure failed, no configure script " + conf)
return -1
- if not os.path.exists(mf):
- self.logger.info("[" + prefix + "] Configure failed, no makefile " + mf)
- return -1
-
if os.path.exists(conf) and os.path.exists(mf):
confstat = os.stat(conf)
makestat = os.stat(mf)
diff --git a/utils/llvm-lit/llvm-lit.in b/utils/llvm-lit/llvm-lit.in
index a82cbf0..fc96202 100755
--- a/utils/llvm-lit/llvm-lit.in
+++ b/utils/llvm-lit/llvm-lit.in
@@ -29,7 +29,7 @@ if os.path.exists(clang_obj_root):
builtin_parameters['clang_tools_extra_site_config'] = \
os.path.join(clang_tools_extra_obj_root, 'test', 'lit.site.cfg')
-lld_obj_root = os.path.join(llvm_obj_root, 'projects', 'lld')
+lld_obj_root = os.path.join(llvm_obj_root, 'tools', 'lld')
if os.path.exists(lld_obj_root):
builtin_parameters['lld_site_config'] = \
os.path.join(lld_obj_root, 'test', 'lit.site.cfg')
diff --git a/utils/llvm.natvis b/utils/llvm.natvis
index 9874ce5..6c410a4 100644
--- a/utils/llvm.natvis
+++ b/utils/llvm.natvis
@@ -108,14 +108,6 @@ or create a symbolic link so it updates automatically.
</Expand>
</Type>
- <Type Name="llvm::OwningPtr&lt;*&gt;">
- <DisplayString Condition="Ptr == 0">empty</DisplayString>
- <DisplayString Condition="Ptr != 0">OwningPtr {*Ptr}</DisplayString>
- <Expand>
- <ExpandedItem Condition="Ptr != 0">Ptr</ExpandedItem>
- </Expand>
- </Type>
-
<Type Name="llvm::SmallPtrSet&lt;*,*&gt;">
<DisplayString Condition="CurArray == SmallArray">{{ [Small Mode] size={NumElements}, capacity={CurArraySize} }}</DisplayString>
<DisplayString Condition="CurArray != SmallArray">{{ [Big Mode] size={NumElements}, capacity={CurArraySize} }}</DisplayString>
@@ -143,9 +135,9 @@ or create a symbolic link so it updates automatically.
</Type>
<Type Name="llvm::StringMap&lt;*,*&gt;">
- <DisplayString>{{ size={ItemSize}, buckets={NumBuckets} }}</DisplayString>
+ <DisplayString>{{ size={NumItems}, buckets={NumBuckets} }}</DisplayString>
<Expand>
- <Item Name="[size]">ItemSize</Item>
+ <Item Name="[size]">NumItems</Item>
<Item Name="[buckets]">NumBuckets</Item>
<ArrayItems>
<Size>NumBuckets</Size>
diff --git a/utils/not/not.cpp b/utils/not/not.cpp
index ebd1618..a5c7183 100644
--- a/utils/not/not.cpp
+++ b/utils/not/not.cpp
@@ -30,7 +30,16 @@ int main(int argc, const char **argv) {
std::string Program = sys::FindProgramByName(argv[0]);
std::string ErrMsg;
- int Result = sys::ExecuteAndWait(Program, argv, 0, 0, 0, 0, &ErrMsg);
+ int Result = sys::ExecuteAndWait(Program, argv, nullptr, nullptr, 0, 0,
+ &ErrMsg);
+#ifdef _WIN32
+ // Handle abort() in msvcrt -- it exits with code 3. abort(), aka
+ // unreachable, should be recognized as a crash. However, some binaries use
+ // exit code 3 on non-crash failure paths, so only do this if we expect a
+ // crash.
+ if (ExpectCrash && Result == 3)
+ Result = -3;
+#endif
if (Result < 0) {
errs() << "Error: " << ErrMsg << "\n";
if (ExpectCrash)
diff --git a/utils/yaml-bench/YAMLBench.cpp b/utils/yaml-bench/YAMLBench.cpp
index 58b7356..e88ce5d 100644
--- a/utils/yaml-bench/YAMLBench.cpp
+++ b/utils/yaml-bench/YAMLBench.cpp
@@ -21,7 +21,7 @@
#include "llvm/Support/Timer.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
+#include <system_error>
using namespace llvm;
@@ -188,9 +188,11 @@ static std::string createJSONText(size_t MemoryMB, unsigned ValueSize) {
int main(int argc, char **argv) {
llvm::cl::ParseCommandLineOptions(argc, argv);
if (Input.getNumOccurrences()) {
- std::unique_ptr<MemoryBuffer> Buf;
- if (MemoryBuffer::getFileOrSTDIN(Input, Buf))
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+ MemoryBuffer::getFileOrSTDIN(Input);
+ if (!BufOrErr)
return 1;
+ std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
llvm::SourceMgr sm;
if (DumpTokens) {